author     Linus Torvalds <torvalds@linux-foundation.org>  2014-04-04 18:31:36 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-04-04 18:31:36 -0400
commit     53c566625fb872e7826a237f0f5c21458028e94a (patch)
tree       8ef9990ed2124f085442bc5a44c3f5212bf4002d
parent     34917f9713905a937816ebb7ee5f25bef7a6441c (diff)
parent     00fdf13a2e9f313a044288aa59d3b8ec29ff904a (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs changes from Chris Mason:
 "This is a pretty long stream of bug fixes and performance fixes.

  Qu Wenruo has replaced the btrfs async threads with regular kernel
  workqueues.  We'll keep an eye out for performance differences, but
  it's nice to be using more generic code for this.

  We still have some corruption fixes and other patches coming in for
  the merge window, but this batch is tested and ready to go"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (108 commits)
  Btrfs: fix a crash of clone with inline extents's split
  btrfs: fix uninit variable warning
  Btrfs: take into account total references when doing backref lookup
  Btrfs: part 2, fix incremental send's decision to delay a dir move/rename
  Btrfs: fix incremental send's decision to delay a dir move/rename
  Btrfs: remove unnecessary inode generation lookup in send
  Btrfs: fix race when updating existing ref head
  btrfs: Add trace for btrfs_workqueue alloc/destroy
  Btrfs: less fs tree lock contention when using autodefrag
  Btrfs: return EPERM when deleting a default subvolume
  Btrfs: add missing kfree in btrfs_destroy_workqueue
  Btrfs: cache extent states in defrag code path
  Btrfs: fix deadlock with nested trans handles
  Btrfs: fix possible empty list access when flushing the delalloc inodes
  Btrfs: split the global ordered extents mutex
  Btrfs: don't flush all delalloc inodes when we doesn't get s_umount lock
  Btrfs: reclaim delalloc metadata more aggressively
  Btrfs: remove unnecessary lock in may_commit_transaction()
  Btrfs: remove the unnecessary flush when preparing the pages
  Btrfs: just do dirty page flush for the inode with compression before direct IO
  ...
-rw-r--r--  fs/btrfs/async-thread.c        |  848
-rw-r--r--  fs/btrfs/async-thread.h        |  121
-rw-r--r--  fs/btrfs/backref.c             |   84
-rw-r--r--  fs/btrfs/btrfs_inode.h         |   14
-rw-r--r--  fs/btrfs/ctree.c               |   11
-rw-r--r--  fs/btrfs/ctree.h               |   73
-rw-r--r--  fs/btrfs/delayed-inode.c       |    6
-rw-r--r--  fs/btrfs/delayed-ref.c         |   29
-rw-r--r--  fs/btrfs/dev-replace.c         |   79
-rw-r--r--  fs/btrfs/disk-io.c             |  281
-rw-r--r--  fs/btrfs/extent-tree.c         |   58
-rw-r--r--  fs/btrfs/extent_io.c           |   15
-rw-r--r--  fs/btrfs/extent_map.c          |   56
-rw-r--r--  fs/btrfs/extent_map.h          |   10
-rw-r--r--  fs/btrfs/file.c                |  158
-rw-r--r--  fs/btrfs/inode.c               |  121
-rw-r--r--  fs/btrfs/ioctl.c               |  210
-rw-r--r--  fs/btrfs/ordered-data.c        |   68
-rw-r--r--  fs/btrfs/ordered-data.h        |    6
-rw-r--r--  fs/btrfs/qgroup.c              |   15
-rw-r--r--  fs/btrfs/raid56.c              |   21
-rw-r--r--  fs/btrfs/reada.c               |    4
-rw-r--r--  fs/btrfs/relocation.c          |    2
-rw-r--r--  fs/btrfs/root-tree.c           |    3
-rw-r--r--  fs/btrfs/scrub.c               |   97
-rw-r--r--  fs/btrfs/send.c                |  821
-rw-r--r--  fs/btrfs/super.c               |   37
-rw-r--r--  fs/btrfs/sysfs.c               |   33
-rw-r--r--  fs/btrfs/sysfs.h               |    5
-rw-r--r--  fs/btrfs/transaction.c         |   39
-rw-r--r--  fs/btrfs/tree-log.c            |  236
-rw-r--r--  fs/btrfs/tree-log.h            |   18
-rw-r--r--  fs/btrfs/volumes.c             |   46
-rw-r--r--  fs/btrfs/volumes.h             |    1
-rw-r--r--  include/trace/events/btrfs.h   |  137
35 files changed, 2137 insertions, 1626 deletions
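
The centerpiece of this pull is the conversion of the old btrfs_workers thread pool to the new btrfs_workqueue wrapper around regular kernel workqueues (fs/btrfs/async-thread.[ch] below). As a minimal usage sketch of the API added by this series — not part of the commit; the caller-side names my_async_ctx, my_work_func and my_submit are hypothetical, and WQ_MEM_RECLAIM is just one plausible flag choice, not taken from this diff:

	/* Embed a btrfs_work in the caller's own context struct, as before. */
	struct my_async_ctx {
		struct btrfs_work work;
		/* caller-specific state ... */
	};

	static void my_work_func(struct btrfs_work *work)
	{
		struct my_async_ctx *ctx = container_of(work, struct my_async_ctx, work);

		/* do the deferred work using ctx */
	}

	/* At mount time: name, workqueue flags, max_active, thresh (0 = default). */
	fs_info->workers = btrfs_alloc_workqueue("worker", WQ_MEM_RECLAIM,
						 fs_info->thread_pool_size, 0);
	if (!fs_info->workers)
		return -ENOMEM;

	static void my_submit(struct btrfs_fs_info *fs_info, struct my_async_ctx *ctx)
	{
		/* No ordered_func/ordered_free: completion order is not enforced. */
		btrfs_init_work(&ctx->work, my_work_func, NULL, NULL);
		btrfs_queue_work(fs_info->workers, &ctx->work);
	}

	/* At unmount time. */
	btrfs_destroy_workqueue(fs_info->workers);
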
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c1e0b0caf9cc..ecb5832c0967 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (C) 2007 Oracle. All rights reserved. 2 * Copyright (C) 2007 Oracle. All rights reserved.
3 * Copyright (C) 2014 Fujitsu. All rights reserved.
3 * 4 *
4 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public 6 * modify it under the terms of the GNU General Public
@@ -21,708 +22,313 @@
21#include <linux/list.h> 22#include <linux/list.h>
22#include <linux/spinlock.h> 23#include <linux/spinlock.h>
23#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/workqueue.h>
24#include "async-thread.h" 26#include "async-thread.h"
27#include "ctree.h"
28
29#define WORK_DONE_BIT 0
30#define WORK_ORDER_DONE_BIT 1
31#define WORK_HIGH_PRIO_BIT 2
32
33#define NO_THRESHOLD (-1)
34#define DFT_THRESHOLD (32)
35
36struct __btrfs_workqueue {
37 struct workqueue_struct *normal_wq;
38 /* List head pointing to ordered work list */
39 struct list_head ordered_list;
40
41 /* Spinlock for ordered_list */
42 spinlock_t list_lock;
43
44 /* Thresholding related variants */
45 atomic_t pending;
46 int max_active;
47 int current_max;
48 int thresh;
49 unsigned int count;
50 spinlock_t thres_lock;
51};
25 52
26#define WORK_QUEUED_BIT 0 53struct btrfs_workqueue {
27#define WORK_DONE_BIT 1 54 struct __btrfs_workqueue *normal;
28#define WORK_ORDER_DONE_BIT 2 55 struct __btrfs_workqueue *high;
29#define WORK_HIGH_PRIO_BIT 3 56};
30
31/*
32 * container for the kthread task pointer and the list of pending work
33 * One of these is allocated per thread.
34 */
35struct btrfs_worker_thread {
36 /* pool we belong to */
37 struct btrfs_workers *workers;
38
39 /* list of struct btrfs_work that are waiting for service */
40 struct list_head pending;
41 struct list_head prio_pending;
42
43 /* list of worker threads from struct btrfs_workers */
44 struct list_head worker_list;
45
46 /* kthread */
47 struct task_struct *task;
48 57
49 /* number of things on the pending list */ 58static inline struct __btrfs_workqueue
50 atomic_t num_pending; 59*__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
60 int thresh)
61{
62 struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
51 63
52 /* reference counter for this struct */ 64 if (unlikely(!ret))
53 atomic_t refs; 65 return NULL;
54 66
55 unsigned long sequence; 67 ret->max_active = max_active;
68 atomic_set(&ret->pending, 0);
69 if (thresh == 0)
70 thresh = DFT_THRESHOLD;
71 /* For low threshold, disabling threshold is a better choice */
72 if (thresh < DFT_THRESHOLD) {
73 ret->current_max = max_active;
74 ret->thresh = NO_THRESHOLD;
75 } else {
76 ret->current_max = 1;
77 ret->thresh = thresh;
78 }
56 79
57 /* protects the pending list. */ 80 if (flags & WQ_HIGHPRI)
58 spinlock_t lock; 81 ret->normal_wq = alloc_workqueue("%s-%s-high", flags,
82 ret->max_active,
83 "btrfs", name);
84 else
85 ret->normal_wq = alloc_workqueue("%s-%s", flags,
86 ret->max_active, "btrfs",
87 name);
88 if (unlikely(!ret->normal_wq)) {
89 kfree(ret);
90 return NULL;
91 }
59 92
60 /* set to non-zero when this thread is already awake and kicking */ 93 INIT_LIST_HEAD(&ret->ordered_list);
61 int working; 94 spin_lock_init(&ret->list_lock);
95 spin_lock_init(&ret->thres_lock);
96 trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI);
97 return ret;
98}
62 99
63 /* are we currently idle */ 100static inline void
64 int idle; 101__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
65};
66 102
67static int __btrfs_start_workers(struct btrfs_workers *workers); 103struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
104 int flags,
105 int max_active,
106 int thresh)
107{
108 struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
68 109
69/* 110 if (unlikely(!ret))
70 * btrfs_start_workers uses kthread_run, which can block waiting for memory 111 return NULL;
71 * for a very long time. It will actually throttle on page writeback,
72 * and so it may not make progress until after our btrfs worker threads
73 * process all of the pending work structs in their queue
74 *
75 * This means we can't use btrfs_start_workers from inside a btrfs worker
76 * thread that is used as part of cleaning dirty memory, which pretty much
77 * involves all of the worker threads.
78 *
79 * Instead we have a helper queue who never has more than one thread
80 * where we scheduler thread start operations. This worker_start struct
81 * is used to contain the work and hold a pointer to the queue that needs
82 * another worker.
83 */
84struct worker_start {
85 struct btrfs_work work;
86 struct btrfs_workers *queue;
87};
88 112
89static void start_new_worker_func(struct btrfs_work *work) 113 ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
90{ 114 max_active, thresh);
91 struct worker_start *start; 115 if (unlikely(!ret->normal)) {
92 start = container_of(work, struct worker_start, work); 116 kfree(ret);
93 __btrfs_start_workers(start->queue); 117 return NULL;
94 kfree(start); 118 }
95}
96 119
97/* 120 if (flags & WQ_HIGHPRI) {
98 * helper function to move a thread onto the idle list after it 121 ret->high = __btrfs_alloc_workqueue(name, flags, max_active,
99 * has finished some requests. 122 thresh);
100 */ 123 if (unlikely(!ret->high)) {
101static void check_idle_worker(struct btrfs_worker_thread *worker) 124 __btrfs_destroy_workqueue(ret->normal);
102{ 125 kfree(ret);
103 if (!worker->idle && atomic_read(&worker->num_pending) < 126 return NULL;
104 worker->workers->idle_thresh / 2) {
105 unsigned long flags;
106 spin_lock_irqsave(&worker->workers->lock, flags);
107 worker->idle = 1;
108
109 /* the list may be empty if the worker is just starting */
110 if (!list_empty(&worker->worker_list) &&
111 !worker->workers->stopping) {
112 list_move(&worker->worker_list,
113 &worker->workers->idle_list);
114 } 127 }
115 spin_unlock_irqrestore(&worker->workers->lock, flags);
116 } 128 }
129 return ret;
117} 130}
118 131
119/* 132/*
120 * helper function to move a thread off the idle list after new 133 * Hook for threshold which will be called in btrfs_queue_work.
121 * pending work is added. 134 * This hook WILL be called in IRQ handler context,
135 * so workqueue_set_max_active MUST NOT be called in this hook
122 */ 136 */
123static void check_busy_worker(struct btrfs_worker_thread *worker) 137static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
124{ 138{
125 if (worker->idle && atomic_read(&worker->num_pending) >= 139 if (wq->thresh == NO_THRESHOLD)
126 worker->workers->idle_thresh) { 140 return;
127 unsigned long flags; 141 atomic_inc(&wq->pending);
128 spin_lock_irqsave(&worker->workers->lock, flags);
129 worker->idle = 0;
130
131 if (!list_empty(&worker->worker_list) &&
132 !worker->workers->stopping) {
133 list_move_tail(&worker->worker_list,
134 &worker->workers->worker_list);
135 }
136 spin_unlock_irqrestore(&worker->workers->lock, flags);
137 }
138} 142}
139 143
140static void check_pending_worker_creates(struct btrfs_worker_thread *worker) 144/*
145 * Hook for threshold which will be called before executing the work,
146 * This hook is called in kthread content.
147 * So workqueue_set_max_active is called here.
148 */
149static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
141{ 150{
142 struct btrfs_workers *workers = worker->workers; 151 int new_max_active;
143 struct worker_start *start; 152 long pending;
144 unsigned long flags; 153 int need_change = 0;
145 154
146 rmb(); 155 if (wq->thresh == NO_THRESHOLD)
147 if (!workers->atomic_start_pending)
148 return; 156 return;
149 157
150 start = kzalloc(sizeof(*start), GFP_NOFS); 158 atomic_dec(&wq->pending);
151 if (!start) 159 spin_lock(&wq->thres_lock);
152 return; 160 /*
153 161 * Use wq->count to limit the calling frequency of
154 start->work.func = start_new_worker_func; 162 * workqueue_set_max_active.
155 start->queue = workers; 163 */
156 164 wq->count++;
157 spin_lock_irqsave(&workers->lock, flags); 165 wq->count %= (wq->thresh / 4);
158 if (!workers->atomic_start_pending) 166 if (!wq->count)
159 goto out; 167 goto out;
160 168 new_max_active = wq->current_max;
161 workers->atomic_start_pending = 0;
162 if (workers->num_workers + workers->num_workers_starting >=
163 workers->max_workers)
164 goto out;
165
166 workers->num_workers_starting += 1;
167 spin_unlock_irqrestore(&workers->lock, flags);
168 btrfs_queue_worker(workers->atomic_worker_start, &start->work);
169 return;
170 169
170 /*
171 * pending may be changed later, but it's OK since we really
172 * don't need it so accurate to calculate new_max_active.
173 */
174 pending = atomic_read(&wq->pending);
175 if (pending > wq->thresh)
176 new_max_active++;
177 if (pending < wq->thresh / 2)
178 new_max_active--;
179 new_max_active = clamp_val(new_max_active, 1, wq->max_active);
180 if (new_max_active != wq->current_max) {
181 need_change = 1;
182 wq->current_max = new_max_active;
183 }
171out: 184out:
172 kfree(start); 185 spin_unlock(&wq->thres_lock);
173 spin_unlock_irqrestore(&workers->lock, flags); 186
187 if (need_change) {
188 workqueue_set_max_active(wq->normal_wq, wq->current_max);
189 }
174} 190}
175 191
176static noinline void run_ordered_completions(struct btrfs_workers *workers, 192static void run_ordered_work(struct __btrfs_workqueue *wq)
177 struct btrfs_work *work)
178{ 193{
179 if (!workers->ordered) 194 struct list_head *list = &wq->ordered_list;
180 return; 195 struct btrfs_work *work;
181 196 spinlock_t *lock = &wq->list_lock;
182 set_bit(WORK_DONE_BIT, &work->flags); 197 unsigned long flags;
183
184 spin_lock(&workers->order_lock);
185 198
186 while (1) { 199 while (1) {
187 if (!list_empty(&workers->prio_order_list)) { 200 spin_lock_irqsave(lock, flags);
188 work = list_entry(workers->prio_order_list.next, 201 if (list_empty(list))
189 struct btrfs_work, order_list);
190 } else if (!list_empty(&workers->order_list)) {
191 work = list_entry(workers->order_list.next,
192 struct btrfs_work, order_list);
193 } else {
194 break; 202 break;
195 } 203 work = list_entry(list->next, struct btrfs_work,
204 ordered_list);
196 if (!test_bit(WORK_DONE_BIT, &work->flags)) 205 if (!test_bit(WORK_DONE_BIT, &work->flags))
197 break; 206 break;
198 207
199 /* we are going to call the ordered done function, but 208 /*
209 * we are going to call the ordered done function, but
200 * we leave the work item on the list as a barrier so 210 * we leave the work item on the list as a barrier so
201 * that later work items that are done don't have their 211 * that later work items that are done don't have their
202 * functions called before this one returns 212 * functions called before this one returns
203 */ 213 */
204 if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) 214 if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
205 break; 215 break;
206 216 trace_btrfs_ordered_sched(work);
207 spin_unlock(&workers->order_lock); 217 spin_unlock_irqrestore(lock, flags);
208
209 work->ordered_func(work); 218 work->ordered_func(work);
210 219
211 /* now take the lock again and drop our item from the list */ 220 /* now take the lock again and drop our item from the list */
212 spin_lock(&workers->order_lock); 221 spin_lock_irqsave(lock, flags);
213 list_del(&work->order_list); 222 list_del(&work->ordered_list);
214 spin_unlock(&workers->order_lock); 223 spin_unlock_irqrestore(lock, flags);
215 224
216 /* 225 /*
217 * we don't want to call the ordered free functions 226 * we don't want to call the ordered free functions
218 * with the lock held though 227 * with the lock held though
219 */ 228 */
220 work->ordered_free(work); 229 work->ordered_free(work);
221 spin_lock(&workers->order_lock); 230 trace_btrfs_all_work_done(work);
222 }
223
224 spin_unlock(&workers->order_lock);
225}
226
227static void put_worker(struct btrfs_worker_thread *worker)
228{
229 if (atomic_dec_and_test(&worker->refs))
230 kfree(worker);
231}
232
233static int try_worker_shutdown(struct btrfs_worker_thread *worker)
234{
235 int freeit = 0;
236
237 spin_lock_irq(&worker->lock);
238 spin_lock(&worker->workers->lock);
239 if (worker->workers->num_workers > 1 &&
240 worker->idle &&
241 !worker->working &&
242 !list_empty(&worker->worker_list) &&
243 list_empty(&worker->prio_pending) &&
244 list_empty(&worker->pending) &&
245 atomic_read(&worker->num_pending) == 0) {
246 freeit = 1;
247 list_del_init(&worker->worker_list);
248 worker->workers->num_workers--;
249 } 231 }
250 spin_unlock(&worker->workers->lock); 232 spin_unlock_irqrestore(lock, flags);
251 spin_unlock_irq(&worker->lock);
252
253 if (freeit)
254 put_worker(worker);
255 return freeit;
256} 233}
257 234
258static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker, 235static void normal_work_helper(struct work_struct *arg)
259 struct list_head *prio_head,
260 struct list_head *head)
261{
262 struct btrfs_work *work = NULL;
263 struct list_head *cur = NULL;
264
265 if (!list_empty(prio_head))
266 cur = prio_head->next;
267
268 smp_mb();
269 if (!list_empty(&worker->prio_pending))
270 goto refill;
271
272 if (!list_empty(head))
273 cur = head->next;
274
275 if (cur)
276 goto out;
277
278refill:
279 spin_lock_irq(&worker->lock);
280 list_splice_tail_init(&worker->prio_pending, prio_head);
281 list_splice_tail_init(&worker->pending, head);
282
283 if (!list_empty(prio_head))
284 cur = prio_head->next;
285 else if (!list_empty(head))
286 cur = head->next;
287 spin_unlock_irq(&worker->lock);
288
289 if (!cur)
290 goto out_fail;
291
292out:
293 work = list_entry(cur, struct btrfs_work, list);
294
295out_fail:
296 return work;
297}
298
299/*
300 * main loop for servicing work items
301 */
302static int worker_loop(void *arg)
303{ 236{
304 struct btrfs_worker_thread *worker = arg;
305 struct list_head head;
306 struct list_head prio_head;
307 struct btrfs_work *work; 237 struct btrfs_work *work;
238 struct __btrfs_workqueue *wq;
239 int need_order = 0;
308 240
309 INIT_LIST_HEAD(&head); 241 work = container_of(arg, struct btrfs_work, normal_work);
310 INIT_LIST_HEAD(&prio_head); 242 /*
311 243 * We should not touch things inside work in the following cases:
312 do { 244 * 1) after work->func() if it has no ordered_free
313again: 245 * Since the struct is freed in work->func().
314 while (1) { 246 * 2) after setting WORK_DONE_BIT
315 247 * The work may be freed in other threads almost instantly.
316 248 * So we save the needed things here.
317 work = get_next_work(worker, &prio_head, &head); 249 */
318 if (!work) 250 if (work->ordered_func)
319 break; 251 need_order = 1;
320 252 wq = work->wq;
321 list_del(&work->list); 253
322 clear_bit(WORK_QUEUED_BIT, &work->flags); 254 trace_btrfs_work_sched(work);
323 255 thresh_exec_hook(wq);
324 work->worker = worker; 256 work->func(work);
325 257 if (need_order) {
326 work->func(work); 258 set_bit(WORK_DONE_BIT, &work->flags);
327 259 run_ordered_work(wq);
328 atomic_dec(&worker->num_pending);
329 /*
330 * unless this is an ordered work queue,
331 * 'work' was probably freed by func above.
332 */
333 run_ordered_completions(worker->workers, work);
334
335 check_pending_worker_creates(worker);
336 cond_resched();
337 }
338
339 spin_lock_irq(&worker->lock);
340 check_idle_worker(worker);
341
342 if (freezing(current)) {
343 worker->working = 0;
344 spin_unlock_irq(&worker->lock);
345 try_to_freeze();
346 } else {
347 spin_unlock_irq(&worker->lock);
348 if (!kthread_should_stop()) {
349 cpu_relax();
350 /*
351 * we've dropped the lock, did someone else
352 * jump_in?
353 */
354 smp_mb();
355 if (!list_empty(&worker->pending) ||
356 !list_empty(&worker->prio_pending))
357 continue;
358
359 /*
360 * this short schedule allows more work to
361 * come in without the queue functions
362 * needing to go through wake_up_process()
363 *
364 * worker->working is still 1, so nobody
365 * is going to try and wake us up
366 */
367 schedule_timeout(1);
368 smp_mb();
369 if (!list_empty(&worker->pending) ||
370 !list_empty(&worker->prio_pending))
371 continue;
372
373 if (kthread_should_stop())
374 break;
375
376 /* still no more work?, sleep for real */
377 spin_lock_irq(&worker->lock);
378 set_current_state(TASK_INTERRUPTIBLE);
379 if (!list_empty(&worker->pending) ||
380 !list_empty(&worker->prio_pending)) {
381 spin_unlock_irq(&worker->lock);
382 set_current_state(TASK_RUNNING);
383 goto again;
384 }
385
386 /*
387 * this makes sure we get a wakeup when someone
388 * adds something new to the queue
389 */
390 worker->working = 0;
391 spin_unlock_irq(&worker->lock);
392
393 if (!kthread_should_stop()) {
394 schedule_timeout(HZ * 120);
395 if (!worker->working &&
396 try_worker_shutdown(worker)) {
397 return 0;
398 }
399 }
400 }
401 __set_current_state(TASK_RUNNING);
402 }
403 } while (!kthread_should_stop());
404 return 0;
405}
406
407/*
408 * this will wait for all the worker threads to shutdown
409 */
410void btrfs_stop_workers(struct btrfs_workers *workers)
411{
412 struct list_head *cur;
413 struct btrfs_worker_thread *worker;
414 int can_stop;
415
416 spin_lock_irq(&workers->lock);
417 workers->stopping = 1;
418 list_splice_init(&workers->idle_list, &workers->worker_list);
419 while (!list_empty(&workers->worker_list)) {
420 cur = workers->worker_list.next;
421 worker = list_entry(cur, struct btrfs_worker_thread,
422 worker_list);
423
424 atomic_inc(&worker->refs);
425 workers->num_workers -= 1;
426 if (!list_empty(&worker->worker_list)) {
427 list_del_init(&worker->worker_list);
428 put_worker(worker);
429 can_stop = 1;
430 } else
431 can_stop = 0;
432 spin_unlock_irq(&workers->lock);
433 if (can_stop)
434 kthread_stop(worker->task);
435 spin_lock_irq(&workers->lock);
436 put_worker(worker);
437 } 260 }
438 spin_unlock_irq(&workers->lock); 261 if (!need_order)
262 trace_btrfs_all_work_done(work);
439} 263}
440 264
441/* 265void btrfs_init_work(struct btrfs_work *work,
442 * simple init on struct btrfs_workers 266 btrfs_func_t func,
443 */ 267 btrfs_func_t ordered_func,
444void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, 268 btrfs_func_t ordered_free)
445 struct btrfs_workers *async_helper)
446{ 269{
447 workers->num_workers = 0; 270 work->func = func;
448 workers->num_workers_starting = 0; 271 work->ordered_func = ordered_func;
449 INIT_LIST_HEAD(&workers->worker_list); 272 work->ordered_free = ordered_free;
450 INIT_LIST_HEAD(&workers->idle_list); 273 INIT_WORK(&work->normal_work, normal_work_helper);
451 INIT_LIST_HEAD(&workers->order_list); 274 INIT_LIST_HEAD(&work->ordered_list);
452 INIT_LIST_HEAD(&workers->prio_order_list); 275 work->flags = 0;
453 spin_lock_init(&workers->lock);
454 spin_lock_init(&workers->order_lock);
455 workers->max_workers = max;
456 workers->idle_thresh = 32;
457 workers->name = name;
458 workers->ordered = 0;
459 workers->atomic_start_pending = 0;
460 workers->atomic_worker_start = async_helper;
461 workers->stopping = 0;
462} 276}
463 277
464/* 278static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
465 * starts new worker threads. This does not enforce the max worker 279 struct btrfs_work *work)
466 * count in case you need to temporarily go past it.
467 */
468static int __btrfs_start_workers(struct btrfs_workers *workers)
469{ 280{
470 struct btrfs_worker_thread *worker; 281 unsigned long flags;
471 int ret = 0;
472
473 worker = kzalloc(sizeof(*worker), GFP_NOFS);
474 if (!worker) {
475 ret = -ENOMEM;
476 goto fail;
477 }
478
479 INIT_LIST_HEAD(&worker->pending);
480 INIT_LIST_HEAD(&worker->prio_pending);
481 INIT_LIST_HEAD(&worker->worker_list);
482 spin_lock_init(&worker->lock);
483
484 atomic_set(&worker->num_pending, 0);
485 atomic_set(&worker->refs, 1);
486 worker->workers = workers;
487 worker->task = kthread_create(worker_loop, worker,
488 "btrfs-%s-%d", workers->name,
489 workers->num_workers + 1);
490 if (IS_ERR(worker->task)) {
491 ret = PTR_ERR(worker->task);
492 goto fail;
493 }
494 282
495 spin_lock_irq(&workers->lock); 283 work->wq = wq;
496 if (workers->stopping) { 284 thresh_queue_hook(wq);
497 spin_unlock_irq(&workers->lock); 285 if (work->ordered_func) {
498 ret = -EINVAL; 286 spin_lock_irqsave(&wq->list_lock, flags);
499 goto fail_kthread; 287 list_add_tail(&work->ordered_list, &wq->ordered_list);
288 spin_unlock_irqrestore(&wq->list_lock, flags);
500 } 289 }
501 list_add_tail(&worker->worker_list, &workers->idle_list); 290 queue_work(wq->normal_wq, &work->normal_work);
502 worker->idle = 1; 291 trace_btrfs_work_queued(work);
503 workers->num_workers++;
504 workers->num_workers_starting--;
505 WARN_ON(workers->num_workers_starting < 0);
506 spin_unlock_irq(&workers->lock);
507
508 wake_up_process(worker->task);
509 return 0;
510
511fail_kthread:
512 kthread_stop(worker->task);
513fail:
514 kfree(worker);
515 spin_lock_irq(&workers->lock);
516 workers->num_workers_starting--;
517 spin_unlock_irq(&workers->lock);
518 return ret;
519} 292}
520 293
521int btrfs_start_workers(struct btrfs_workers *workers) 294void btrfs_queue_work(struct btrfs_workqueue *wq,
295 struct btrfs_work *work)
522{ 296{
523 spin_lock_irq(&workers->lock); 297 struct __btrfs_workqueue *dest_wq;
524 workers->num_workers_starting++;
525 spin_unlock_irq(&workers->lock);
526 return __btrfs_start_workers(workers);
527}
528
529/*
530 * run through the list and find a worker thread that doesn't have a lot
531 * to do right now. This can return null if we aren't yet at the thread
532 * count limit and all of the threads are busy.
533 */
534static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
535{
536 struct btrfs_worker_thread *worker;
537 struct list_head *next;
538 int enforce_min;
539
540 enforce_min = (workers->num_workers + workers->num_workers_starting) <
541 workers->max_workers;
542
543 /*
544 * if we find an idle thread, don't move it to the end of the
545 * idle list. This improves the chance that the next submission
546 * will reuse the same thread, and maybe catch it while it is still
547 * working
548 */
549 if (!list_empty(&workers->idle_list)) {
550 next = workers->idle_list.next;
551 worker = list_entry(next, struct btrfs_worker_thread,
552 worker_list);
553 return worker;
554 }
555 if (enforce_min || list_empty(&workers->worker_list))
556 return NULL;
557
558 /*
559 * if we pick a busy task, move the task to the end of the list.
560 * hopefully this will keep things somewhat evenly balanced.
561 * Do the move in batches based on the sequence number. This groups
562 * requests submitted at roughly the same time onto the same worker.
563 */
564 next = workers->worker_list.next;
565 worker = list_entry(next, struct btrfs_worker_thread, worker_list);
566 worker->sequence++;
567 298
568 if (worker->sequence % workers->idle_thresh == 0) 299 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high)
569 list_move_tail(next, &workers->worker_list); 300 dest_wq = wq->high;
570 return worker; 301 else
302 dest_wq = wq->normal;
303 __btrfs_queue_work(dest_wq, work);
571} 304}
572 305
573/* 306static inline void
574 * selects a worker thread to take the next job. This will either find 307__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq)
575 * an idle worker, start a new worker up to the max count, or just return
576 * one of the existing busy workers.
577 */
578static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
579{ 308{
580 struct btrfs_worker_thread *worker; 309 destroy_workqueue(wq->normal_wq);
581 unsigned long flags; 310 trace_btrfs_workqueue_destroy(wq);
582 struct list_head *fallback; 311 kfree(wq);
583 int ret;
584
585 spin_lock_irqsave(&workers->lock, flags);
586again:
587 worker = next_worker(workers);
588
589 if (!worker) {
590 if (workers->num_workers + workers->num_workers_starting >=
591 workers->max_workers) {
592 goto fallback;
593 } else if (workers->atomic_worker_start) {
594 workers->atomic_start_pending = 1;
595 goto fallback;
596 } else {
597 workers->num_workers_starting++;
598 spin_unlock_irqrestore(&workers->lock, flags);
599 /* we're below the limit, start another worker */
600 ret = __btrfs_start_workers(workers);
601 spin_lock_irqsave(&workers->lock, flags);
602 if (ret)
603 goto fallback;
604 goto again;
605 }
606 }
607 goto found;
608
609fallback:
610 fallback = NULL;
611 /*
612 * we have failed to find any workers, just
613 * return the first one we can find.
614 */
615 if (!list_empty(&workers->worker_list))
616 fallback = workers->worker_list.next;
617 if (!list_empty(&workers->idle_list))
618 fallback = workers->idle_list.next;
619 BUG_ON(!fallback);
620 worker = list_entry(fallback,
621 struct btrfs_worker_thread, worker_list);
622found:
623 /*
624 * this makes sure the worker doesn't exit before it is placed
625 * onto a busy/idle list
626 */
627 atomic_inc(&worker->num_pending);
628 spin_unlock_irqrestore(&workers->lock, flags);
629 return worker;
630} 312}
631 313
632/* 314void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
633 * btrfs_requeue_work just puts the work item back on the tail of the list
634 * it was taken from. It is intended for use with long running work functions
635 * that make some progress and want to give the cpu up for others.
636 */
637void btrfs_requeue_work(struct btrfs_work *work)
638{ 315{
639 struct btrfs_worker_thread *worker = work->worker; 316 if (!wq)
640 unsigned long flags;
641 int wake = 0;
642
643 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
644 return; 317 return;
645 318 if (wq->high)
646 spin_lock_irqsave(&worker->lock, flags); 319 __btrfs_destroy_workqueue(wq->high);
647 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) 320 __btrfs_destroy_workqueue(wq->normal);
648 list_add_tail(&work->list, &worker->prio_pending); 321 kfree(wq);
649 else
650 list_add_tail(&work->list, &worker->pending);
651 atomic_inc(&worker->num_pending);
652
653 /* by definition we're busy, take ourselves off the idle
654 * list
655 */
656 if (worker->idle) {
657 spin_lock(&worker->workers->lock);
658 worker->idle = 0;
659 list_move_tail(&worker->worker_list,
660 &worker->workers->worker_list);
661 spin_unlock(&worker->workers->lock);
662 }
663 if (!worker->working) {
664 wake = 1;
665 worker->working = 1;
666 }
667
668 if (wake)
669 wake_up_process(worker->task);
670 spin_unlock_irqrestore(&worker->lock, flags);
671} 322}
672 323
673void btrfs_set_work_high_prio(struct btrfs_work *work) 324void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max)
674{ 325{
675 set_bit(WORK_HIGH_PRIO_BIT, &work->flags); 326 wq->normal->max_active = max;
327 if (wq->high)
328 wq->high->max_active = max;
676} 329}
677 330
678/* 331void btrfs_set_work_high_priority(struct btrfs_work *work)
679 * places a struct btrfs_work into the pending queue of one of the kthreads
680 */
681void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
682{ 332{
683 struct btrfs_worker_thread *worker; 333 set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
684 unsigned long flags;
685 int wake = 0;
686
687 /* don't requeue something already on a list */
688 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
689 return;
690
691 worker = find_worker(workers);
692 if (workers->ordered) {
693 /*
694 * you're not allowed to do ordered queues from an
695 * interrupt handler
696 */
697 spin_lock(&workers->order_lock);
698 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
699 list_add_tail(&work->order_list,
700 &workers->prio_order_list);
701 } else {
702 list_add_tail(&work->order_list, &workers->order_list);
703 }
704 spin_unlock(&workers->order_lock);
705 } else {
706 INIT_LIST_HEAD(&work->order_list);
707 }
708
709 spin_lock_irqsave(&worker->lock, flags);
710
711 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
712 list_add_tail(&work->list, &worker->prio_pending);
713 else
714 list_add_tail(&work->list, &worker->pending);
715 check_busy_worker(worker);
716
717 /*
718 * avoid calling into wake_up_process if this thread has already
719 * been kicked
720 */
721 if (!worker->working)
722 wake = 1;
723 worker->working = 1;
724
725 if (wake)
726 wake_up_process(worker->task);
727 spin_unlock_irqrestore(&worker->lock, flags);
728} 334}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 1f26792683ed..9c6b66d15fb0 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (C) 2007 Oracle. All rights reserved. 2 * Copyright (C) 2007 Oracle. All rights reserved.
3 * Copyright (C) 2014 Fujitsu. All rights reserved.
3 * 4 *
4 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public 6 * modify it under the terms of the GNU General Public
@@ -19,103 +20,35 @@
19#ifndef __BTRFS_ASYNC_THREAD_ 20#ifndef __BTRFS_ASYNC_THREAD_
20#define __BTRFS_ASYNC_THREAD_ 21#define __BTRFS_ASYNC_THREAD_
21 22
22struct btrfs_worker_thread; 23struct btrfs_workqueue;
24/* Internal use only */
25struct __btrfs_workqueue;
26struct btrfs_work;
27typedef void (*btrfs_func_t)(struct btrfs_work *arg);
23 28
24/*
25 * This is similar to a workqueue, but it is meant to spread the operations
26 * across all available cpus instead of just the CPU that was used to
27 * queue the work. There is also some batching introduced to try and
28 * cut down on context switches.
29 *
30 * By default threads are added on demand up to 2 * the number of cpus.
31 * Changing struct btrfs_workers->max_workers is one way to prevent
32 * demand creation of kthreads.
33 *
34 * the basic model of these worker threads is to embed a btrfs_work
35 * structure in your own data struct, and use container_of in a
36 * work function to get back to your data struct.
37 */
38struct btrfs_work { 29struct btrfs_work {
39 /* 30 btrfs_func_t func;
40 * func should be set to the function you want called 31 btrfs_func_t ordered_func;
41 * your work struct is passed as the only arg 32 btrfs_func_t ordered_free;
42 * 33
43 * ordered_func must be set for work sent to an ordered work queue, 34 /* Don't touch things below */
44 * and it is called to complete a given work item in the same 35 struct work_struct normal_work;
45 * order they were sent to the queue. 36 struct list_head ordered_list;
46 */ 37 struct __btrfs_workqueue *wq;
47 void (*func)(struct btrfs_work *work);
48 void (*ordered_func)(struct btrfs_work *work);
49 void (*ordered_free)(struct btrfs_work *work);
50
51 /*
52 * flags should be set to zero. It is used to make sure the
53 * struct is only inserted once into the list.
54 */
55 unsigned long flags; 38 unsigned long flags;
56
57 /* don't touch these */
58 struct btrfs_worker_thread *worker;
59 struct list_head list;
60 struct list_head order_list;
61};
62
63struct btrfs_workers {
64 /* current number of running workers */
65 int num_workers;
66
67 int num_workers_starting;
68
69 /* max number of workers allowed. changed by btrfs_start_workers */
70 int max_workers;
71
72 /* once a worker has this many requests or fewer, it is idle */
73 int idle_thresh;
74
75 /* force completions in the order they were queued */
76 int ordered;
77
78 /* more workers required, but in an interrupt handler */
79 int atomic_start_pending;
80
81 /*
82 * are we allowed to sleep while starting workers or are we required
83 * to start them at a later time? If we can't sleep, this indicates
84 * which queue we need to use to schedule thread creation.
85 */
86 struct btrfs_workers *atomic_worker_start;
87
88 /* list with all the work threads. The workers on the idle thread
89 * may be actively servicing jobs, but they haven't yet hit the
90 * idle thresh limit above.
91 */
92 struct list_head worker_list;
93 struct list_head idle_list;
94
95 /*
96 * when operating in ordered mode, this maintains the list
97 * of work items waiting for completion
98 */
99 struct list_head order_list;
100 struct list_head prio_order_list;
101
102 /* lock for finding the next worker thread to queue on */
103 spinlock_t lock;
104
105 /* lock for the ordered lists */
106 spinlock_t order_lock;
107
108 /* extra name for this worker, used for current->name */
109 char *name;
110
111 int stopping;
112}; 39};
113 40
114void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); 41struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
115int btrfs_start_workers(struct btrfs_workers *workers); 42 int flags,
116void btrfs_stop_workers(struct btrfs_workers *workers); 43 int max_active,
117void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, 44 int thresh);
118 struct btrfs_workers *async_starter); 45void btrfs_init_work(struct btrfs_work *work,
119void btrfs_requeue_work(struct btrfs_work *work); 46 btrfs_func_t func,
120void btrfs_set_work_high_prio(struct btrfs_work *work); 47 btrfs_func_t ordered_func,
48 btrfs_func_t ordered_free);
49void btrfs_queue_work(struct btrfs_workqueue *wq,
50 struct btrfs_work *work);
51void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
52void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
53void btrfs_set_work_high_priority(struct btrfs_work *work);
121#endif 54#endif
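
The new header above keeps the old usage model (embed a struct btrfs_work in the caller's own structure and recover it with container_of), but ordering is now requested per work item rather than per queue: passing ordered_func/ordered_free to btrfs_init_work makes completions run in queueing order via the ordered_list handling in async-thread.c. A hedged continuation of the earlier sketch, with hypothetical caller-side names:

	static void my_ordered_done(struct btrfs_work *work)
	{
		/* runs in queueing order, only after my_work_func() has finished */
	}

	static void my_ordered_free(struct btrfs_work *work)
	{
		kfree(container_of(work, struct my_async_ctx, work));
	}

	static void my_submit_ordered(struct btrfs_fs_info *fs_info, struct my_async_ctx *ctx)
	{
		btrfs_init_work(&ctx->work, my_work_func, my_ordered_done, my_ordered_free);
		/*
		 * Optional: route to the high-priority queue; only effective if the
		 * workqueue was allocated with WQ_HIGHPRI so that wq->high exists.
		 */
		btrfs_set_work_high_priority(&ctx->work);
		btrfs_queue_work(fs_info->workers, &ctx->work);
	}
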
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index aded3ef3d3d4..aad7201ad11b 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -220,7 +220,8 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
220 220
221static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, 221static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
222 struct ulist *parents, struct __prelim_ref *ref, 222 struct ulist *parents, struct __prelim_ref *ref,
223 int level, u64 time_seq, const u64 *extent_item_pos) 223 int level, u64 time_seq, const u64 *extent_item_pos,
224 u64 total_refs)
224{ 225{
225 int ret = 0; 226 int ret = 0;
226 int slot; 227 int slot;
@@ -249,7 +250,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
249 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) 250 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
250 ret = btrfs_next_old_leaf(root, path, time_seq); 251 ret = btrfs_next_old_leaf(root, path, time_seq);
251 252
252 while (!ret && count < ref->count) { 253 while (!ret && count < total_refs) {
253 eb = path->nodes[0]; 254 eb = path->nodes[0];
254 slot = path->slots[0]; 255 slot = path->slots[0];
255 256
@@ -306,7 +307,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
306 struct btrfs_path *path, u64 time_seq, 307 struct btrfs_path *path, u64 time_seq,
307 struct __prelim_ref *ref, 308 struct __prelim_ref *ref,
308 struct ulist *parents, 309 struct ulist *parents,
309 const u64 *extent_item_pos) 310 const u64 *extent_item_pos, u64 total_refs)
310{ 311{
311 struct btrfs_root *root; 312 struct btrfs_root *root;
312 struct btrfs_key root_key; 313 struct btrfs_key root_key;
@@ -361,7 +362,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
361 } 362 }
362 363
363 ret = add_all_parents(root, path, parents, ref, level, time_seq, 364 ret = add_all_parents(root, path, parents, ref, level, time_seq,
364 extent_item_pos); 365 extent_item_pos, total_refs);
365out: 366out:
366 path->lowest_level = 0; 367 path->lowest_level = 0;
367 btrfs_release_path(path); 368 btrfs_release_path(path);
@@ -374,7 +375,7 @@ out:
374static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, 375static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
375 struct btrfs_path *path, u64 time_seq, 376 struct btrfs_path *path, u64 time_seq,
376 struct list_head *head, 377 struct list_head *head,
377 const u64 *extent_item_pos) 378 const u64 *extent_item_pos, u64 total_refs)
378{ 379{
379 int err; 380 int err;
380 int ret = 0; 381 int ret = 0;
@@ -400,7 +401,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
400 if (ref->count == 0) 401 if (ref->count == 0)
401 continue; 402 continue;
402 err = __resolve_indirect_ref(fs_info, path, time_seq, ref, 403 err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
403 parents, extent_item_pos); 404 parents, extent_item_pos,
405 total_refs);
404 /* 406 /*
405 * we can only tolerate ENOENT,otherwise,we should catch error 407 * we can only tolerate ENOENT,otherwise,we should catch error
406 * and return directly. 408 * and return directly.
@@ -557,7 +559,7 @@ static void __merge_refs(struct list_head *head, int mode)
557 * smaller or equal that seq to the list 559 * smaller or equal that seq to the list
558 */ 560 */
559static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, 561static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
560 struct list_head *prefs) 562 struct list_head *prefs, u64 *total_refs)
561{ 563{
562 struct btrfs_delayed_extent_op *extent_op = head->extent_op; 564 struct btrfs_delayed_extent_op *extent_op = head->extent_op;
563 struct rb_node *n = &head->node.rb_node; 565 struct rb_node *n = &head->node.rb_node;
@@ -593,6 +595,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
593 default: 595 default:
594 BUG_ON(1); 596 BUG_ON(1);
595 } 597 }
598 *total_refs += (node->ref_mod * sgn);
596 switch (node->type) { 599 switch (node->type) {
597 case BTRFS_TREE_BLOCK_REF_KEY: { 600 case BTRFS_TREE_BLOCK_REF_KEY: {
598 struct btrfs_delayed_tree_ref *ref; 601 struct btrfs_delayed_tree_ref *ref;
@@ -653,7 +656,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
653 */ 656 */
654static int __add_inline_refs(struct btrfs_fs_info *fs_info, 657static int __add_inline_refs(struct btrfs_fs_info *fs_info,
655 struct btrfs_path *path, u64 bytenr, 658 struct btrfs_path *path, u64 bytenr,
656 int *info_level, struct list_head *prefs) 659 int *info_level, struct list_head *prefs,
660 u64 *total_refs)
657{ 661{
658 int ret = 0; 662 int ret = 0;
659 int slot; 663 int slot;
@@ -677,6 +681,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
677 681
678 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); 682 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
679 flags = btrfs_extent_flags(leaf, ei); 683 flags = btrfs_extent_flags(leaf, ei);
684 *total_refs += btrfs_extent_refs(leaf, ei);
680 btrfs_item_key_to_cpu(leaf, &found_key, slot); 685 btrfs_item_key_to_cpu(leaf, &found_key, slot);
681 686
682 ptr = (unsigned long)(ei + 1); 687 ptr = (unsigned long)(ei + 1);
@@ -859,6 +864,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
859 struct list_head prefs; 864 struct list_head prefs;
860 struct __prelim_ref *ref; 865 struct __prelim_ref *ref;
861 struct extent_inode_elem *eie = NULL; 866 struct extent_inode_elem *eie = NULL;
867 u64 total_refs = 0;
862 868
863 INIT_LIST_HEAD(&prefs); 869 INIT_LIST_HEAD(&prefs);
864 INIT_LIST_HEAD(&prefs_delayed); 870 INIT_LIST_HEAD(&prefs_delayed);
@@ -873,8 +879,10 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
873 path = btrfs_alloc_path(); 879 path = btrfs_alloc_path();
874 if (!path) 880 if (!path)
875 return -ENOMEM; 881 return -ENOMEM;
876 if (!trans) 882 if (!trans) {
877 path->search_commit_root = 1; 883 path->search_commit_root = 1;
884 path->skip_locking = 1;
885 }
878 886
879 /* 887 /*
880 * grab both a lock on the path and a lock on the delayed ref head. 888 * grab both a lock on the path and a lock on the delayed ref head.
@@ -915,7 +923,7 @@ again:
915 } 923 }
916 spin_unlock(&delayed_refs->lock); 924 spin_unlock(&delayed_refs->lock);
917 ret = __add_delayed_refs(head, time_seq, 925 ret = __add_delayed_refs(head, time_seq,
918 &prefs_delayed); 926 &prefs_delayed, &total_refs);
919 mutex_unlock(&head->mutex); 927 mutex_unlock(&head->mutex);
920 if (ret) 928 if (ret)
921 goto out; 929 goto out;
@@ -936,7 +944,8 @@ again:
936 (key.type == BTRFS_EXTENT_ITEM_KEY || 944 (key.type == BTRFS_EXTENT_ITEM_KEY ||
937 key.type == BTRFS_METADATA_ITEM_KEY)) { 945 key.type == BTRFS_METADATA_ITEM_KEY)) {
938 ret = __add_inline_refs(fs_info, path, bytenr, 946 ret = __add_inline_refs(fs_info, path, bytenr,
939 &info_level, &prefs); 947 &info_level, &prefs,
948 &total_refs);
940 if (ret) 949 if (ret)
941 goto out; 950 goto out;
942 ret = __add_keyed_refs(fs_info, path, bytenr, 951 ret = __add_keyed_refs(fs_info, path, bytenr,
@@ -956,7 +965,7 @@ again:
956 __merge_refs(&prefs, 1); 965 __merge_refs(&prefs, 1);
957 966
958 ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs, 967 ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
959 extent_item_pos); 968 extent_item_pos, total_refs);
960 if (ret) 969 if (ret)
961 goto out; 970 goto out;
962 971
@@ -965,7 +974,7 @@ again:
965 while (!list_empty(&prefs)) { 974 while (!list_empty(&prefs)) {
966 ref = list_first_entry(&prefs, struct __prelim_ref, list); 975 ref = list_first_entry(&prefs, struct __prelim_ref, list);
967 WARN_ON(ref->count < 0); 976 WARN_ON(ref->count < 0);
968 if (ref->count && ref->root_id && ref->parent == 0) { 977 if (roots && ref->count && ref->root_id && ref->parent == 0) {
969 /* no parent == root of tree */ 978 /* no parent == root of tree */
970 ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); 979 ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
971 if (ret < 0) 980 if (ret < 0)
@@ -1061,22 +1070,14 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
1061 u64 time_seq, struct ulist **leafs, 1070 u64 time_seq, struct ulist **leafs,
1062 const u64 *extent_item_pos) 1071 const u64 *extent_item_pos)
1063{ 1072{
1064 struct ulist *tmp;
1065 int ret; 1073 int ret;
1066 1074
1067 tmp = ulist_alloc(GFP_NOFS);
1068 if (!tmp)
1069 return -ENOMEM;
1070 *leafs = ulist_alloc(GFP_NOFS); 1075 *leafs = ulist_alloc(GFP_NOFS);
1071 if (!*leafs) { 1076 if (!*leafs)
1072 ulist_free(tmp);
1073 return -ENOMEM; 1077 return -ENOMEM;
1074 }
1075 1078
1076 ret = find_parent_nodes(trans, fs_info, bytenr, 1079 ret = find_parent_nodes(trans, fs_info, bytenr,
1077 time_seq, *leafs, tmp, extent_item_pos); 1080 time_seq, *leafs, NULL, extent_item_pos);
1078 ulist_free(tmp);
1079
1080 if (ret < 0 && ret != -ENOENT) { 1081 if (ret < 0 && ret != -ENOENT) {
1081 free_leaf_list(*leafs); 1082 free_leaf_list(*leafs);
1082 return ret; 1083 return ret;
@@ -1333,38 +1334,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
1333 if (ret < 0) 1334 if (ret < 0)
1334 return ret; 1335 return ret;
1335 1336
1336 while (1) { 1337 ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0);
1337 u32 nritems; 1338 if (ret) {
1338 if (path->slots[0] == 0) { 1339 if (ret > 0)
1339 btrfs_set_path_blocking(path); 1340 ret = -ENOENT;
1340 ret = btrfs_prev_leaf(fs_info->extent_root, path); 1341 return ret;
1341 if (ret != 0) {
1342 if (ret > 0) {
1343 pr_debug("logical %llu is not within "
1344 "any extent\n", logical);
1345 ret = -ENOENT;
1346 }
1347 return ret;
1348 }
1349 } else {
1350 path->slots[0]--;
1351 }
1352 nritems = btrfs_header_nritems(path->nodes[0]);
1353 if (nritems == 0) {
1354 pr_debug("logical %llu is not within any extent\n",
1355 logical);
1356 return -ENOENT;
1357 }
1358 if (path->slots[0] == nritems)
1359 path->slots[0]--;
1360
1361 btrfs_item_key_to_cpu(path->nodes[0], found_key,
1362 path->slots[0]);
1363 if (found_key->type == BTRFS_EXTENT_ITEM_KEY ||
1364 found_key->type == BTRFS_METADATA_ITEM_KEY)
1365 break;
1366 } 1342 }
1367 1343 btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
1368 if (found_key->type == BTRFS_METADATA_ITEM_KEY) 1344 if (found_key->type == BTRFS_METADATA_ITEM_KEY)
1369 size = fs_info->extent_root->leafsize; 1345 size = fs_info->extent_root->leafsize;
1370 else if (found_key->type == BTRFS_EXTENT_ITEM_KEY) 1346 else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 8fed2125689e..c9a24444ec9a 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -109,14 +109,17 @@ struct btrfs_inode {
109 u64 last_trans; 109 u64 last_trans;
110 110
111 /* 111 /*
112 * log transid when this inode was last modified 112 * transid that last logged this inode
113 */ 113 */
114 u64 last_sub_trans; 114 u64 logged_trans;
115 115
116 /* 116 /*
117 * transid that last logged this inode 117 * log transid when this inode was last modified
118 */ 118 */
119 u64 logged_trans; 119 int last_sub_trans;
120
121 /* a local copy of root's last_log_commit */
122 int last_log_commit;
120 123
121 /* total number of bytes pending delalloc, used by stat to calc the 124 /* total number of bytes pending delalloc, used by stat to calc the
122 * real block usage of the file 125 * real block usage of the file
@@ -155,9 +158,6 @@ struct btrfs_inode {
155 /* flags field from the on disk inode */ 158 /* flags field from the on disk inode */
156 u32 flags; 159 u32 flags;
157 160
158 /* a local copy of root's last_log_commit */
159 unsigned long last_log_commit;
160
161 /* 161 /*
162 * Counters to keep track of the number of extent item's we may use due 162 * Counters to keep track of the number of extent item's we may use due
163 * to delalloc and such. outstanding_extents is the number of extent 163 * to delalloc and such. outstanding_extents is the number of extent
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cbd3a7d6fa68..88d1b1eedc9c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -5376,6 +5376,8 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5376 int advance_right; 5376 int advance_right;
5377 u64 left_blockptr; 5377 u64 left_blockptr;
5378 u64 right_blockptr; 5378 u64 right_blockptr;
5379 u64 left_gen;
5380 u64 right_gen;
5379 u64 left_start_ctransid; 5381 u64 left_start_ctransid;
5380 u64 right_start_ctransid; 5382 u64 right_start_ctransid;
5381 u64 ctransid; 5383 u64 ctransid;
@@ -5640,7 +5642,14 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5640 right_blockptr = btrfs_node_blockptr( 5642 right_blockptr = btrfs_node_blockptr(
5641 right_path->nodes[right_level], 5643 right_path->nodes[right_level],
5642 right_path->slots[right_level]); 5644 right_path->slots[right_level]);
5643 if (left_blockptr == right_blockptr) { 5645 left_gen = btrfs_node_ptr_generation(
5646 left_path->nodes[left_level],
5647 left_path->slots[left_level]);
5648 right_gen = btrfs_node_ptr_generation(
5649 right_path->nodes[right_level],
5650 right_path->slots[right_level]);
5651 if (left_blockptr == right_blockptr &&
5652 left_gen == right_gen) {
5644 /* 5653 /*
5645 * As we're on a shared block, don't 5654 * As we're on a shared block, don't
5646 * allow to go deeper. 5655 * allow to go deeper.
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2c1a42ca519f..bc96c03dd259 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -351,6 +351,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
351#define BTRFS_FS_STATE_ERROR 0 351#define BTRFS_FS_STATE_ERROR 0
352#define BTRFS_FS_STATE_REMOUNTING 1 352#define BTRFS_FS_STATE_REMOUNTING 1
353#define BTRFS_FS_STATE_TRANS_ABORTED 2 353#define BTRFS_FS_STATE_TRANS_ABORTED 2
354#define BTRFS_FS_STATE_DEV_REPLACING 3
354 355
355/* Super block flags */ 356/* Super block flags */
356/* Errors detected */ 357/* Errors detected */
@@ -1489,6 +1490,7 @@ struct btrfs_fs_info {
1489 */ 1490 */
1490 struct list_head ordered_roots; 1491 struct list_head ordered_roots;
1491 1492
1493 struct mutex delalloc_root_mutex;
1492 spinlock_t delalloc_root_lock; 1494 spinlock_t delalloc_root_lock;
1493 /* all fs/file tree roots that have delalloc inodes. */ 1495 /* all fs/file tree roots that have delalloc inodes. */
1494 struct list_head delalloc_roots; 1496 struct list_head delalloc_roots;
@@ -1503,28 +1505,27 @@ struct btrfs_fs_info {
1503 * A third pool does submit_bio to avoid deadlocking with the other 1505 * A third pool does submit_bio to avoid deadlocking with the other
1504 * two 1506 * two
1505 */ 1507 */
1506 struct btrfs_workers generic_worker; 1508 struct btrfs_workqueue *workers;
1507 struct btrfs_workers workers; 1509 struct btrfs_workqueue *delalloc_workers;
1508 struct btrfs_workers delalloc_workers; 1510 struct btrfs_workqueue *flush_workers;
1509 struct btrfs_workers flush_workers; 1511 struct btrfs_workqueue *endio_workers;
1510 struct btrfs_workers endio_workers; 1512 struct btrfs_workqueue *endio_meta_workers;
1511 struct btrfs_workers endio_meta_workers; 1513 struct btrfs_workqueue *endio_raid56_workers;
1512 struct btrfs_workers endio_raid56_workers; 1514 struct btrfs_workqueue *rmw_workers;
1513 struct btrfs_workers rmw_workers; 1515 struct btrfs_workqueue *endio_meta_write_workers;
1514 struct btrfs_workers endio_meta_write_workers; 1516 struct btrfs_workqueue *endio_write_workers;
1515 struct btrfs_workers endio_write_workers; 1517 struct btrfs_workqueue *endio_freespace_worker;
1516 struct btrfs_workers endio_freespace_worker; 1518 struct btrfs_workqueue *submit_workers;
1517 struct btrfs_workers submit_workers; 1519 struct btrfs_workqueue *caching_workers;
1518 struct btrfs_workers caching_workers; 1520 struct btrfs_workqueue *readahead_workers;
1519 struct btrfs_workers readahead_workers;
1520 1521
1521 /* 1522 /*
1522 * fixup workers take dirty pages that didn't properly go through 1523 * fixup workers take dirty pages that didn't properly go through
1523 * the cow mechanism and make them safe to write. It happens 1524 * the cow mechanism and make them safe to write. It happens
1524 * for the sys_munmap function call path 1525 * for the sys_munmap function call path
1525 */ 1526 */
1526 struct btrfs_workers fixup_workers; 1527 struct btrfs_workqueue *fixup_workers;
1527 struct btrfs_workers delayed_workers; 1528 struct btrfs_workqueue *delayed_workers;
1528 struct task_struct *transaction_kthread; 1529 struct task_struct *transaction_kthread;
1529 struct task_struct *cleaner_kthread; 1530 struct task_struct *cleaner_kthread;
1530 int thread_pool_size; 1531 int thread_pool_size;
@@ -1604,9 +1605,9 @@ struct btrfs_fs_info {
1604 atomic_t scrub_cancel_req; 1605 atomic_t scrub_cancel_req;
1605 wait_queue_head_t scrub_pause_wait; 1606 wait_queue_head_t scrub_pause_wait;
1606 int scrub_workers_refcnt; 1607 int scrub_workers_refcnt;
1607 struct btrfs_workers scrub_workers; 1608 struct btrfs_workqueue *scrub_workers;
1608 struct btrfs_workers scrub_wr_completion_workers; 1609 struct btrfs_workqueue *scrub_wr_completion_workers;
1609 struct btrfs_workers scrub_nocow_workers; 1610 struct btrfs_workqueue *scrub_nocow_workers;
1610 1611
1611#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 1612#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1612 u32 check_integrity_print_mask; 1613 u32 check_integrity_print_mask;
@@ -1647,7 +1648,7 @@ struct btrfs_fs_info {
1647 /* qgroup rescan items */ 1648 /* qgroup rescan items */
1648 struct mutex qgroup_rescan_lock; /* protects the progress item */ 1649 struct mutex qgroup_rescan_lock; /* protects the progress item */
1649 struct btrfs_key qgroup_rescan_progress; 1650 struct btrfs_key qgroup_rescan_progress;
1650 struct btrfs_workers qgroup_rescan_workers; 1651 struct btrfs_workqueue *qgroup_rescan_workers;
1651 struct completion qgroup_rescan_completion; 1652 struct completion qgroup_rescan_completion;
1652 struct btrfs_work qgroup_rescan_work; 1653 struct btrfs_work qgroup_rescan_work;
1653 1654
@@ -1674,10 +1675,18 @@ struct btrfs_fs_info {
1674 1675
1675 atomic_t mutually_exclusive_operation_running; 1676 atomic_t mutually_exclusive_operation_running;
1676 1677
1678 struct percpu_counter bio_counter;
1679 wait_queue_head_t replace_wait;
1680
1677 struct semaphore uuid_tree_rescan_sem; 1681 struct semaphore uuid_tree_rescan_sem;
1678 unsigned int update_uuid_tree_gen:1; 1682 unsigned int update_uuid_tree_gen:1;
1679}; 1683};
1680 1684
1685struct btrfs_subvolume_writers {
1686 struct percpu_counter counter;
1687 wait_queue_head_t wait;
1688};
1689
1681/* 1690/*
1682 * in ram representation of the tree. extent_root is used for all allocations 1691 * in ram representation of the tree. extent_root is used for all allocations
1683 * and for the extent tree extent_root root. 1692 * and for the extent tree extent_root root.
@@ -1714,11 +1723,15 @@ struct btrfs_root {
1714 struct mutex log_mutex; 1723 struct mutex log_mutex;
1715 wait_queue_head_t log_writer_wait; 1724 wait_queue_head_t log_writer_wait;
1716 wait_queue_head_t log_commit_wait[2]; 1725 wait_queue_head_t log_commit_wait[2];
1726 struct list_head log_ctxs[2];
1717 atomic_t log_writers; 1727 atomic_t log_writers;
1718 atomic_t log_commit[2]; 1728 atomic_t log_commit[2];
1719 atomic_t log_batch; 1729 atomic_t log_batch;
1720 unsigned long log_transid; 1730 int log_transid;
1721 unsigned long last_log_commit; 1731 /* No matter the commit succeeds or not*/
1732 int log_transid_committed;
1733 /* Just be updated when the commit succeeds. */
1734 int last_log_commit;
1722 pid_t log_start_pid; 1735 pid_t log_start_pid;
1723 bool log_multiple_pids; 1736 bool log_multiple_pids;
1724 1737
@@ -1793,6 +1806,7 @@ struct btrfs_root {
1793 spinlock_t root_item_lock; 1806 spinlock_t root_item_lock;
1794 atomic_t refs; 1807 atomic_t refs;
1795 1808
1809 struct mutex delalloc_mutex;
1796 spinlock_t delalloc_lock; 1810 spinlock_t delalloc_lock;
1797 /* 1811 /*
1798 * all of the inodes that have delalloc bytes. It is possible for 1812 * all of the inodes that have delalloc bytes. It is possible for
@@ -1802,6 +1816,8 @@ struct btrfs_root {
1802 struct list_head delalloc_inodes; 1816 struct list_head delalloc_inodes;
1803 struct list_head delalloc_root; 1817 struct list_head delalloc_root;
1804 u64 nr_delalloc_inodes; 1818 u64 nr_delalloc_inodes;
1819
1820 struct mutex ordered_extent_mutex;
1805 /* 1821 /*
1806 * this is used by the balancing code to wait for all the pending 1822 * this is used by the balancing code to wait for all the pending
1807 * ordered extents 1823 * ordered extents
@@ -1822,6 +1838,8 @@ struct btrfs_root {
1822 * manipulation with the read-only status via SUBVOL_SETFLAGS 1838 * manipulation with the read-only status via SUBVOL_SETFLAGS
1823 */ 1839 */
1824 int send_in_progress; 1840 int send_in_progress;
1841 struct btrfs_subvolume_writers *subv_writers;
1842 atomic_t will_be_snapshoted;
1825}; 1843};
1826 1844
1827struct btrfs_ioctl_defrag_range_args { 1845struct btrfs_ioctl_defrag_range_args {
@@ -3346,6 +3364,9 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
3346int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, 3364int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
3347 struct btrfs_fs_info *fs_info); 3365 struct btrfs_fs_info *fs_info);
3348int __get_raid_index(u64 flags); 3366int __get_raid_index(u64 flags);
3367
3368int btrfs_start_nocow_write(struct btrfs_root *root);
3369void btrfs_end_nocow_write(struct btrfs_root *root);
3349/* ctree.c */ 3370/* ctree.c */
3350int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 3371int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
3351 int level, int *slot); 3372 int level, int *slot);
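The two prototypes added in the hunk above, btrfs_start_nocow_write() and btrfs_end_nocow_write(), are implemented later in this series in extent-tree.c. Judging from that implementation (start returns 0 when a snapshot is pending and 1 when the caller may proceed), the expected caller pattern is a try-lock style gate around a nocow write. The caller below is invented purely for illustration; only the two btrfs_*_nocow_write() calls come from this series, do_cow_write()/do_nocow_write() are stand-ins:

static int write_range_nocow(struct btrfs_root *root, u64 start, u64 len)
{
	int ret;

	if (!btrfs_start_nocow_write(root)) {
		/* a snapshot is pending: fall back to an ordinary COW write */
		return do_cow_write(root, start, len);	/* hypothetical helper */
	}

	ret = do_nocow_write(root, start, len);		/* hypothetical helper */
	btrfs_end_nocow_write(root);
	return ret;
}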
@@ -3723,7 +3744,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3723 u32 min_type); 3744 u32 min_type);
3724 3745
3725int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); 3746int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
3726int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput); 3747int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
3748 int nr);
3727int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 3749int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
3728 struct extent_state **cached_state); 3750 struct extent_state **cached_state);
3729int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 3751int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -4005,6 +4027,11 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
4005int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, 4027int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
4006 struct btrfs_scrub_progress *progress); 4028 struct btrfs_scrub_progress *progress);
4007 4029
4030/* dev-replace.c */
4031void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
4032void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
4033void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info);
4034
4008/* reada.c */ 4035/* reada.c */
4009struct reada_control { 4036struct reada_control {
4010 struct btrfs_root *root; /* tree to prefetch */ 4037 struct btrfs_root *root; /* tree to prefetch */
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 451b00c86f6c..33e561a84013 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1392,11 +1392,11 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
1392 return -ENOMEM; 1392 return -ENOMEM;
1393 1393
1394 async_work->delayed_root = delayed_root; 1394 async_work->delayed_root = delayed_root;
1395 async_work->work.func = btrfs_async_run_delayed_root; 1395 btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root,
1396 async_work->work.flags = 0; 1396 NULL, NULL);
1397 async_work->nr = nr; 1397 async_work->nr = nr;
1398 1398
1399 btrfs_queue_worker(&root->fs_info->delayed_workers, &async_work->work); 1399 btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work);
1400 return 0; 1400 return 0;
1401} 1401}
1402 1402
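This small hunk shows the conversion pattern repeated throughout the series: instead of poking work.func and work.flags directly and queueing on an embedded struct btrfs_workers, callers now call btrfs_init_work(work, func, ordered_func, ordered_free) and hand the item to btrfs_queue_work() on a struct btrfs_workqueue pointer. A minimal sketch of the new calling convention; struct my_async_work, do_async() and queue_my_work() are made-up names, and only the two btrfs_* calls plus the delayed_workers queue appear in this diff:

struct my_async_work {
	struct btrfs_work work;		/* embedded work item, as in async_work */
	u64 payload;
};

static void do_async(struct btrfs_work *work)
{
	struct my_async_work *w = container_of(work, struct my_async_work, work);

	/* ... process w->payload asynchronously ... */
	kfree(w);
}

static int queue_my_work(struct btrfs_fs_info *fs_info, u64 payload)
{
	struct my_async_work *w = kmalloc(sizeof(*w), GFP_NOFS);

	if (!w)
		return -ENOMEM;
	w->payload = payload;
	/* no ordered completion or free callbacks in this example */
	btrfs_init_work(&w->work, do_async, NULL, NULL);
	btrfs_queue_work(fs_info->delayed_workers, &w->work);
	return 0;
}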
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index f3bff89eecf0..31299646024d 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -199,44 +199,31 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
199 */ 199 */
200static struct btrfs_delayed_ref_head * 200static struct btrfs_delayed_ref_head *
201find_ref_head(struct rb_root *root, u64 bytenr, 201find_ref_head(struct rb_root *root, u64 bytenr,
202 struct btrfs_delayed_ref_head **last, int return_bigger) 202 int return_bigger)
203{ 203{
204 struct rb_node *n; 204 struct rb_node *n;
205 struct btrfs_delayed_ref_head *entry; 205 struct btrfs_delayed_ref_head *entry;
206 int cmp = 0;
207 206
208again:
209 n = root->rb_node; 207 n = root->rb_node;
210 entry = NULL; 208 entry = NULL;
211 while (n) { 209 while (n) {
212 entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); 210 entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
213 if (last)
214 *last = entry;
215 211
216 if (bytenr < entry->node.bytenr) 212 if (bytenr < entry->node.bytenr)
217 cmp = -1;
218 else if (bytenr > entry->node.bytenr)
219 cmp = 1;
220 else
221 cmp = 0;
222
223 if (cmp < 0)
224 n = n->rb_left; 213 n = n->rb_left;
225 else if (cmp > 0) 214 else if (bytenr > entry->node.bytenr)
226 n = n->rb_right; 215 n = n->rb_right;
227 else 216 else
228 return entry; 217 return entry;
229 } 218 }
230 if (entry && return_bigger) { 219 if (entry && return_bigger) {
231 if (cmp > 0) { 220 if (bytenr > entry->node.bytenr) {
232 n = rb_next(&entry->href_node); 221 n = rb_next(&entry->href_node);
233 if (!n) 222 if (!n)
234 n = rb_first(root); 223 n = rb_first(root);
235 entry = rb_entry(n, struct btrfs_delayed_ref_head, 224 entry = rb_entry(n, struct btrfs_delayed_ref_head,
236 href_node); 225 href_node);
237 bytenr = entry->node.bytenr; 226 return entry;
238 return_bigger = 0;
239 goto again;
240 } 227 }
241 return entry; 228 return entry;
242 } 229 }
@@ -415,12 +402,12 @@ btrfs_select_ref_head(struct btrfs_trans_handle *trans)
415 402
416again: 403again:
417 start = delayed_refs->run_delayed_start; 404 start = delayed_refs->run_delayed_start;
418 head = find_ref_head(&delayed_refs->href_root, start, NULL, 1); 405 head = find_ref_head(&delayed_refs->href_root, start, 1);
419 if (!head && !loop) { 406 if (!head && !loop) {
420 delayed_refs->run_delayed_start = 0; 407 delayed_refs->run_delayed_start = 0;
421 start = 0; 408 start = 0;
422 loop = true; 409 loop = true;
423 head = find_ref_head(&delayed_refs->href_root, start, NULL, 1); 410 head = find_ref_head(&delayed_refs->href_root, start, 1);
424 if (!head) 411 if (!head)
425 return NULL; 412 return NULL;
426 } else if (!head && loop) { 413 } else if (!head && loop) {
@@ -508,6 +495,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
508 ref = btrfs_delayed_node_to_head(update); 495 ref = btrfs_delayed_node_to_head(update);
509 BUG_ON(existing_ref->is_data != ref->is_data); 496 BUG_ON(existing_ref->is_data != ref->is_data);
510 497
498 spin_lock(&existing_ref->lock);
511 if (ref->must_insert_reserved) { 499 if (ref->must_insert_reserved) {
512 /* if the extent was freed and then 500 /* if the extent was freed and then
513 * reallocated before the delayed ref 501 * reallocated before the delayed ref
@@ -549,7 +537,6 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
549 * only need the lock for this case cause we could be processing it 537 * only need the lock for this case cause we could be processing it
550 * currently, for refs we just added we know we're a-ok. 538 * currently, for refs we just added we know we're a-ok.
551 */ 539 */
552 spin_lock(&existing_ref->lock);
553 existing->ref_mod += update->ref_mod; 540 existing->ref_mod += update->ref_mod;
554 spin_unlock(&existing_ref->lock); 541 spin_unlock(&existing_ref->lock);
555} 542}
@@ -898,7 +885,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
898 struct btrfs_delayed_ref_root *delayed_refs; 885 struct btrfs_delayed_ref_root *delayed_refs;
899 886
900 delayed_refs = &trans->transaction->delayed_refs; 887 delayed_refs = &trans->transaction->delayed_refs;
901 return find_ref_head(&delayed_refs->href_root, bytenr, NULL, 0); 888 return find_ref_head(&delayed_refs->href_root, bytenr, 0);
902} 889}
903 890
904void btrfs_delayed_ref_exit(void) 891void btrfs_delayed_ref_exit(void)
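The rewritten find_ref_head() drops the unused 'last' out-parameter and the retry loop: when return_bigger is set and the probe ended below the requested bytenr, it simply steps to the next head (wrapping to the first) and returns it. Below is a self-contained userspace analog of the same "exact match, or next bigger with wrap-around" lookup, using a sorted array in place of the rbtree:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/types.h>

/*
 * Return the index of the entry equal to 'key', or - when return_bigger is
 * set - the next larger entry, wrapping to the smallest one. Returns -1 when
 * the array is empty or (without return_bigger) the key is absent.
 */
static ssize_t find_equal_or_bigger(const uint64_t *set, size_t n,
				    uint64_t key, int return_bigger)
{
	size_t lo = 0, hi = n;

	if (n == 0)
		return -1;
	while (lo < hi) {			/* binary search: first entry >= key */
		size_t mid = lo + (hi - lo) / 2;

		if (set[mid] < key)
			lo = mid + 1;
		else
			hi = mid;
	}
	if (lo < n && set[lo] == key)
		return (ssize_t)lo;		/* exact match */
	if (!return_bigger)
		return -1;
	return (ssize_t)(lo < n ? lo : 0);	/* next bigger, wrapping around */
}

int main(void)
{
	const uint64_t heads[] = { 10, 20, 30 };

	printf("%zd\n", find_equal_or_bigger(heads, 3, 20, 0));	/* 1 */
	printf("%zd\n", find_equal_or_bigger(heads, 3, 25, 1));	/* 2 */
	printf("%zd\n", find_equal_or_bigger(heads, 3, 35, 1));	/* 0 (wrapped) */
	return 0;
}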
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 564c92638b20..9f2290509aca 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -431,6 +431,35 @@ leave_no_lock:
431 return ret; 431 return ret;
432} 432}
433 433
434/*
435 * block new bios and wait until all in-flight bios have finished.
436 */
437static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
438{
439 s64 writers;
440 DEFINE_WAIT(wait);
441
442 set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
443 do {
444 prepare_to_wait(&fs_info->replace_wait, &wait,
445 TASK_UNINTERRUPTIBLE);
446 writers = percpu_counter_sum(&fs_info->bio_counter);
447 if (writers)
448 schedule();
449 finish_wait(&fs_info->replace_wait, &wait);
450 } while (writers);
451}
452
453/*
454 * we have removed the target device, so it is safe to allow new bio requests.
455 */
456static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
457{
458 clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
459 if (waitqueue_active(&fs_info->replace_wait))
460 wake_up(&fs_info->replace_wait);
461}
462
434static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, 463static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
435 int scrub_ret) 464 int scrub_ret)
436{ 465{
@@ -458,17 +487,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
458 src_device = dev_replace->srcdev; 487 src_device = dev_replace->srcdev;
459 btrfs_dev_replace_unlock(dev_replace); 488 btrfs_dev_replace_unlock(dev_replace);
460 489
461 /* replace old device with new one in mapping tree */
462 if (!scrub_ret)
463 btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
464 src_device,
465 tgt_device);
466
467 /* 490 /*
468 * flush all outstanding I/O and inode extent mappings before the 491 * flush all outstanding I/O and inode extent mappings before the
469 * copy operation is declared as being finished 492 * copy operation is declared as being finished
470 */ 493 */
471 ret = btrfs_start_delalloc_roots(root->fs_info, 0); 494 ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
472 if (ret) { 495 if (ret) {
473 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 496 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
474 return ret; 497 return ret;
@@ -484,6 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
484 WARN_ON(ret); 507 WARN_ON(ret);
485 508
486 /* keep away write_all_supers() during the finishing procedure */ 509 /* keep away write_all_supers() during the finishing procedure */
510 mutex_lock(&root->fs_info->chunk_mutex);
487 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 511 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
488 btrfs_dev_replace_lock(dev_replace); 512 btrfs_dev_replace_lock(dev_replace);
489 dev_replace->replace_state = 513 dev_replace->replace_state =
@@ -494,7 +518,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
494 dev_replace->time_stopped = get_seconds(); 518 dev_replace->time_stopped = get_seconds();
495 dev_replace->item_needs_writeback = 1; 519 dev_replace->item_needs_writeback = 1;
496 520
497 if (scrub_ret) { 521 /* replace old device with new one in mapping tree */
522 if (!scrub_ret) {
523 btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
524 src_device,
525 tgt_device);
526 } else {
498 printk_in_rcu(KERN_ERR 527 printk_in_rcu(KERN_ERR
499 "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", 528 "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
500 src_device->missing ? "<missing disk>" : 529 src_device->missing ? "<missing disk>" :
@@ -503,6 +532,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
503 rcu_str_deref(tgt_device->name), scrub_ret); 532 rcu_str_deref(tgt_device->name), scrub_ret);
504 btrfs_dev_replace_unlock(dev_replace); 533 btrfs_dev_replace_unlock(dev_replace);
505 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 534 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
535 mutex_unlock(&root->fs_info->chunk_mutex);
506 if (tgt_device) 536 if (tgt_device)
507 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); 537 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
508 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 538 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
@@ -532,8 +562,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
532 fs_info->fs_devices->latest_bdev = tgt_device->bdev; 562 fs_info->fs_devices->latest_bdev = tgt_device->bdev;
533 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); 563 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
534 564
565 btrfs_rm_dev_replace_blocked(fs_info);
566
535 btrfs_rm_dev_replace_srcdev(fs_info, src_device); 567 btrfs_rm_dev_replace_srcdev(fs_info, src_device);
536 568
569 btrfs_rm_dev_replace_unblocked(fs_info);
570
537 /* 571 /*
538 * this is again a consistent state where no dev_replace procedure 572 * this is again a consistent state where no dev_replace procedure
539 * is running, the target device is part of the filesystem, the 573 * is running, the target device is part of the filesystem, the
@@ -543,6 +577,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
543 */ 577 */
544 btrfs_dev_replace_unlock(dev_replace); 578 btrfs_dev_replace_unlock(dev_replace);
545 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 579 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
580 mutex_unlock(&root->fs_info->chunk_mutex);
546 581
547 /* write back the superblocks */ 582 /* write back the superblocks */
548 trans = btrfs_start_transaction(root, 0); 583 trans = btrfs_start_transaction(root, 0);
@@ -862,3 +897,31 @@ void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
862 mutex_unlock(&dev_replace->lock_management_lock); 897 mutex_unlock(&dev_replace->lock_management_lock);
863 } 898 }
864} 899}
900
901void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
902{
903 percpu_counter_inc(&fs_info->bio_counter);
904}
905
906void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
907{
908 percpu_counter_dec(&fs_info->bio_counter);
909
910 if (waitqueue_active(&fs_info->replace_wait))
911 wake_up(&fs_info->replace_wait);
912}
913
914void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
915{
916 DEFINE_WAIT(wait);
917again:
918 percpu_counter_inc(&fs_info->bio_counter);
919 if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) {
920 btrfs_bio_counter_dec(fs_info);
921 wait_event(fs_info->replace_wait,
922 !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
923 &fs_info->fs_state));
924 goto again;
925 }
926
927}
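The three btrfs_bio_counter_*() helpers together with btrfs_rm_dev_replace_blocked()/_unblocked() form a simple gate: bio submitters take a counted reference unless a device replace is in its critical phase, and the replace path first blocks new submitters, then waits for the in-flight ones to drain before removing the source device. A rough userspace analog using a mutex and condition variable instead of a percpu counter and wait queue (all names are invented):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static long bio_counter;		/* analog of fs_info->bio_counter */
static bool replacing;			/* analog of BTRFS_FS_STATE_DEV_REPLACING */

static void bio_counter_inc_blocked(void)
{
	pthread_mutex_lock(&lock);
	while (replacing)		/* new I/O waits out the critical phase */
		pthread_cond_wait(&cond, &lock);
	bio_counter++;
	pthread_mutex_unlock(&lock);
}

static void bio_counter_dec(void)
{
	pthread_mutex_lock(&lock);
	bio_counter--;
	pthread_cond_broadcast(&cond);	/* wake a waiting replace thread */
	pthread_mutex_unlock(&lock);
}

static void rm_dev_replace_blocked(void)
{
	pthread_mutex_lock(&lock);
	replacing = true;		/* stop new submitters ... */
	while (bio_counter)		/* ... and drain the in-flight ones */
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
}

static void rm_dev_replace_unblocked(void)
{
	pthread_mutex_lock(&lock);
	replacing = false;
	pthread_cond_broadcast(&cond);	/* let blocked submitters proceed */
	pthread_mutex_unlock(&lock);
}

The kernel version uses a percpu counter so that the common inc/dec path stays cheap; only the replace path pays for summing it with percpu_counter_sum().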
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 81ea55314b1f..bd0f752b797b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -678,32 +678,31 @@ static void end_workqueue_bio(struct bio *bio, int err)
678 678
679 fs_info = end_io_wq->info; 679 fs_info = end_io_wq->info;
680 end_io_wq->error = err; 680 end_io_wq->error = err;
681 end_io_wq->work.func = end_workqueue_fn; 681 btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
682 end_io_wq->work.flags = 0;
683 682
684 if (bio->bi_rw & REQ_WRITE) { 683 if (bio->bi_rw & REQ_WRITE) {
685 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) 684 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
686 btrfs_queue_worker(&fs_info->endio_meta_write_workers, 685 btrfs_queue_work(fs_info->endio_meta_write_workers,
687 &end_io_wq->work); 686 &end_io_wq->work);
688 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) 687 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
689 btrfs_queue_worker(&fs_info->endio_freespace_worker, 688 btrfs_queue_work(fs_info->endio_freespace_worker,
690 &end_io_wq->work); 689 &end_io_wq->work);
691 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) 690 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
692 btrfs_queue_worker(&fs_info->endio_raid56_workers, 691 btrfs_queue_work(fs_info->endio_raid56_workers,
693 &end_io_wq->work); 692 &end_io_wq->work);
694 else 693 else
695 btrfs_queue_worker(&fs_info->endio_write_workers, 694 btrfs_queue_work(fs_info->endio_write_workers,
696 &end_io_wq->work); 695 &end_io_wq->work);
697 } else { 696 } else {
698 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) 697 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
699 btrfs_queue_worker(&fs_info->endio_raid56_workers, 698 btrfs_queue_work(fs_info->endio_raid56_workers,
700 &end_io_wq->work); 699 &end_io_wq->work);
701 else if (end_io_wq->metadata) 700 else if (end_io_wq->metadata)
702 btrfs_queue_worker(&fs_info->endio_meta_workers, 701 btrfs_queue_work(fs_info->endio_meta_workers,
703 &end_io_wq->work); 702 &end_io_wq->work);
704 else 703 else
705 btrfs_queue_worker(&fs_info->endio_workers, 704 btrfs_queue_work(fs_info->endio_workers,
706 &end_io_wq->work); 705 &end_io_wq->work);
707 } 706 }
708} 707}
709 708
@@ -738,7 +737,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
738unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) 737unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
739{ 738{
740 unsigned long limit = min_t(unsigned long, 739 unsigned long limit = min_t(unsigned long,
741 info->workers.max_workers, 740 info->thread_pool_size,
742 info->fs_devices->open_devices); 741 info->fs_devices->open_devices);
743 return 256 * limit; 742 return 256 * limit;
744} 743}
@@ -811,11 +810,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
811 async->submit_bio_start = submit_bio_start; 810 async->submit_bio_start = submit_bio_start;
812 async->submit_bio_done = submit_bio_done; 811 async->submit_bio_done = submit_bio_done;
813 812
814 async->work.func = run_one_async_start; 813 btrfs_init_work(&async->work, run_one_async_start,
815 async->work.ordered_func = run_one_async_done; 814 run_one_async_done, run_one_async_free);
816 async->work.ordered_free = run_one_async_free;
817 815
818 async->work.flags = 0;
819 async->bio_flags = bio_flags; 816 async->bio_flags = bio_flags;
820 async->bio_offset = bio_offset; 817 async->bio_offset = bio_offset;
821 818
@@ -824,9 +821,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
824 atomic_inc(&fs_info->nr_async_submits); 821 atomic_inc(&fs_info->nr_async_submits);
825 822
826 if (rw & REQ_SYNC) 823 if (rw & REQ_SYNC)
827 btrfs_set_work_high_prio(&async->work); 824 btrfs_set_work_high_priority(&async->work);
828 825
829 btrfs_queue_worker(&fs_info->workers, &async->work); 826 btrfs_queue_work(fs_info->workers, &async->work);
830 827
831 while (atomic_read(&fs_info->async_submit_draining) && 828 while (atomic_read(&fs_info->async_submit_draining) &&
832 atomic_read(&fs_info->nr_async_submits)) { 829 atomic_read(&fs_info->nr_async_submits)) {
@@ -1149,6 +1146,32 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1149 } 1146 }
1150} 1147}
1151 1148
1149static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
1150{
1151 struct btrfs_subvolume_writers *writers;
1152 int ret;
1153
1154 writers = kmalloc(sizeof(*writers), GFP_NOFS);
1155 if (!writers)
1156 return ERR_PTR(-ENOMEM);
1157
1158 ret = percpu_counter_init(&writers->counter, 0);
1159 if (ret < 0) {
1160 kfree(writers);
1161 return ERR_PTR(ret);
1162 }
1163
1164 init_waitqueue_head(&writers->wait);
1165 return writers;
1166}
1167
1168static void
1169btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
1170{
1171 percpu_counter_destroy(&writers->counter);
1172 kfree(writers);
1173}
1174
1152static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, 1175static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1153 u32 stripesize, struct btrfs_root *root, 1176 u32 stripesize, struct btrfs_root *root,
1154 struct btrfs_fs_info *fs_info, 1177 struct btrfs_fs_info *fs_info,
@@ -1194,16 +1217,22 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1194 spin_lock_init(&root->log_extents_lock[1]); 1217 spin_lock_init(&root->log_extents_lock[1]);
1195 mutex_init(&root->objectid_mutex); 1218 mutex_init(&root->objectid_mutex);
1196 mutex_init(&root->log_mutex); 1219 mutex_init(&root->log_mutex);
1220 mutex_init(&root->ordered_extent_mutex);
1221 mutex_init(&root->delalloc_mutex);
1197 init_waitqueue_head(&root->log_writer_wait); 1222 init_waitqueue_head(&root->log_writer_wait);
1198 init_waitqueue_head(&root->log_commit_wait[0]); 1223 init_waitqueue_head(&root->log_commit_wait[0]);
1199 init_waitqueue_head(&root->log_commit_wait[1]); 1224 init_waitqueue_head(&root->log_commit_wait[1]);
1225 INIT_LIST_HEAD(&root->log_ctxs[0]);
1226 INIT_LIST_HEAD(&root->log_ctxs[1]);
1200 atomic_set(&root->log_commit[0], 0); 1227 atomic_set(&root->log_commit[0], 0);
1201 atomic_set(&root->log_commit[1], 0); 1228 atomic_set(&root->log_commit[1], 0);
1202 atomic_set(&root->log_writers, 0); 1229 atomic_set(&root->log_writers, 0);
1203 atomic_set(&root->log_batch, 0); 1230 atomic_set(&root->log_batch, 0);
1204 atomic_set(&root->orphan_inodes, 0); 1231 atomic_set(&root->orphan_inodes, 0);
1205 atomic_set(&root->refs, 1); 1232 atomic_set(&root->refs, 1);
1233 atomic_set(&root->will_be_snapshoted, 0);
1206 root->log_transid = 0; 1234 root->log_transid = 0;
1235 root->log_transid_committed = -1;
1207 root->last_log_commit = 0; 1236 root->last_log_commit = 0;
1208 if (fs_info) 1237 if (fs_info)
1209 extent_io_tree_init(&root->dirty_log_pages, 1238 extent_io_tree_init(&root->dirty_log_pages,
@@ -1417,6 +1446,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1417 WARN_ON(root->log_root); 1446 WARN_ON(root->log_root);
1418 root->log_root = log_root; 1447 root->log_root = log_root;
1419 root->log_transid = 0; 1448 root->log_transid = 0;
1449 root->log_transid_committed = -1;
1420 root->last_log_commit = 0; 1450 root->last_log_commit = 0;
1421 return 0; 1451 return 0;
1422} 1452}
@@ -1498,6 +1528,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
1498int btrfs_init_fs_root(struct btrfs_root *root) 1528int btrfs_init_fs_root(struct btrfs_root *root)
1499{ 1529{
1500 int ret; 1530 int ret;
1531 struct btrfs_subvolume_writers *writers;
1501 1532
1502 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); 1533 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
1503 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), 1534 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
@@ -1507,6 +1538,13 @@ int btrfs_init_fs_root(struct btrfs_root *root)
1507 goto fail; 1538 goto fail;
1508 } 1539 }
1509 1540
1541 writers = btrfs_alloc_subvolume_writers();
1542 if (IS_ERR(writers)) {
1543 ret = PTR_ERR(writers);
1544 goto fail;
1545 }
1546 root->subv_writers = writers;
1547
1510 btrfs_init_free_ino_ctl(root); 1548 btrfs_init_free_ino_ctl(root);
1511 mutex_init(&root->fs_commit_mutex); 1549 mutex_init(&root->fs_commit_mutex);
1512 spin_lock_init(&root->cache_lock); 1550 spin_lock_init(&root->cache_lock);
@@ -1514,8 +1552,11 @@ int btrfs_init_fs_root(struct btrfs_root *root)
1514 1552
1515 ret = get_anon_bdev(&root->anon_dev); 1553 ret = get_anon_bdev(&root->anon_dev);
1516 if (ret) 1554 if (ret)
1517 goto fail; 1555 goto free_writers;
1518 return 0; 1556 return 0;
1557
1558free_writers:
1559 btrfs_free_subvolume_writers(root->subv_writers);
1519fail: 1560fail:
1520 kfree(root->free_ino_ctl); 1561 kfree(root->free_ino_ctl);
1521 kfree(root->free_ino_pinned); 1562 kfree(root->free_ino_pinned);
@@ -1990,23 +2031,22 @@ static noinline int next_root_backup(struct btrfs_fs_info *info,
1990/* helper to cleanup workers */ 2031/* helper to cleanup workers */
1991static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) 2032static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
1992{ 2033{
1993 btrfs_stop_workers(&fs_info->generic_worker); 2034 btrfs_destroy_workqueue(fs_info->fixup_workers);
1994 btrfs_stop_workers(&fs_info->fixup_workers); 2035 btrfs_destroy_workqueue(fs_info->delalloc_workers);
1995 btrfs_stop_workers(&fs_info->delalloc_workers); 2036 btrfs_destroy_workqueue(fs_info->workers);
1996 btrfs_stop_workers(&fs_info->workers); 2037 btrfs_destroy_workqueue(fs_info->endio_workers);
1997 btrfs_stop_workers(&fs_info->endio_workers); 2038 btrfs_destroy_workqueue(fs_info->endio_meta_workers);
1998 btrfs_stop_workers(&fs_info->endio_meta_workers); 2039 btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
1999 btrfs_stop_workers(&fs_info->endio_raid56_workers); 2040 btrfs_destroy_workqueue(fs_info->rmw_workers);
2000 btrfs_stop_workers(&fs_info->rmw_workers); 2041 btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
2001 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2042 btrfs_destroy_workqueue(fs_info->endio_write_workers);
2002 btrfs_stop_workers(&fs_info->endio_write_workers); 2043 btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
2003 btrfs_stop_workers(&fs_info->endio_freespace_worker); 2044 btrfs_destroy_workqueue(fs_info->submit_workers);
2004 btrfs_stop_workers(&fs_info->submit_workers); 2045 btrfs_destroy_workqueue(fs_info->delayed_workers);
2005 btrfs_stop_workers(&fs_info->delayed_workers); 2046 btrfs_destroy_workqueue(fs_info->caching_workers);
2006 btrfs_stop_workers(&fs_info->caching_workers); 2047 btrfs_destroy_workqueue(fs_info->readahead_workers);
2007 btrfs_stop_workers(&fs_info->readahead_workers); 2048 btrfs_destroy_workqueue(fs_info->flush_workers);
2008 btrfs_stop_workers(&fs_info->flush_workers); 2049 btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
2009 btrfs_stop_workers(&fs_info->qgroup_rescan_workers);
2010} 2050}
2011 2051
2012static void free_root_extent_buffers(struct btrfs_root *root) 2052static void free_root_extent_buffers(struct btrfs_root *root)
@@ -2097,6 +2137,8 @@ int open_ctree(struct super_block *sb,
2097 int err = -EINVAL; 2137 int err = -EINVAL;
2098 int num_backups_tried = 0; 2138 int num_backups_tried = 0;
2099 int backup_index = 0; 2139 int backup_index = 0;
2140 int max_active;
2141 int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
2100 bool create_uuid_tree; 2142 bool create_uuid_tree;
2101 bool check_uuid_tree; 2143 bool check_uuid_tree;
2102 2144
@@ -2133,10 +2175,16 @@ int open_ctree(struct super_block *sb,
2133 goto fail_dirty_metadata_bytes; 2175 goto fail_dirty_metadata_bytes;
2134 } 2176 }
2135 2177
2178 ret = percpu_counter_init(&fs_info->bio_counter, 0);
2179 if (ret) {
2180 err = ret;
2181 goto fail_delalloc_bytes;
2182 }
2183
2136 fs_info->btree_inode = new_inode(sb); 2184 fs_info->btree_inode = new_inode(sb);
2137 if (!fs_info->btree_inode) { 2185 if (!fs_info->btree_inode) {
2138 err = -ENOMEM; 2186 err = -ENOMEM;
2139 goto fail_delalloc_bytes; 2187 goto fail_bio_counter;
2140 } 2188 }
2141 2189
2142 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); 2190 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -2159,6 +2207,7 @@ int open_ctree(struct super_block *sb,
2159 spin_lock_init(&fs_info->buffer_lock); 2207 spin_lock_init(&fs_info->buffer_lock);
2160 rwlock_init(&fs_info->tree_mod_log_lock); 2208 rwlock_init(&fs_info->tree_mod_log_lock);
2161 mutex_init(&fs_info->reloc_mutex); 2209 mutex_init(&fs_info->reloc_mutex);
2210 mutex_init(&fs_info->delalloc_root_mutex);
2162 seqlock_init(&fs_info->profiles_lock); 2211 seqlock_init(&fs_info->profiles_lock);
2163 2212
2164 init_completion(&fs_info->kobj_unregister); 2213 init_completion(&fs_info->kobj_unregister);
@@ -2211,6 +2260,7 @@ int open_ctree(struct super_block *sb,
2211 atomic_set(&fs_info->scrub_pause_req, 0); 2260 atomic_set(&fs_info->scrub_pause_req, 0);
2212 atomic_set(&fs_info->scrubs_paused, 0); 2261 atomic_set(&fs_info->scrubs_paused, 0);
2213 atomic_set(&fs_info->scrub_cancel_req, 0); 2262 atomic_set(&fs_info->scrub_cancel_req, 0);
2263 init_waitqueue_head(&fs_info->replace_wait);
2214 init_waitqueue_head(&fs_info->scrub_pause_wait); 2264 init_waitqueue_head(&fs_info->scrub_pause_wait);
2215 fs_info->scrub_workers_refcnt = 0; 2265 fs_info->scrub_workers_refcnt = 0;
2216#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 2266#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
@@ -2458,104 +2508,68 @@ int open_ctree(struct super_block *sb,
2458 goto fail_alloc; 2508 goto fail_alloc;
2459 } 2509 }
2460 2510
2461 btrfs_init_workers(&fs_info->generic_worker, 2511 max_active = fs_info->thread_pool_size;
2462 "genwork", 1, NULL);
2463
2464 btrfs_init_workers(&fs_info->workers, "worker",
2465 fs_info->thread_pool_size,
2466 &fs_info->generic_worker);
2467 2512
2468 btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", 2513 fs_info->workers =
2469 fs_info->thread_pool_size, NULL); 2514 btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
2515 max_active, 16);
2470 2516
2471 btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc", 2517 fs_info->delalloc_workers =
2472 fs_info->thread_pool_size, NULL); 2518 btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
2473 2519
2474 btrfs_init_workers(&fs_info->submit_workers, "submit", 2520 fs_info->flush_workers =
2475 min_t(u64, fs_devices->num_devices, 2521 btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
2476 fs_info->thread_pool_size), NULL);
2477 2522
2478 btrfs_init_workers(&fs_info->caching_workers, "cache", 2523 fs_info->caching_workers =
2479 fs_info->thread_pool_size, NULL); 2524 btrfs_alloc_workqueue("cache", flags, max_active, 0);
2480 2525
2481 /* a higher idle thresh on the submit workers makes it much more 2526 /*
2527 * a higher idle thresh on the submit workers makes it much more
2482 * likely that bios will be sent down in a sane order to the 2528 * likely that bios will be sent down in a sane order to the
2483 * devices 2529 * devices
2484 */ 2530 */
2485 fs_info->submit_workers.idle_thresh = 64; 2531 fs_info->submit_workers =
2486 2532 btrfs_alloc_workqueue("submit", flags,
2487 fs_info->workers.idle_thresh = 16; 2533 min_t(u64, fs_devices->num_devices,
2488 fs_info->workers.ordered = 1; 2534 max_active), 64);
2489 2535
2490 fs_info->delalloc_workers.idle_thresh = 2; 2536 fs_info->fixup_workers =
2491 fs_info->delalloc_workers.ordered = 1; 2537 btrfs_alloc_workqueue("fixup", flags, 1, 0);
2492
2493 btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
2494 &fs_info->generic_worker);
2495 btrfs_init_workers(&fs_info->endio_workers, "endio",
2496 fs_info->thread_pool_size,
2497 &fs_info->generic_worker);
2498 btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
2499 fs_info->thread_pool_size,
2500 &fs_info->generic_worker);
2501 btrfs_init_workers(&fs_info->endio_meta_write_workers,
2502 "endio-meta-write", fs_info->thread_pool_size,
2503 &fs_info->generic_worker);
2504 btrfs_init_workers(&fs_info->endio_raid56_workers,
2505 "endio-raid56", fs_info->thread_pool_size,
2506 &fs_info->generic_worker);
2507 btrfs_init_workers(&fs_info->rmw_workers,
2508 "rmw", fs_info->thread_pool_size,
2509 &fs_info->generic_worker);
2510 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
2511 fs_info->thread_pool_size,
2512 &fs_info->generic_worker);
2513 btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
2514 1, &fs_info->generic_worker);
2515 btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
2516 fs_info->thread_pool_size,
2517 &fs_info->generic_worker);
2518 btrfs_init_workers(&fs_info->readahead_workers, "readahead",
2519 fs_info->thread_pool_size,
2520 &fs_info->generic_worker);
2521 btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1,
2522 &fs_info->generic_worker);
2523 2538
2524 /* 2539 /*
2525 * endios are largely parallel and should have a very 2540 * endios are largely parallel and should have a very
2526 * low idle thresh 2541 * low idle thresh
2527 */ 2542 */
2528 fs_info->endio_workers.idle_thresh = 4; 2543 fs_info->endio_workers =
2529 fs_info->endio_meta_workers.idle_thresh = 4; 2544 btrfs_alloc_workqueue("endio", flags, max_active, 4);
2530 fs_info->endio_raid56_workers.idle_thresh = 4; 2545 fs_info->endio_meta_workers =
2531 fs_info->rmw_workers.idle_thresh = 2; 2546 btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
2532 2547 fs_info->endio_meta_write_workers =
2533 fs_info->endio_write_workers.idle_thresh = 2; 2548 btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
2534 fs_info->endio_meta_write_workers.idle_thresh = 2; 2549 fs_info->endio_raid56_workers =
2535 fs_info->readahead_workers.idle_thresh = 2; 2550 btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
2536 2551 fs_info->rmw_workers =
2537 /* 2552 btrfs_alloc_workqueue("rmw", flags, max_active, 2);
2538 * btrfs_start_workers can really only fail because of ENOMEM so just 2553 fs_info->endio_write_workers =
2539 * return -ENOMEM if any of these fail. 2554 btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
2540 */ 2555 fs_info->endio_freespace_worker =
2541 ret = btrfs_start_workers(&fs_info->workers); 2556 btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
2542 ret |= btrfs_start_workers(&fs_info->generic_worker); 2557 fs_info->delayed_workers =
2543 ret |= btrfs_start_workers(&fs_info->submit_workers); 2558 btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
2544 ret |= btrfs_start_workers(&fs_info->delalloc_workers); 2559 fs_info->readahead_workers =
2545 ret |= btrfs_start_workers(&fs_info->fixup_workers); 2560 btrfs_alloc_workqueue("readahead", flags, max_active, 2);
2546 ret |= btrfs_start_workers(&fs_info->endio_workers); 2561 fs_info->qgroup_rescan_workers =
2547 ret |= btrfs_start_workers(&fs_info->endio_meta_workers); 2562 btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
2548 ret |= btrfs_start_workers(&fs_info->rmw_workers); 2563
2549 ret |= btrfs_start_workers(&fs_info->endio_raid56_workers); 2564 if (!(fs_info->workers && fs_info->delalloc_workers &&
2550 ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); 2565 fs_info->submit_workers && fs_info->flush_workers &&
2551 ret |= btrfs_start_workers(&fs_info->endio_write_workers); 2566 fs_info->endio_workers && fs_info->endio_meta_workers &&
2552 ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); 2567 fs_info->endio_meta_write_workers &&
2553 ret |= btrfs_start_workers(&fs_info->delayed_workers); 2568 fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
2554 ret |= btrfs_start_workers(&fs_info->caching_workers); 2569 fs_info->endio_freespace_worker && fs_info->rmw_workers &&
2555 ret |= btrfs_start_workers(&fs_info->readahead_workers); 2570 fs_info->caching_workers && fs_info->readahead_workers &&
2556 ret |= btrfs_start_workers(&fs_info->flush_workers); 2571 fs_info->fixup_workers && fs_info->delayed_workers &&
2557 ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers); 2572 fs_info->qgroup_rescan_workers)) {
2558 if (ret) {
2559 err = -ENOMEM; 2573 err = -ENOMEM;
2560 goto fail_sb_buffer; 2574 goto fail_sb_buffer;
2561 } 2575 }
@@ -2963,6 +2977,8 @@ fail_iput:
2963 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2977 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2964 2978
2965 iput(fs_info->btree_inode); 2979 iput(fs_info->btree_inode);
2980fail_bio_counter:
2981 percpu_counter_destroy(&fs_info->bio_counter);
2966fail_delalloc_bytes: 2982fail_delalloc_bytes:
2967 percpu_counter_destroy(&fs_info->delalloc_bytes); 2983 percpu_counter_destroy(&fs_info->delalloc_bytes);
2968fail_dirty_metadata_bytes: 2984fail_dirty_metadata_bytes:
@@ -3244,6 +3260,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
3244 /* send down all the barriers */ 3260 /* send down all the barriers */
3245 head = &info->fs_devices->devices; 3261 head = &info->fs_devices->devices;
3246 list_for_each_entry_rcu(dev, head, dev_list) { 3262 list_for_each_entry_rcu(dev, head, dev_list) {
3263 if (dev->missing)
3264 continue;
3247 if (!dev->bdev) { 3265 if (!dev->bdev) {
3248 errors_send++; 3266 errors_send++;
3249 continue; 3267 continue;
@@ -3258,6 +3276,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
3258 3276
3259 /* wait for all the barriers */ 3277 /* wait for all the barriers */
3260 list_for_each_entry_rcu(dev, head, dev_list) { 3278 list_for_each_entry_rcu(dev, head, dev_list) {
3279 if (dev->missing)
3280 continue;
3261 if (!dev->bdev) { 3281 if (!dev->bdev) {
3262 errors_wait++; 3282 errors_wait++;
3263 continue; 3283 continue;
@@ -3477,6 +3497,8 @@ static void free_fs_root(struct btrfs_root *root)
3477 root->orphan_block_rsv = NULL; 3497 root->orphan_block_rsv = NULL;
3478 if (root->anon_dev) 3498 if (root->anon_dev)
3479 free_anon_bdev(root->anon_dev); 3499 free_anon_bdev(root->anon_dev);
3500 if (root->subv_writers)
3501 btrfs_free_subvolume_writers(root->subv_writers);
3480 free_extent_buffer(root->node); 3502 free_extent_buffer(root->node);
3481 free_extent_buffer(root->commit_root); 3503 free_extent_buffer(root->commit_root);
3482 kfree(root->free_ino_ctl); 3504 kfree(root->free_ino_ctl);
@@ -3610,6 +3632,7 @@ int close_ctree(struct btrfs_root *root)
3610 3632
3611 percpu_counter_destroy(&fs_info->dirty_metadata_bytes); 3633 percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
3612 percpu_counter_destroy(&fs_info->delalloc_bytes); 3634 percpu_counter_destroy(&fs_info->delalloc_bytes);
3635 percpu_counter_destroy(&fs_info->bio_counter);
3613 bdi_destroy(&fs_info->bdi); 3636 bdi_destroy(&fs_info->bdi);
3614 cleanup_srcu_struct(&fs_info->subvol_srcu); 3637 cleanup_srcu_struct(&fs_info->subvol_srcu);
3615 3638
@@ -3791,9 +3814,11 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
3791 list_move_tail(&root->ordered_root, 3814 list_move_tail(&root->ordered_root,
3792 &fs_info->ordered_roots); 3815 &fs_info->ordered_roots);
3793 3816
3817 spin_unlock(&fs_info->ordered_root_lock);
3794 btrfs_destroy_ordered_extents(root); 3818 btrfs_destroy_ordered_extents(root);
3795 3819
3796 cond_resched_lock(&fs_info->ordered_root_lock); 3820 cond_resched();
3821 spin_lock(&fs_info->ordered_root_lock);
3797 } 3822 }
3798 spin_unlock(&fs_info->ordered_root_lock); 3823 spin_unlock(&fs_info->ordered_root_lock);
3799} 3824}
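Among the disk-io.c changes, open_ctree() gains a percpu bio_counter with a matching fail_bio_counter unwind label, and the workqueue setup now allocates every queue up front and checks the whole set for NULL before continuing. The error handling follows the usual kernel unwind-ladder idiom; below is a compact userspace sketch of that idiom, with malloc() standing in for percpu_counter_init() and new_inode() (everything here is illustrative):

#include <errno.h>
#include <stdlib.h>

static int open_ctree_like(void)
{
	void *delalloc_bytes, *bio_counter, *btree_inode;
	int err = 0;

	delalloc_bytes = malloc(64);		/* percpu_counter_init() */
	if (!delalloc_bytes)
		return -ENOMEM;

	bio_counter = malloc(64);		/* percpu_counter_init() */
	if (!bio_counter) {
		err = -ENOMEM;
		goto fail_delalloc_bytes;
	}

	btree_inode = malloc(64);		/* new_inode() */
	if (!btree_inode) {
		err = -ENOMEM;
		goto fail_bio_counter;		/* unwinds bio_counter too */
	}

	/* ... remaining setup would continue here ... */
	free(btree_inode);
	free(bio_counter);
	free(delalloc_bytes);
	return 0;

fail_bio_counter:
	free(bio_counter);
fail_delalloc_bytes:
	free(delalloc_bytes);
	return err;
}

int main(void)
{
	return open_ctree_like() ? 1 : 0;
}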
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 32312e09f0f5..c6b6a6e3e735 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -549,7 +549,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
549 caching_ctl->block_group = cache; 549 caching_ctl->block_group = cache;
550 caching_ctl->progress = cache->key.objectid; 550 caching_ctl->progress = cache->key.objectid;
551 atomic_set(&caching_ctl->count, 1); 551 atomic_set(&caching_ctl->count, 1);
552 caching_ctl->work.func = caching_thread; 552 btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
553 553
554 spin_lock(&cache->lock); 554 spin_lock(&cache->lock);
555 /* 555 /*
@@ -640,7 +640,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
640 640
641 btrfs_get_block_group(cache); 641 btrfs_get_block_group(cache);
642 642
643 btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work); 643 btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
644 644
645 return ret; 645 return ret;
646} 646}
@@ -3971,7 +3971,7 @@ static int can_overcommit(struct btrfs_root *root,
3971} 3971}
3972 3972
3973static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, 3973static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3974 unsigned long nr_pages) 3974 unsigned long nr_pages, int nr_items)
3975{ 3975{
3976 struct super_block *sb = root->fs_info->sb; 3976 struct super_block *sb = root->fs_info->sb;
3977 3977
@@ -3986,9 +3986,9 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3986 * the filesystem is readonly(all dirty pages are written to 3986 * the filesystem is readonly(all dirty pages are written to
3987 * the disk). 3987 * the disk).
3988 */ 3988 */
3989 btrfs_start_delalloc_roots(root->fs_info, 0); 3989 btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
3990 if (!current->journal_info) 3990 if (!current->journal_info)
3991 btrfs_wait_ordered_roots(root->fs_info, -1); 3991 btrfs_wait_ordered_roots(root->fs_info, nr_items);
3992 } 3992 }
3993} 3993}
3994 3994
@@ -4045,7 +4045,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
4045 while (delalloc_bytes && loops < 3) { 4045 while (delalloc_bytes && loops < 3) {
4046 max_reclaim = min(delalloc_bytes, to_reclaim); 4046 max_reclaim = min(delalloc_bytes, to_reclaim);
4047 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 4047 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
4048 btrfs_writeback_inodes_sb_nr(root, nr_pages); 4048 btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
4049 /* 4049 /*
4050 * We need to wait for the async pages to actually start before 4050 * We need to wait for the async pages to actually start before
4051 * we do anything. 4051 * we do anything.
@@ -4112,13 +4112,9 @@ static int may_commit_transaction(struct btrfs_root *root,
4112 goto commit; 4112 goto commit;
4113 4113
4114 /* See if there is enough pinned space to make this reservation */ 4114 /* See if there is enough pinned space to make this reservation */
4115 spin_lock(&space_info->lock);
4116 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4115 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4117 bytes) >= 0) { 4116 bytes) >= 0)
4118 spin_unlock(&space_info->lock);
4119 goto commit; 4117 goto commit;
4120 }
4121 spin_unlock(&space_info->lock);
4122 4118
4123 /* 4119 /*
4124 * See if there is some space in the delayed insertion reservation for 4120 * See if there is some space in the delayed insertion reservation for
@@ -4127,16 +4123,13 @@ static int may_commit_transaction(struct btrfs_root *root,
4127 if (space_info != delayed_rsv->space_info) 4123 if (space_info != delayed_rsv->space_info)
4128 return -ENOSPC; 4124 return -ENOSPC;
4129 4125
4130 spin_lock(&space_info->lock);
4131 spin_lock(&delayed_rsv->lock); 4126 spin_lock(&delayed_rsv->lock);
4132 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4127 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4133 bytes - delayed_rsv->size) >= 0) { 4128 bytes - delayed_rsv->size) >= 0) {
4134 spin_unlock(&delayed_rsv->lock); 4129 spin_unlock(&delayed_rsv->lock);
4135 spin_unlock(&space_info->lock);
4136 return -ENOSPC; 4130 return -ENOSPC;
4137 } 4131 }
4138 spin_unlock(&delayed_rsv->lock); 4132 spin_unlock(&delayed_rsv->lock);
4139 spin_unlock(&space_info->lock);
4140 4133
4141commit: 4134commit:
4142 trans = btrfs_join_transaction(root); 4135 trans = btrfs_join_transaction(root);
@@ -4181,7 +4174,7 @@ static int flush_space(struct btrfs_root *root,
4181 break; 4174 break;
4182 case FLUSH_DELALLOC: 4175 case FLUSH_DELALLOC:
4183 case FLUSH_DELALLOC_WAIT: 4176 case FLUSH_DELALLOC_WAIT:
4184 shrink_delalloc(root, num_bytes, orig_bytes, 4177 shrink_delalloc(root, num_bytes * 2, orig_bytes,
4185 state == FLUSH_DELALLOC_WAIT); 4178 state == FLUSH_DELALLOC_WAIT);
4186 break; 4179 break;
4187 case ALLOC_CHUNK: 4180 case ALLOC_CHUNK:
@@ -8938,3 +8931,38 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8938 range->len = trimmed; 8931 range->len = trimmed;
8939 return ret; 8932 return ret;
8940} 8933}
8934
8935/*
8936 * btrfs_{start, end}_nocow_write() are similar to mnt_{want, drop}_write():
8937 * they are used to prevent tasks from writing data into the page cache
8938 * via nocow before the subvolume is snapshotted, but to flush that data
8939 * to disk after the snapshot creation.
8940 */
8941void btrfs_end_nocow_write(struct btrfs_root *root)
8942{
8943 percpu_counter_dec(&root->subv_writers->counter);
8944 /*
8945 * Make sure counter is updated before we wake up
8946 * waiters.
8947 */
8948 smp_mb();
8949 if (waitqueue_active(&root->subv_writers->wait))
8950 wake_up(&root->subv_writers->wait);
8951}
8952
8953int btrfs_start_nocow_write(struct btrfs_root *root)
8954{
8955 if (unlikely(atomic_read(&root->will_be_snapshoted)))
8956 return 0;
8957
8958 percpu_counter_inc(&root->subv_writers->counter);
8959 /*
8960 * Make sure counter is updated before we check for snapshot creation.
8961 */
8962 smp_mb();
8963 if (unlikely(atomic_read(&root->will_be_snapshoted))) {
8964 btrfs_end_nocow_write(root);
8965 return 0;
8966 }
8967 return 1;
8968}
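btrfs_start_nocow_write()/btrfs_end_nocow_write() pair the per-subvolume counter with the new will_be_snapshoted flag: a writer only keeps its reference if no snapshot has been announced, and it re-checks the flag after bumping the counter so the snapshot side cannot miss it. A userspace analog of the same handshake, using a plain mutex/condvar in place of the percpu counter and wait queue; the snapshot side, which sets the flag and waits for the counter to drain, is not part of this hunk and is only described in comments:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
static long writers;			/* analog of subv_writers->counter */
static atomic_bool will_be_snapshoted;	/* set by the (omitted) snapshot side */

static void end_nocow_write(void)
{
	pthread_mutex_lock(&lock);
	if (--writers == 0)
		pthread_cond_broadcast(&drained);	/* wake the snapshot side */
	pthread_mutex_unlock(&lock);
}

/* Returns 1 when the caller may do a nocow write, 0 when it must fall back. */
static int start_nocow_write(void)
{
	if (atomic_load(&will_be_snapshoted))
		return 0;

	pthread_mutex_lock(&lock);
	writers++;
	pthread_mutex_unlock(&lock);

	/* re-check: a snapshot may have been announced while we incremented */
	if (atomic_load(&will_be_snapshoted)) {
		end_nocow_write();
		return 0;
	}
	return 1;
}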
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 85bbd01f1271..ae69a00387e7 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -229,12 +229,14 @@ void free_extent_state(struct extent_state *state)
229 } 229 }
230} 230}
231 231
232static struct rb_node *tree_insert(struct rb_root *root, u64 offset, 232static struct rb_node *tree_insert(struct rb_root *root,
233 struct rb_node *search_start,
234 u64 offset,
233 struct rb_node *node, 235 struct rb_node *node,
234 struct rb_node ***p_in, 236 struct rb_node ***p_in,
235 struct rb_node **parent_in) 237 struct rb_node **parent_in)
236{ 238{
237 struct rb_node **p = &root->rb_node; 239 struct rb_node **p;
238 struct rb_node *parent = NULL; 240 struct rb_node *parent = NULL;
239 struct tree_entry *entry; 241 struct tree_entry *entry;
240 242
@@ -244,6 +246,7 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
244 goto do_insert; 246 goto do_insert;
245 } 247 }
246 248
249 p = search_start ? &search_start : &root->rb_node;
247 while (*p) { 250 while (*p) {
248 parent = *p; 251 parent = *p;
249 entry = rb_entry(parent, struct tree_entry, rb_node); 252 entry = rb_entry(parent, struct tree_entry, rb_node);
@@ -430,7 +433,7 @@ static int insert_state(struct extent_io_tree *tree,
430 433
431 set_state_bits(tree, state, bits); 434 set_state_bits(tree, state, bits);
432 435
433 node = tree_insert(&tree->state, end, &state->rb_node, p, parent); 436 node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
434 if (node) { 437 if (node) {
435 struct extent_state *found; 438 struct extent_state *found;
436 found = rb_entry(node, struct extent_state, rb_node); 439 found = rb_entry(node, struct extent_state, rb_node);
@@ -477,8 +480,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
477 prealloc->state = orig->state; 480 prealloc->state = orig->state;
478 orig->start = split; 481 orig->start = split;
479 482
480 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node, 483 node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
481 NULL, NULL); 484 &prealloc->rb_node, NULL, NULL);
482 if (node) { 485 if (node) {
483 free_extent_state(prealloc); 486 free_extent_state(prealloc);
484 return -EEXIST; 487 return -EEXIST;
@@ -2757,7 +2760,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
2757 2760
2758 if (em_cached && *em_cached) { 2761 if (em_cached && *em_cached) {
2759 em = *em_cached; 2762 em = *em_cached;
2760 if (em->in_tree && start >= em->start && 2763 if (extent_map_in_tree(em) && start >= em->start &&
2761 start < extent_map_end(em)) { 2764 start < extent_map_end(em)) {
2762 atomic_inc(&em->refs); 2765 atomic_inc(&em->refs);
2763 return em; 2766 return em;
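tree_insert() in extent_io.c now accepts an optional search_start node: split_state() passes the rb_node of the state it is splitting, so the insert can start its descent right next to the final position instead of walking down from the root of the tree again. The toy program below illustrates the same "insert with a positional hint" idea on a sorted linked list; the data structure and names are illustrative only:

#include <stdio.h>
#include <stdlib.h>

struct node { unsigned long end; struct node *next; };

static void insert_after_hint(struct node *hint, struct node *new)
{
	struct node *cur = hint;

	while (cur->next && cur->next->end < new->end)
		cur = cur->next;	/* usually zero or one step from the hint */
	new->next = cur->next;
	cur->next = new;
}

int main(void)
{
	struct node a = { 10, NULL }, b = { 30, NULL }, c = { 20, NULL };

	a.next = &b;
	insert_after_hint(&a, &c);	/* 'a' plays the role of the hint */
	for (struct node *n = &a; n; n = n->next)
		printf("%lu\n", n->end);
	return 0;
}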
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 996ad56b57db..1874aee69c86 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -51,7 +51,7 @@ struct extent_map *alloc_extent_map(void)
51 em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS); 51 em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
52 if (!em) 52 if (!em)
53 return NULL; 53 return NULL;
54 em->in_tree = 0; 54 RB_CLEAR_NODE(&em->rb_node);
55 em->flags = 0; 55 em->flags = 0;
56 em->compress_type = BTRFS_COMPRESS_NONE; 56 em->compress_type = BTRFS_COMPRESS_NONE;
57 em->generation = 0; 57 em->generation = 0;
@@ -73,7 +73,7 @@ void free_extent_map(struct extent_map *em)
73 return; 73 return;
74 WARN_ON(atomic_read(&em->refs) == 0); 74 WARN_ON(atomic_read(&em->refs) == 0);
75 if (atomic_dec_and_test(&em->refs)) { 75 if (atomic_dec_and_test(&em->refs)) {
76 WARN_ON(em->in_tree); 76 WARN_ON(extent_map_in_tree(em));
77 WARN_ON(!list_empty(&em->list)); 77 WARN_ON(!list_empty(&em->list));
78 kmem_cache_free(extent_map_cache, em); 78 kmem_cache_free(extent_map_cache, em);
79 } 79 }
@@ -99,8 +99,6 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
99 parent = *p; 99 parent = *p;
100 entry = rb_entry(parent, struct extent_map, rb_node); 100 entry = rb_entry(parent, struct extent_map, rb_node);
101 101
102 WARN_ON(!entry->in_tree);
103
104 if (em->start < entry->start) 102 if (em->start < entry->start)
105 p = &(*p)->rb_left; 103 p = &(*p)->rb_left;
106 else if (em->start >= extent_map_end(entry)) 104 else if (em->start >= extent_map_end(entry))
@@ -128,7 +126,6 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
128 if (end > entry->start && em->start < extent_map_end(entry)) 126 if (end > entry->start && em->start < extent_map_end(entry))
129 return -EEXIST; 127 return -EEXIST;
130 128
131 em->in_tree = 1;
132 rb_link_node(&em->rb_node, orig_parent, p); 129 rb_link_node(&em->rb_node, orig_parent, p);
133 rb_insert_color(&em->rb_node, root); 130 rb_insert_color(&em->rb_node, root);
134 return 0; 131 return 0;
@@ -153,8 +150,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
153 prev = n; 150 prev = n;
154 prev_entry = entry; 151 prev_entry = entry;
155 152
156 WARN_ON(!entry->in_tree);
157
158 if (offset < entry->start) 153 if (offset < entry->start)
159 n = n->rb_left; 154 n = n->rb_left;
160 else if (offset >= extent_map_end(entry)) 155 else if (offset >= extent_map_end(entry))
@@ -240,12 +235,12 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
240 em->len += merge->len; 235 em->len += merge->len;
241 em->block_len += merge->block_len; 236 em->block_len += merge->block_len;
242 em->block_start = merge->block_start; 237 em->block_start = merge->block_start;
243 merge->in_tree = 0;
244 em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; 238 em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
245 em->mod_start = merge->mod_start; 239 em->mod_start = merge->mod_start;
246 em->generation = max(em->generation, merge->generation); 240 em->generation = max(em->generation, merge->generation);
247 241
248 rb_erase(&merge->rb_node, &tree->map); 242 rb_erase(&merge->rb_node, &tree->map);
243 RB_CLEAR_NODE(&merge->rb_node);
249 free_extent_map(merge); 244 free_extent_map(merge);
250 } 245 }
251 } 246 }
@@ -257,7 +252,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
257 em->len += merge->len; 252 em->len += merge->len;
258 em->block_len += merge->block_len; 253 em->block_len += merge->block_len;
259 rb_erase(&merge->rb_node, &tree->map); 254 rb_erase(&merge->rb_node, &tree->map);
260 merge->in_tree = 0; 255 RB_CLEAR_NODE(&merge->rb_node);
261 em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; 256 em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
262 em->generation = max(em->generation, merge->generation); 257 em->generation = max(em->generation, merge->generation);
263 free_extent_map(merge); 258 free_extent_map(merge);
@@ -319,7 +314,21 @@ out:
319void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) 314void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
320{ 315{
321 clear_bit(EXTENT_FLAG_LOGGING, &em->flags); 316 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
322 if (em->in_tree) 317 if (extent_map_in_tree(em))
318 try_merge_map(tree, em);
319}
320
321static inline void setup_extent_mapping(struct extent_map_tree *tree,
322 struct extent_map *em,
323 int modified)
324{
325 atomic_inc(&em->refs);
326 em->mod_start = em->start;
327 em->mod_len = em->len;
328
329 if (modified)
330 list_move(&em->list, &tree->modified_extents);
331 else
323 try_merge_map(tree, em); 332 try_merge_map(tree, em);
324} 333}
325 334
@@ -342,15 +351,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
342 if (ret) 351 if (ret)
343 goto out; 352 goto out;
344 353
345 atomic_inc(&em->refs); 354 setup_extent_mapping(tree, em, modified);
346
347 em->mod_start = em->start;
348 em->mod_len = em->len;
349
350 if (modified)
351 list_move(&em->list, &tree->modified_extents);
352 else
353 try_merge_map(tree, em);
354out: 355out:
355 return ret; 356 return ret;
356} 357}
@@ -434,6 +435,21 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
434 rb_erase(&em->rb_node, &tree->map); 435 rb_erase(&em->rb_node, &tree->map);
435 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) 436 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
436 list_del_init(&em->list); 437 list_del_init(&em->list);
437 em->in_tree = 0; 438 RB_CLEAR_NODE(&em->rb_node);
438 return ret; 439 return ret;
439} 440}
441
442void replace_extent_mapping(struct extent_map_tree *tree,
443 struct extent_map *cur,
444 struct extent_map *new,
445 int modified)
446{
447 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
448 ASSERT(extent_map_in_tree(cur));
449 if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
450 list_del_init(&cur->list);
451 rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map);
452 RB_CLEAR_NODE(&cur->rb_node);
453
454 setup_extent_mapping(tree, new, modified);
455}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 93fba716d7f8..e7fd8a56a140 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -33,7 +33,6 @@ struct extent_map {
33 unsigned long flags; 33 unsigned long flags;
34 struct block_device *bdev; 34 struct block_device *bdev;
35 atomic_t refs; 35 atomic_t refs;
36 unsigned int in_tree;
37 unsigned int compress_type; 36 unsigned int compress_type;
38 struct list_head list; 37 struct list_head list;
39}; 38};
@@ -44,6 +43,11 @@ struct extent_map_tree {
44 rwlock_t lock; 43 rwlock_t lock;
45}; 44};
46 45
46static inline int extent_map_in_tree(const struct extent_map *em)
47{
48 return !RB_EMPTY_NODE(&em->rb_node);
49}
50
47static inline u64 extent_map_end(struct extent_map *em) 51static inline u64 extent_map_end(struct extent_map *em)
48{ 52{
49 if (em->start + em->len < em->start) 53 if (em->start + em->len < em->start)
@@ -64,6 +68,10 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
64int add_extent_mapping(struct extent_map_tree *tree, 68int add_extent_mapping(struct extent_map_tree *tree,
65 struct extent_map *em, int modified); 69 struct extent_map *em, int modified);
66int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); 70int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
71void replace_extent_mapping(struct extent_map_tree *tree,
72 struct extent_map *cur,
73 struct extent_map *new,
74 int modified);
67 75
68struct extent_map *alloc_extent_map(void); 76struct extent_map *alloc_extent_map(void);
69void free_extent_map(struct extent_map *em); 77void free_extent_map(struct extent_map *em);
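extent_map.c/h drop the separate in_tree flag: membership is now derived from the rb_node itself, with RB_CLEAR_NODE() marking a node as unlinked and RB_EMPTY_NODE() (wrapped by the new extent_map_in_tree() helper) testing it, so the flag can never drift out of sync with the tree. A small standalone sketch of the underlying idiom, where an unlinked node points its parent link at itself:

#include <assert.h>
#include <stddef.h>

struct map_node {
	struct map_node *parent;
};

static void node_clear(struct map_node *n)	/* analog of RB_CLEAR_NODE() */
{
	n->parent = n;
}

static int node_in_tree(const struct map_node *n) /* analog of !RB_EMPTY_NODE() */
{
	return n->parent != n;
}

static void node_link(struct map_node *n, struct map_node *parent)
{
	n->parent = parent;		/* linking gives it a real parent */
}

int main(void)
{
	struct map_node root, child;

	node_clear(&root);
	node_clear(&child);
	assert(!node_in_tree(&child));
	node_link(&child, &root);
	assert(node_in_tree(&child));
	node_clear(&child);		/* analog of rb_erase() + RB_CLEAR_NODE() */
	assert(!node_in_tree(&child));
	return 0;
}

The new replace_extent_mapping() relies on the same invariant: it calls RB_CLEAR_NODE() on the node it takes out, so later extent_map_in_tree() checks stay correct.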
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 7331a230e30b..e1ffb1e22898 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -591,7 +591,6 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
591 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 591 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
592 clear_bit(EXTENT_FLAG_LOGGING, &flags); 592 clear_bit(EXTENT_FLAG_LOGGING, &flags);
593 modified = !list_empty(&em->list); 593 modified = !list_empty(&em->list);
594 remove_extent_mapping(em_tree, em);
595 if (no_splits) 594 if (no_splits)
596 goto next; 595 goto next;
597 596
@@ -622,8 +621,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
622 split->bdev = em->bdev; 621 split->bdev = em->bdev;
623 split->flags = flags; 622 split->flags = flags;
624 split->compress_type = em->compress_type; 623 split->compress_type = em->compress_type;
625 ret = add_extent_mapping(em_tree, split, modified); 624 replace_extent_mapping(em_tree, em, split, modified);
626 BUG_ON(ret); /* Logic error */
627 free_extent_map(split); 625 free_extent_map(split);
628 split = split2; 626 split = split2;
629 split2 = NULL; 627 split2 = NULL;
@@ -661,12 +659,20 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
661 split->orig_block_len = 0; 659 split->orig_block_len = 0;
662 } 660 }
663 661
664 ret = add_extent_mapping(em_tree, split, modified); 662 if (extent_map_in_tree(em)) {
665 BUG_ON(ret); /* Logic error */ 663 replace_extent_mapping(em_tree, em, split,
664 modified);
665 } else {
666 ret = add_extent_mapping(em_tree, split,
667 modified);
668 ASSERT(ret == 0); /* Logic error */
669 }
666 free_extent_map(split); 670 free_extent_map(split);
667 split = NULL; 671 split = NULL;
668 } 672 }
669next: 673next:
674 if (extent_map_in_tree(em))
675 remove_extent_mapping(em_tree, em);
670 write_unlock(&em_tree->lock); 676 write_unlock(&em_tree->lock);
671 677
672 /* once for us */ 678 /* once for us */
@@ -720,7 +726,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
720 if (drop_cache) 726 if (drop_cache)
721 btrfs_drop_extent_cache(inode, start, end - 1, 0); 727 btrfs_drop_extent_cache(inode, start, end - 1, 0);
722 728
723 if (start >= BTRFS_I(inode)->disk_i_size) 729 if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
724 modify_tree = 0; 730 modify_tree = 0;
725 731
726 while (1) { 732 while (1) {
@@ -798,7 +804,10 @@ next_slot:
798 */ 804 */
799 if (start > key.offset && end < extent_end) { 805 if (start > key.offset && end < extent_end) {
800 BUG_ON(del_nr > 0); 806 BUG_ON(del_nr > 0);
801 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 807 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
808 ret = -EINVAL;
809 break;
810 }
802 811
803 memcpy(&new_key, &key, sizeof(new_key)); 812 memcpy(&new_key, &key, sizeof(new_key));
804 new_key.offset = start; 813 new_key.offset = start;
@@ -841,7 +850,10 @@ next_slot:
841 * | -------- extent -------- | 850 * | -------- extent -------- |
842 */ 851 */
843 if (start <= key.offset && end < extent_end) { 852 if (start <= key.offset && end < extent_end) {
844 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 853 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
854 ret = -EINVAL;
855 break;
856 }
845 857
846 memcpy(&new_key, &key, sizeof(new_key)); 858 memcpy(&new_key, &key, sizeof(new_key));
847 new_key.offset = end; 859 new_key.offset = end;
@@ -864,7 +876,10 @@ next_slot:
864 */ 876 */
865 if (start > key.offset && end >= extent_end) { 877 if (start > key.offset && end >= extent_end) {
866 BUG_ON(del_nr > 0); 878 BUG_ON(del_nr > 0);
867 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 879 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
880 ret = -EINVAL;
881 break;
882 }
868 883
869 btrfs_set_file_extent_num_bytes(leaf, fi, 884 btrfs_set_file_extent_num_bytes(leaf, fi,
870 start - key.offset); 885 start - key.offset);
@@ -938,34 +953,42 @@ next_slot:
938 * Set path->slots[0] to first slot, so that after the delete 953 * Set path->slots[0] to first slot, so that after the delete
939 * if items are move off from our leaf to its immediate left or 954 * if items are move off from our leaf to its immediate left or
940 * right neighbor leafs, we end up with a correct and adjusted 955 * right neighbor leafs, we end up with a correct and adjusted
941 * path->slots[0] for our insertion. 956 * path->slots[0] for our insertion (if replace_extent != 0).
942 */ 957 */
943 path->slots[0] = del_slot; 958 path->slots[0] = del_slot;
944 ret = btrfs_del_items(trans, root, path, del_slot, del_nr); 959 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
945 if (ret) 960 if (ret)
946 btrfs_abort_transaction(trans, root, ret); 961 btrfs_abort_transaction(trans, root, ret);
962 }
947 963
948 leaf = path->nodes[0]; 964 leaf = path->nodes[0];
949 /* 965 /*
950 * leaf eb has flag EXTENT_BUFFER_STALE if it was deleted (that 966 * If btrfs_del_items() was called, it might have deleted a leaf, in
951 * is, its contents got pushed to its neighbors), in which case 967 * which case it unlocked our path, so check path->locks[0] matches a
952 * it means path->locks[0] == 0 968 * write lock.
953 */ 969 */
954 if (!ret && replace_extent && leafs_visited == 1 && 970 if (!ret && replace_extent && leafs_visited == 1 &&
955 path->locks[0] && 971 (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
956 btrfs_leaf_free_space(root, leaf) >= 972 path->locks[0] == BTRFS_WRITE_LOCK) &&
957 sizeof(struct btrfs_item) + extent_item_size) { 973 btrfs_leaf_free_space(root, leaf) >=
958 974 sizeof(struct btrfs_item) + extent_item_size) {
959 key.objectid = ino; 975
960 key.type = BTRFS_EXTENT_DATA_KEY; 976 key.objectid = ino;
961 key.offset = start; 977 key.type = BTRFS_EXTENT_DATA_KEY;
962 setup_items_for_insert(root, path, &key, 978 key.offset = start;
963 &extent_item_size, 979 if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
964 extent_item_size, 980 struct btrfs_key slot_key;
965 sizeof(struct btrfs_item) + 981
966 extent_item_size, 1); 982 btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
967 *key_inserted = 1; 983 if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
984 path->slots[0]++;
968 } 985 }
986 setup_items_for_insert(root, path, &key,
987 &extent_item_size,
988 extent_item_size,
989 sizeof(struct btrfs_item) +
990 extent_item_size, 1);
991 *key_inserted = 1;
969 } 992 }
970 993
971 if (!replace_extent || !(*key_inserted)) 994 if (!replace_extent || !(*key_inserted))
@@ -1346,11 +1369,11 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
1346 struct btrfs_ordered_extent *ordered; 1369 struct btrfs_ordered_extent *ordered;
1347 lock_extent_bits(&BTRFS_I(inode)->io_tree, 1370 lock_extent_bits(&BTRFS_I(inode)->io_tree,
1348 start_pos, last_pos, 0, cached_state); 1371 start_pos, last_pos, 0, cached_state);
1349 ordered = btrfs_lookup_first_ordered_extent(inode, last_pos); 1372 ordered = btrfs_lookup_ordered_range(inode, start_pos,
1373 last_pos - start_pos + 1);
1350 if (ordered && 1374 if (ordered &&
1351 ordered->file_offset + ordered->len > start_pos && 1375 ordered->file_offset + ordered->len > start_pos &&
1352 ordered->file_offset <= last_pos) { 1376 ordered->file_offset <= last_pos) {
1353 btrfs_put_ordered_extent(ordered);
1354 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 1377 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1355 start_pos, last_pos, 1378 start_pos, last_pos,
1356 cached_state, GFP_NOFS); 1379 cached_state, GFP_NOFS);
@@ -1358,12 +1381,9 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
1358 unlock_page(pages[i]); 1381 unlock_page(pages[i]);
1359 page_cache_release(pages[i]); 1382 page_cache_release(pages[i]);
1360 } 1383 }
1361 ret = btrfs_wait_ordered_range(inode, start_pos, 1384 btrfs_start_ordered_extent(inode, ordered, 1);
1362 last_pos - start_pos + 1); 1385 btrfs_put_ordered_extent(ordered);
1363 if (ret) 1386 return -EAGAIN;
1364 return ret;
1365 else
1366 return -EAGAIN;
1367 } 1387 }
1368 if (ordered) 1388 if (ordered)
1369 btrfs_put_ordered_extent(ordered); 1389 btrfs_put_ordered_extent(ordered);
@@ -1396,8 +1416,12 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1396 u64 num_bytes; 1416 u64 num_bytes;
1397 int ret; 1417 int ret;
1398 1418
1419 ret = btrfs_start_nocow_write(root);
1420 if (!ret)
1421 return -ENOSPC;
1422
1399 lockstart = round_down(pos, root->sectorsize); 1423 lockstart = round_down(pos, root->sectorsize);
1400 lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1; 1424 lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;
1401 1425
1402 while (1) { 1426 while (1) {
1403 lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); 1427 lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
@@ -1415,12 +1439,10 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1415 ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); 1439 ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
1416 if (ret <= 0) { 1440 if (ret <= 0) {
1417 ret = 0; 1441 ret = 0;
1442 btrfs_end_nocow_write(root);
1418 } else { 1443 } else {
1419 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, 1444 *write_bytes = min_t(size_t, *write_bytes ,
1420 EXTENT_DIRTY | EXTENT_DELALLOC | 1445 num_bytes - pos + lockstart);
1421 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
1422 NULL, GFP_NOFS);
1423 *write_bytes = min_t(size_t, *write_bytes, num_bytes);
1424 } 1446 }
1425 1447
1426 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); 1448 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
@@ -1510,6 +1532,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1510 if (!only_release_metadata) 1532 if (!only_release_metadata)
1511 btrfs_free_reserved_data_space(inode, 1533 btrfs_free_reserved_data_space(inode,
1512 reserve_bytes); 1534 reserve_bytes);
1535 else
1536 btrfs_end_nocow_write(root);
1513 break; 1537 break;
1514 } 1538 }
1515 1539
@@ -1598,6 +1622,9 @@ again:
1598 } 1622 }
1599 1623
1600 release_bytes = 0; 1624 release_bytes = 0;
1625 if (only_release_metadata)
1626 btrfs_end_nocow_write(root);
1627
1601 if (only_release_metadata && copied > 0) { 1628 if (only_release_metadata && copied > 0) {
1602 u64 lockstart = round_down(pos, root->sectorsize); 1629 u64 lockstart = round_down(pos, root->sectorsize);
1603 u64 lockend = lockstart + 1630 u64 lockend = lockstart +
@@ -1624,10 +1651,12 @@ again:
1624 kfree(pages); 1651 kfree(pages);
1625 1652
1626 if (release_bytes) { 1653 if (release_bytes) {
1627 if (only_release_metadata) 1654 if (only_release_metadata) {
1655 btrfs_end_nocow_write(root);
1628 btrfs_delalloc_release_metadata(inode, release_bytes); 1656 btrfs_delalloc_release_metadata(inode, release_bytes);
1629 else 1657 } else {
1630 btrfs_delalloc_release_space(inode, release_bytes); 1658 btrfs_delalloc_release_space(inode, release_bytes);
1659 }
1631 } 1660 }
1632 1661
1633 return num_written ? num_written : ret; 1662 return num_written ? num_written : ret;
@@ -1856,8 +1885,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1856 struct dentry *dentry = file->f_path.dentry; 1885 struct dentry *dentry = file->f_path.dentry;
1857 struct inode *inode = dentry->d_inode; 1886 struct inode *inode = dentry->d_inode;
1858 struct btrfs_root *root = BTRFS_I(inode)->root; 1887 struct btrfs_root *root = BTRFS_I(inode)->root;
1859 int ret = 0;
1860 struct btrfs_trans_handle *trans; 1888 struct btrfs_trans_handle *trans;
1889 struct btrfs_log_ctx ctx;
1890 int ret = 0;
1861 bool full_sync = 0; 1891 bool full_sync = 0;
1862 1892
1863 trace_btrfs_sync_file(file, datasync); 1893 trace_btrfs_sync_file(file, datasync);
@@ -1951,7 +1981,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1951 } 1981 }
1952 trans->sync = true; 1982 trans->sync = true;
1953 1983
1954 ret = btrfs_log_dentry_safe(trans, root, dentry); 1984 btrfs_init_log_ctx(&ctx);
1985
1986 ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
1955 if (ret < 0) { 1987 if (ret < 0) {
1956 /* Fallthrough and commit/free transaction. */ 1988 /* Fallthrough and commit/free transaction. */
1957 ret = 1; 1989 ret = 1;
@@ -1971,7 +2003,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1971 2003
1972 if (ret != BTRFS_NO_LOG_SYNC) { 2004 if (ret != BTRFS_NO_LOG_SYNC) {
1973 if (!ret) { 2005 if (!ret) {
1974 ret = btrfs_sync_log(trans, root); 2006 ret = btrfs_sync_log(trans, root, &ctx);
1975 if (!ret) { 2007 if (!ret) {
1976 ret = btrfs_end_transaction(trans, root); 2008 ret = btrfs_end_transaction(trans, root);
1977 goto out; 2009 goto out;
@@ -2157,6 +2189,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2157 bool same_page = ((offset >> PAGE_CACHE_SHIFT) == 2189 bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
2158 ((offset + len - 1) >> PAGE_CACHE_SHIFT)); 2190 ((offset + len - 1) >> PAGE_CACHE_SHIFT));
2159 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); 2191 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
2192 u64 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
2160 2193
2161 ret = btrfs_wait_ordered_range(inode, offset, len); 2194 ret = btrfs_wait_ordered_range(inode, offset, len);
2162 if (ret) 2195 if (ret)
@@ -2172,14 +2205,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2172 * entire page. 2205 * entire page.
2173 */ 2206 */
2174 if (same_page && len < PAGE_CACHE_SIZE) { 2207 if (same_page && len < PAGE_CACHE_SIZE) {
2175 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) 2208 if (offset < ino_size)
2176 ret = btrfs_truncate_page(inode, offset, len, 0); 2209 ret = btrfs_truncate_page(inode, offset, len, 0);
2177 mutex_unlock(&inode->i_mutex); 2210 mutex_unlock(&inode->i_mutex);
2178 return ret; 2211 return ret;
2179 } 2212 }
2180 2213
2181 /* zero back part of the first page */ 2214 /* zero back part of the first page */
2182 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) { 2215 if (offset < ino_size) {
2183 ret = btrfs_truncate_page(inode, offset, 0, 0); 2216 ret = btrfs_truncate_page(inode, offset, 0, 0);
2184 if (ret) { 2217 if (ret) {
2185 mutex_unlock(&inode->i_mutex); 2218 mutex_unlock(&inode->i_mutex);
@@ -2188,7 +2221,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2188 } 2221 }
2189 2222
2190 /* zero the front end of the last page */ 2223 /* zero the front end of the last page */
2191 if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) { 2224 if (offset + len < ino_size) {
2192 ret = btrfs_truncate_page(inode, offset + len, 0, 1); 2225 ret = btrfs_truncate_page(inode, offset + len, 0, 1);
2193 if (ret) { 2226 if (ret) {
2194 mutex_unlock(&inode->i_mutex); 2227 mutex_unlock(&inode->i_mutex);
@@ -2277,10 +2310,13 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2277 2310
2278 trans->block_rsv = &root->fs_info->trans_block_rsv; 2311 trans->block_rsv = &root->fs_info->trans_block_rsv;
2279 2312
2280 ret = fill_holes(trans, inode, path, cur_offset, drop_end); 2313 if (cur_offset < ino_size) {
2281 if (ret) { 2314 ret = fill_holes(trans, inode, path, cur_offset,
2282 err = ret; 2315 drop_end);
2283 break; 2316 if (ret) {
2317 err = ret;
2318 break;
2319 }
2284 } 2320 }
2285 2321
2286 cur_offset = drop_end; 2322 cur_offset = drop_end;
@@ -2313,10 +2349,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2313 } 2349 }
2314 2350
2315 trans->block_rsv = &root->fs_info->trans_block_rsv; 2351 trans->block_rsv = &root->fs_info->trans_block_rsv;
2316 ret = fill_holes(trans, inode, path, cur_offset, drop_end); 2352 if (cur_offset < ino_size) {
2317 if (ret) { 2353 ret = fill_holes(trans, inode, path, cur_offset, drop_end);
2318 err = ret; 2354 if (ret) {
2319 goto out_trans; 2355 err = ret;
2356 goto out_trans;
2357 }
2320 } 2358 }
2321 2359
2322out_trans: 2360out_trans:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 49ec1398879f..06e9a4152b14 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -864,7 +864,8 @@ static noinline int cow_file_range(struct inode *inode,
864 864
865 if (btrfs_is_free_space_inode(inode)) { 865 if (btrfs_is_free_space_inode(inode)) {
866 WARN_ON_ONCE(1); 866 WARN_ON_ONCE(1);
867 return -EINVAL; 867 ret = -EINVAL;
868 goto out_unlock;
868 } 869 }
869 870
870 num_bytes = ALIGN(end - start + 1, blocksize); 871 num_bytes = ALIGN(end - start + 1, blocksize);
@@ -1075,17 +1076,15 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1075 async_cow->end = cur_end; 1076 async_cow->end = cur_end;
1076 INIT_LIST_HEAD(&async_cow->extents); 1077 INIT_LIST_HEAD(&async_cow->extents);
1077 1078
1078 async_cow->work.func = async_cow_start; 1079 btrfs_init_work(&async_cow->work, async_cow_start,
1079 async_cow->work.ordered_func = async_cow_submit; 1080 async_cow_submit, async_cow_free);
1080 async_cow->work.ordered_free = async_cow_free;
1081 async_cow->work.flags = 0;
1082 1081
1083 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> 1082 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
1084 PAGE_CACHE_SHIFT; 1083 PAGE_CACHE_SHIFT;
1085 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); 1084 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
1086 1085
1087 btrfs_queue_worker(&root->fs_info->delalloc_workers, 1086 btrfs_queue_work(root->fs_info->delalloc_workers,
1088 &async_cow->work); 1087 &async_cow->work);
1089 1088
1090 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { 1089 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
1091 wait_event(root->fs_info->async_submit_wait, 1090 wait_event(root->fs_info->async_submit_wait,
@@ -1843,9 +1842,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1843 1842
1844 SetPageChecked(page); 1843 SetPageChecked(page);
1845 page_cache_get(page); 1844 page_cache_get(page);
1846 fixup->work.func = btrfs_writepage_fixup_worker; 1845 btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
1847 fixup->page = page; 1846 fixup->page = page;
1848 btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); 1847 btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
1849 return -EBUSY; 1848 return -EBUSY;
1850} 1849}
1851 1850
@@ -2239,6 +2238,11 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
2239 return PTR_ERR(root); 2238 return PTR_ERR(root);
2240 } 2239 }
2241 2240
2241 if (btrfs_root_readonly(root)) {
2242 srcu_read_unlock(&fs_info->subvol_srcu, index);
2243 return 0;
2244 }
2245
2242 /* step 2: get inode */ 2246 /* step 2: get inode */
2243 key.objectid = backref->inum; 2247 key.objectid = backref->inum;
2244 key.type = BTRFS_INODE_ITEM_KEY; 2248 key.type = BTRFS_INODE_ITEM_KEY;
@@ -2759,7 +2763,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2759 struct inode *inode = page->mapping->host; 2763 struct inode *inode = page->mapping->host;
2760 struct btrfs_root *root = BTRFS_I(inode)->root; 2764 struct btrfs_root *root = BTRFS_I(inode)->root;
2761 struct btrfs_ordered_extent *ordered_extent = NULL; 2765 struct btrfs_ordered_extent *ordered_extent = NULL;
2762 struct btrfs_workers *workers; 2766 struct btrfs_workqueue *workers;
2763 2767
2764 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); 2768 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
2765 2769
@@ -2768,14 +2772,13 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2768 end - start + 1, uptodate)) 2772 end - start + 1, uptodate))
2769 return 0; 2773 return 0;
2770 2774
2771 ordered_extent->work.func = finish_ordered_fn; 2775 btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
2772 ordered_extent->work.flags = 0;
2773 2776
2774 if (btrfs_is_free_space_inode(inode)) 2777 if (btrfs_is_free_space_inode(inode))
2775 workers = &root->fs_info->endio_freespace_worker; 2778 workers = root->fs_info->endio_freespace_worker;
2776 else 2779 else
2777 workers = &root->fs_info->endio_write_workers; 2780 workers = root->fs_info->endio_write_workers;
2778 btrfs_queue_worker(workers, &ordered_extent->work); 2781 btrfs_queue_work(workers, &ordered_extent->work);
2779 2782
2780 return 0; 2783 return 0;
2781} 2784}
@@ -4924,7 +4927,8 @@ void btrfs_invalidate_inodes(struct btrfs_root *root)
4924 struct inode *inode; 4927 struct inode *inode;
4925 u64 objectid = 0; 4928 u64 objectid = 0;
4926 4929
4927 WARN_ON(btrfs_root_refs(&root->root_item) != 0); 4930 if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
4931 WARN_ON(btrfs_root_refs(&root->root_item) != 0);
4928 4932
4929 spin_lock(&root->inode_lock); 4933 spin_lock(&root->inode_lock);
4930again: 4934again:
@@ -5799,6 +5803,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
5799 } 5803 }
5800out_unlock: 5804out_unlock:
5801 btrfs_end_transaction(trans, root); 5805 btrfs_end_transaction(trans, root);
5806 btrfs_balance_delayed_items(root);
5802 btrfs_btree_balance_dirty(root); 5807 btrfs_btree_balance_dirty(root);
5803 if (drop_inode) { 5808 if (drop_inode) {
5804 inode_dec_link_count(inode); 5809 inode_dec_link_count(inode);
@@ -5872,6 +5877,7 @@ out_unlock:
5872 inode_dec_link_count(inode); 5877 inode_dec_link_count(inode);
5873 iput(inode); 5878 iput(inode);
5874 } 5879 }
5880 btrfs_balance_delayed_items(root);
5875 btrfs_btree_balance_dirty(root); 5881 btrfs_btree_balance_dirty(root);
5876 return err; 5882 return err;
5877} 5883}
@@ -5930,6 +5936,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5930 } 5936 }
5931 5937
5932 btrfs_end_transaction(trans, root); 5938 btrfs_end_transaction(trans, root);
5939 btrfs_balance_delayed_items(root);
5933fail: 5940fail:
5934 if (drop_inode) { 5941 if (drop_inode) {
5935 inode_dec_link_count(inode); 5942 inode_dec_link_count(inode);
@@ -5996,6 +6003,7 @@ out_fail:
5996 btrfs_end_transaction(trans, root); 6003 btrfs_end_transaction(trans, root);
5997 if (drop_on_err) 6004 if (drop_on_err)
5998 iput(inode); 6005 iput(inode);
6006 btrfs_balance_delayed_items(root);
5999 btrfs_btree_balance_dirty(root); 6007 btrfs_btree_balance_dirty(root);
6000 return err; 6008 return err;
6001} 6009}
@@ -6550,6 +6558,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6550 int ret; 6558 int ret;
6551 struct extent_buffer *leaf; 6559 struct extent_buffer *leaf;
6552 struct btrfs_root *root = BTRFS_I(inode)->root; 6560 struct btrfs_root *root = BTRFS_I(inode)->root;
6561 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6553 struct btrfs_file_extent_item *fi; 6562 struct btrfs_file_extent_item *fi;
6554 struct btrfs_key key; 6563 struct btrfs_key key;
6555 u64 disk_bytenr; 6564 u64 disk_bytenr;
@@ -6626,6 +6635,20 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6626 6635
6627 if (btrfs_extent_readonly(root, disk_bytenr)) 6636 if (btrfs_extent_readonly(root, disk_bytenr))
6628 goto out; 6637 goto out;
6638
6639 num_bytes = min(offset + *len, extent_end) - offset;
6640 if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6641 u64 range_end;
6642
6643 range_end = round_up(offset + num_bytes, root->sectorsize) - 1;
6644 ret = test_range_bit(io_tree, offset, range_end,
6645 EXTENT_DELALLOC, 0, NULL);
6646 if (ret) {
6647 ret = -EAGAIN;
6648 goto out;
6649 }
6650 }
6651
6629 btrfs_release_path(path); 6652 btrfs_release_path(path);
6630 6653
6631 /* 6654 /*
@@ -6654,7 +6677,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6654 */ 6677 */
6655 disk_bytenr += backref_offset; 6678 disk_bytenr += backref_offset;
6656 disk_bytenr += offset - key.offset; 6679 disk_bytenr += offset - key.offset;
6657 num_bytes = min(offset + *len, extent_end) - offset;
6658 if (csum_exist_in_range(root, disk_bytenr, num_bytes)) 6680 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
6659 goto out; 6681 goto out;
6660 /* 6682 /*
@@ -7024,10 +7046,9 @@ again:
7024 if (!ret) 7046 if (!ret)
7025 goto out_test; 7047 goto out_test;
7026 7048
7027 ordered->work.func = finish_ordered_fn; 7049 btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
7028 ordered->work.flags = 0; 7050 btrfs_queue_work(root->fs_info->endio_write_workers,
7029 btrfs_queue_worker(&root->fs_info->endio_write_workers, 7051 &ordered->work);
7030 &ordered->work);
7031out_test: 7052out_test:
7032 /* 7053 /*
7033 * our bio might span multiple ordered extents. If we haven't 7054 * our bio might span multiple ordered extents. If we haven't
@@ -7404,15 +7425,15 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7404 smp_mb__after_atomic_inc(); 7425 smp_mb__after_atomic_inc();
7405 7426
7406 /* 7427 /*
7407 * The generic stuff only does filemap_write_and_wait_range, which isn't 7428 * The generic stuff only does filemap_write_and_wait_range, which
7408 * enough if we've written compressed pages to this area, so we need to 7429 * isn't enough if we've written compressed pages to this area, so
7409 * call btrfs_wait_ordered_range to make absolutely sure that any 7430 * we need to flush the dirty pages again to make absolutely sure
7410 * outstanding dirty pages are on disk. 7431 * that any outstanding dirty pages are on disk.
7411 */ 7432 */
7412 count = iov_length(iov, nr_segs); 7433 count = iov_length(iov, nr_segs);
7413 ret = btrfs_wait_ordered_range(inode, offset, count); 7434 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
7414 if (ret) 7435 &BTRFS_I(inode)->runtime_flags))
7415 return ret; 7436 filemap_fdatawrite_range(inode->i_mapping, offset, count);
7416 7437
7417 if (rw & WRITE) { 7438 if (rw & WRITE) {
7418 /* 7439 /*
@@ -8404,7 +8425,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
8404 work->inode = inode; 8425 work->inode = inode;
8405 work->wait = wait; 8426 work->wait = wait;
8406 work->delay_iput = delay_iput; 8427 work->delay_iput = delay_iput;
8407 work->work.func = btrfs_run_delalloc_work; 8428 btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
8408 8429
8409 return work; 8430 return work;
8410} 8431}
@@ -8419,7 +8440,8 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
8419 * some fairly slow code that needs optimization. This walks the list 8440 * some fairly slow code that needs optimization. This walks the list
8420 * of all the inodes with pending delalloc and forces them to disk. 8441 * of all the inodes with pending delalloc and forces them to disk.
8421 */ 8442 */
8422static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput) 8443static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
8444 int nr)
8423{ 8445{
8424 struct btrfs_inode *binode; 8446 struct btrfs_inode *binode;
8425 struct inode *inode; 8447 struct inode *inode;
@@ -8431,6 +8453,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8431 INIT_LIST_HEAD(&works); 8453 INIT_LIST_HEAD(&works);
8432 INIT_LIST_HEAD(&splice); 8454 INIT_LIST_HEAD(&splice);
8433 8455
8456 mutex_lock(&root->delalloc_mutex);
8434 spin_lock(&root->delalloc_lock); 8457 spin_lock(&root->delalloc_lock);
8435 list_splice_init(&root->delalloc_inodes, &splice); 8458 list_splice_init(&root->delalloc_inodes, &splice);
8436 while (!list_empty(&splice)) { 8459 while (!list_empty(&splice)) {
@@ -8453,12 +8476,14 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8453 else 8476 else
8454 iput(inode); 8477 iput(inode);
8455 ret = -ENOMEM; 8478 ret = -ENOMEM;
8456 goto out; 8479 break;
8457 } 8480 }
8458 list_add_tail(&work->list, &works); 8481 list_add_tail(&work->list, &works);
8459 btrfs_queue_worker(&root->fs_info->flush_workers, 8482 btrfs_queue_work(root->fs_info->flush_workers,
8460 &work->work); 8483 &work->work);
8461 8484 ret++;
8485 if (nr != -1 && ret >= nr)
8486 break;
8462 cond_resched(); 8487 cond_resched();
8463 spin_lock(&root->delalloc_lock); 8488 spin_lock(&root->delalloc_lock);
8464 } 8489 }
@@ -8468,18 +8493,13 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8468 list_del_init(&work->list); 8493 list_del_init(&work->list);
8469 btrfs_wait_and_free_delalloc_work(work); 8494 btrfs_wait_and_free_delalloc_work(work);
8470 } 8495 }
8471 return 0;
8472out:
8473 list_for_each_entry_safe(work, next, &works, list) {
8474 list_del_init(&work->list);
8475 btrfs_wait_and_free_delalloc_work(work);
8476 }
8477 8496
8478 if (!list_empty_careful(&splice)) { 8497 if (!list_empty_careful(&splice)) {
8479 spin_lock(&root->delalloc_lock); 8498 spin_lock(&root->delalloc_lock);
8480 list_splice_tail(&splice, &root->delalloc_inodes); 8499 list_splice_tail(&splice, &root->delalloc_inodes);
8481 spin_unlock(&root->delalloc_lock); 8500 spin_unlock(&root->delalloc_lock);
8482 } 8501 }
8502 mutex_unlock(&root->delalloc_mutex);
8483 return ret; 8503 return ret;
8484} 8504}
8485 8505
@@ -8490,7 +8510,9 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8490 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) 8510 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
8491 return -EROFS; 8511 return -EROFS;
8492 8512
8493 ret = __start_delalloc_inodes(root, delay_iput); 8513 ret = __start_delalloc_inodes(root, delay_iput, -1);
8514 if (ret > 0)
8515 ret = 0;
8494 /* 8516 /*
8495 * the filemap_flush will queue IO into the worker threads, but 8517 * the filemap_flush will queue IO into the worker threads, but
8496 * we have to make sure the IO is actually started and that 8518 * we have to make sure the IO is actually started and that
@@ -8507,7 +8529,8 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8507 return ret; 8529 return ret;
8508} 8530}
8509 8531
8510int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput) 8532int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
8533 int nr)
8511{ 8534{
8512 struct btrfs_root *root; 8535 struct btrfs_root *root;
8513 struct list_head splice; 8536 struct list_head splice;
@@ -8518,9 +8541,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
8518 8541
8519 INIT_LIST_HEAD(&splice); 8542 INIT_LIST_HEAD(&splice);
8520 8543
8544 mutex_lock(&fs_info->delalloc_root_mutex);
8521 spin_lock(&fs_info->delalloc_root_lock); 8545 spin_lock(&fs_info->delalloc_root_lock);
8522 list_splice_init(&fs_info->delalloc_roots, &splice); 8546 list_splice_init(&fs_info->delalloc_roots, &splice);
8523 while (!list_empty(&splice)) { 8547 while (!list_empty(&splice) && nr) {
8524 root = list_first_entry(&splice, struct btrfs_root, 8548 root = list_first_entry(&splice, struct btrfs_root,
8525 delalloc_root); 8549 delalloc_root);
8526 root = btrfs_grab_fs_root(root); 8550 root = btrfs_grab_fs_root(root);
@@ -8529,15 +8553,20 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
8529 &fs_info->delalloc_roots); 8553 &fs_info->delalloc_roots);
8530 spin_unlock(&fs_info->delalloc_root_lock); 8554 spin_unlock(&fs_info->delalloc_root_lock);
8531 8555
8532 ret = __start_delalloc_inodes(root, delay_iput); 8556 ret = __start_delalloc_inodes(root, delay_iput, nr);
8533 btrfs_put_fs_root(root); 8557 btrfs_put_fs_root(root);
8534 if (ret) 8558 if (ret < 0)
8535 goto out; 8559 goto out;
8536 8560
8561 if (nr != -1) {
8562 nr -= ret;
8563 WARN_ON(nr < 0);
8564 }
8537 spin_lock(&fs_info->delalloc_root_lock); 8565 spin_lock(&fs_info->delalloc_root_lock);
8538 } 8566 }
8539 spin_unlock(&fs_info->delalloc_root_lock); 8567 spin_unlock(&fs_info->delalloc_root_lock);
8540 8568
8569 ret = 0;
8541 atomic_inc(&fs_info->async_submit_draining); 8570 atomic_inc(&fs_info->async_submit_draining);
8542 while (atomic_read(&fs_info->nr_async_submits) || 8571 while (atomic_read(&fs_info->nr_async_submits) ||
8543 atomic_read(&fs_info->async_delalloc_pages)) { 8572 atomic_read(&fs_info->async_delalloc_pages)) {
@@ -8546,13 +8575,13 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
8546 atomic_read(&fs_info->async_delalloc_pages) == 0)); 8575 atomic_read(&fs_info->async_delalloc_pages) == 0));
8547 } 8576 }
8548 atomic_dec(&fs_info->async_submit_draining); 8577 atomic_dec(&fs_info->async_submit_draining);
8549 return 0;
8550out: 8578out:
8551 if (!list_empty_careful(&splice)) { 8579 if (!list_empty_careful(&splice)) {
8552 spin_lock(&fs_info->delalloc_root_lock); 8580 spin_lock(&fs_info->delalloc_root_lock);
8553 list_splice_tail(&splice, &fs_info->delalloc_roots); 8581 list_splice_tail(&splice, &fs_info->delalloc_roots);
8554 spin_unlock(&fs_info->delalloc_root_lock); 8582 spin_unlock(&fs_info->delalloc_root_lock);
8555 } 8583 }
8584 mutex_unlock(&fs_info->delalloc_root_mutex);
8556 return ret; 8585 return ret;
8557} 8586}
8558 8587
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a6d8efa46bfe..0401397b5c92 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -59,6 +59,32 @@
59#include "props.h" 59#include "props.h"
60#include "sysfs.h" 60#include "sysfs.h"
61 61
62#ifdef CONFIG_64BIT
63/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
64 * structures are incorrect, as the timespec structure from userspace
65 * is 4 bytes too small. We define these alternatives here to teach
66 * the kernel about the 32-bit struct packing.
67 */
68struct btrfs_ioctl_timespec_32 {
69 __u64 sec;
70 __u32 nsec;
71} __attribute__ ((__packed__));
72
73struct btrfs_ioctl_received_subvol_args_32 {
74 char uuid[BTRFS_UUID_SIZE]; /* in */
75 __u64 stransid; /* in */
76 __u64 rtransid; /* out */
77 struct btrfs_ioctl_timespec_32 stime; /* in */
78 struct btrfs_ioctl_timespec_32 rtime; /* out */
79 __u64 flags; /* in */
80 __u64 reserved[16]; /* in */
81} __attribute__ ((__packed__));
82
83#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
84 struct btrfs_ioctl_received_subvol_args_32)
85#endif
86
87
62static int btrfs_clone(struct inode *src, struct inode *inode, 88static int btrfs_clone(struct inode *src, struct inode *inode,
63 u64 off, u64 olen, u64 olen_aligned, u64 destoff); 89 u64 off, u64 olen, u64 olen_aligned, u64 destoff);
64 90
@@ -585,6 +611,23 @@ fail:
585 return ret; 611 return ret;
586} 612}
587 613
614static void btrfs_wait_nocow_write(struct btrfs_root *root)
615{
616 s64 writers;
617 DEFINE_WAIT(wait);
618
619 do {
620 prepare_to_wait(&root->subv_writers->wait, &wait,
621 TASK_UNINTERRUPTIBLE);
622
623 writers = percpu_counter_sum(&root->subv_writers->counter);
624 if (writers)
625 schedule();
626
627 finish_wait(&root->subv_writers->wait, &wait);
628 } while (writers);
629}
630
588static int create_snapshot(struct btrfs_root *root, struct inode *dir, 631static int create_snapshot(struct btrfs_root *root, struct inode *dir,
589 struct dentry *dentry, char *name, int namelen, 632 struct dentry *dentry, char *name, int namelen,
590 u64 *async_transid, bool readonly, 633 u64 *async_transid, bool readonly,
@@ -598,15 +641,21 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
598 if (!root->ref_cows) 641 if (!root->ref_cows)
599 return -EINVAL; 642 return -EINVAL;
600 643
644 atomic_inc(&root->will_be_snapshoted);
645 smp_mb__after_atomic_inc();
646 btrfs_wait_nocow_write(root);
647
601 ret = btrfs_start_delalloc_inodes(root, 0); 648 ret = btrfs_start_delalloc_inodes(root, 0);
602 if (ret) 649 if (ret)
603 return ret; 650 goto out;
604 651
605 btrfs_wait_ordered_extents(root, -1); 652 btrfs_wait_ordered_extents(root, -1);
606 653
607 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 654 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
608 if (!pending_snapshot) 655 if (!pending_snapshot) {
609 return -ENOMEM; 656 ret = -ENOMEM;
657 goto out;
658 }
610 659
611 btrfs_init_block_rsv(&pending_snapshot->block_rsv, 660 btrfs_init_block_rsv(&pending_snapshot->block_rsv,
612 BTRFS_BLOCK_RSV_TEMP); 661 BTRFS_BLOCK_RSV_TEMP);
@@ -623,7 +672,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
623 &pending_snapshot->qgroup_reserved, 672 &pending_snapshot->qgroup_reserved,
624 false); 673 false);
625 if (ret) 674 if (ret)
626 goto out; 675 goto free;
627 676
628 pending_snapshot->dentry = dentry; 677 pending_snapshot->dentry = dentry;
629 pending_snapshot->root = root; 678 pending_snapshot->root = root;
@@ -674,8 +723,10 @@ fail:
674 btrfs_subvolume_release_metadata(BTRFS_I(dir)->root, 723 btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
675 &pending_snapshot->block_rsv, 724 &pending_snapshot->block_rsv,
676 pending_snapshot->qgroup_reserved); 725 pending_snapshot->qgroup_reserved);
677out: 726free:
678 kfree(pending_snapshot); 727 kfree(pending_snapshot);
728out:
729 atomic_dec(&root->will_be_snapshoted);
679 return ret; 730 return ret;
680} 731}
681 732
@@ -884,12 +935,14 @@ static int find_new_extents(struct btrfs_root *root,
884 min_key.type = BTRFS_EXTENT_DATA_KEY; 935 min_key.type = BTRFS_EXTENT_DATA_KEY;
885 min_key.offset = *off; 936 min_key.offset = *off;
886 937
887 path->keep_locks = 1;
888
889 while (1) { 938 while (1) {
939 path->keep_locks = 1;
890 ret = btrfs_search_forward(root, &min_key, path, newer_than); 940 ret = btrfs_search_forward(root, &min_key, path, newer_than);
891 if (ret != 0) 941 if (ret != 0)
892 goto none; 942 goto none;
943 path->keep_locks = 0;
944 btrfs_unlock_up_safe(path, 1);
945process_slot:
893 if (min_key.objectid != ino) 946 if (min_key.objectid != ino)
894 goto none; 947 goto none;
895 if (min_key.type != BTRFS_EXTENT_DATA_KEY) 948 if (min_key.type != BTRFS_EXTENT_DATA_KEY)
@@ -908,6 +961,12 @@ static int find_new_extents(struct btrfs_root *root,
908 return 0; 961 return 0;
909 } 962 }
910 963
964 path->slots[0]++;
965 if (path->slots[0] < btrfs_header_nritems(leaf)) {
966 btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
967 goto process_slot;
968 }
969
911 if (min_key.offset == (u64)-1) 970 if (min_key.offset == (u64)-1)
912 goto none; 971 goto none;
913 972
@@ -935,10 +994,13 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
935 read_unlock(&em_tree->lock); 994 read_unlock(&em_tree->lock);
936 995
937 if (!em) { 996 if (!em) {
997 struct extent_state *cached = NULL;
998 u64 end = start + len - 1;
999
938 /* get the big lock and read metadata off disk */ 1000 /* get the big lock and read metadata off disk */
939 lock_extent(io_tree, start, start + len - 1); 1001 lock_extent_bits(io_tree, start, end, 0, &cached);
940 em = btrfs_get_extent(inode, NULL, 0, start, len, 0); 1002 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
941 unlock_extent(io_tree, start, start + len - 1); 1003 unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
942 1004
943 if (IS_ERR(em)) 1005 if (IS_ERR(em))
944 return NULL; 1006 return NULL;
@@ -957,7 +1019,8 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
957 return false; 1019 return false;
958 1020
959 next = defrag_lookup_extent(inode, em->start + em->len); 1021 next = defrag_lookup_extent(inode, em->start + em->len);
960 if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) 1022 if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE ||
1023 (em->block_start + em->block_len == next->block_start))
961 ret = false; 1024 ret = false;
962 1025
963 free_extent_map(next); 1026 free_extent_map(next);
@@ -1076,10 +1139,12 @@ again:
1076 page_start = page_offset(page); 1139 page_start = page_offset(page);
1077 page_end = page_start + PAGE_CACHE_SIZE - 1; 1140 page_end = page_start + PAGE_CACHE_SIZE - 1;
1078 while (1) { 1141 while (1) {
1079 lock_extent(tree, page_start, page_end); 1142 lock_extent_bits(tree, page_start, page_end,
1143 0, &cached_state);
1080 ordered = btrfs_lookup_ordered_extent(inode, 1144 ordered = btrfs_lookup_ordered_extent(inode,
1081 page_start); 1145 page_start);
1082 unlock_extent(tree, page_start, page_end); 1146 unlock_extent_cached(tree, page_start, page_end,
1147 &cached_state, GFP_NOFS);
1083 if (!ordered) 1148 if (!ordered)
1084 break; 1149 break;
1085 1150
@@ -1356,8 +1421,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1356 } 1421 }
1357 } 1422 }
1358 1423
1359 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) 1424 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
1360 filemap_flush(inode->i_mapping); 1425 filemap_flush(inode->i_mapping);
1426 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1427 &BTRFS_I(inode)->runtime_flags))
1428 filemap_flush(inode->i_mapping);
1429 }
1361 1430
1362 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { 1431 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
1363 /* the filemap_flush will queue IO into the worker threads, but 1432 /* the filemap_flush will queue IO into the worker threads, but
@@ -1573,7 +1642,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
1573 if (src_inode->i_sb != file_inode(file)->i_sb) { 1642 if (src_inode->i_sb != file_inode(file)->i_sb) {
1574 btrfs_info(BTRFS_I(src_inode)->root->fs_info, 1643 btrfs_info(BTRFS_I(src_inode)->root->fs_info,
1575 "Snapshot src from another FS"); 1644 "Snapshot src from another FS");
1576 ret = -EINVAL; 1645 ret = -EXDEV;
1577 } else if (!inode_owner_or_capable(src_inode)) { 1646 } else if (!inode_owner_or_capable(src_inode)) {
1578 /* 1647 /*
1579 * Subvolume creation is not restricted, but snapshots 1648 * Subvolume creation is not restricted, but snapshots
@@ -1797,7 +1866,9 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
1797 if (di && !IS_ERR(di)) { 1866 if (di && !IS_ERR(di)) {
1798 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); 1867 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
1799 if (key.objectid == root->root_key.objectid) { 1868 if (key.objectid == root->root_key.objectid) {
1800 ret = -ENOTEMPTY; 1869 ret = -EPERM;
1870 btrfs_err(root->fs_info, "deleting default subvolume "
1871 "%llu is not allowed", key.objectid);
1801 goto out; 1872 goto out;
1802 } 1873 }
1803 btrfs_release_path(path); 1874 btrfs_release_path(path);
@@ -2994,8 +3065,9 @@ process_slot:
2994 new_key.offset + datal, 3065 new_key.offset + datal,
2995 1); 3066 1);
2996 if (ret) { 3067 if (ret) {
2997 btrfs_abort_transaction(trans, root, 3068 if (ret != -EINVAL)
2998 ret); 3069 btrfs_abort_transaction(trans,
3070 root, ret);
2999 btrfs_end_transaction(trans, root); 3071 btrfs_end_transaction(trans, root);
3000 goto out; 3072 goto out;
3001 } 3073 }
@@ -3153,8 +3225,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
3153 * decompress into destination's address_space (the file offset 3225 * decompress into destination's address_space (the file offset
3154 * may change, so source mapping won't do), then recompress (or 3226 * may change, so source mapping won't do), then recompress (or
3155 * otherwise reinsert) a subrange. 3227 * otherwise reinsert) a subrange.
3156 * - allow ranges within the same file to be cloned (provided 3228 *
3157 * they don't overlap)? 3229 * - split destination inode's inline extents. The inline extents can
3230 * be either compressed or non-compressed.
3158 */ 3231 */
3159 3232
3160 /* the destination must be opened for writing */ 3233 /* the destination must be opened for writing */
@@ -4353,10 +4426,9 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
4353 return btrfs_qgroup_wait_for_completion(root->fs_info); 4426 return btrfs_qgroup_wait_for_completion(root->fs_info);
4354} 4427}
4355 4428
4356static long btrfs_ioctl_set_received_subvol(struct file *file, 4429static long _btrfs_ioctl_set_received_subvol(struct file *file,
4357 void __user *arg) 4430 struct btrfs_ioctl_received_subvol_args *sa)
4358{ 4431{
4359 struct btrfs_ioctl_received_subvol_args *sa = NULL;
4360 struct inode *inode = file_inode(file); 4432 struct inode *inode = file_inode(file);
4361 struct btrfs_root *root = BTRFS_I(inode)->root; 4433 struct btrfs_root *root = BTRFS_I(inode)->root;
4362 struct btrfs_root_item *root_item = &root->root_item; 4434 struct btrfs_root_item *root_item = &root->root_item;
@@ -4384,13 +4456,6 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
4384 goto out; 4456 goto out;
4385 } 4457 }
4386 4458
4387 sa = memdup_user(arg, sizeof(*sa));
4388 if (IS_ERR(sa)) {
4389 ret = PTR_ERR(sa);
4390 sa = NULL;
4391 goto out;
4392 }
4393
4394 /* 4459 /*
4395 * 1 - root item 4460 * 1 - root item
4396 * 2 - uuid items (received uuid + subvol uuid) 4461 * 2 - uuid items (received uuid + subvol uuid)
@@ -4444,14 +4509,91 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
4444 goto out; 4509 goto out;
4445 } 4510 }
4446 4511
4512out:
4513 up_write(&root->fs_info->subvol_sem);
4514 mnt_drop_write_file(file);
4515 return ret;
4516}
4517
4518#ifdef CONFIG_64BIT
4519static long btrfs_ioctl_set_received_subvol_32(struct file *file,
4520 void __user *arg)
4521{
4522 struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL;
4523 struct btrfs_ioctl_received_subvol_args *args64 = NULL;
4524 int ret = 0;
4525
4526 args32 = memdup_user(arg, sizeof(*args32));
4527 if (IS_ERR(args32)) {
4528 ret = PTR_ERR(args32);
4529 args32 = NULL;
4530 goto out;
4531 }
4532
4533 args64 = kmalloc(sizeof(*args64), GFP_NOFS);
4534 if (IS_ERR(args64)) {
4535 ret = PTR_ERR(args64);
4536 args64 = NULL;
4537 goto out;
4538 }
4539
4540 memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE);
4541 args64->stransid = args32->stransid;
4542 args64->rtransid = args32->rtransid;
4543 args64->stime.sec = args32->stime.sec;
4544 args64->stime.nsec = args32->stime.nsec;
4545 args64->rtime.sec = args32->rtime.sec;
4546 args64->rtime.nsec = args32->rtime.nsec;
4547 args64->flags = args32->flags;
4548
4549 ret = _btrfs_ioctl_set_received_subvol(file, args64);
4550 if (ret)
4551 goto out;
4552
4553 memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE);
4554 args32->stransid = args64->stransid;
4555 args32->rtransid = args64->rtransid;
4556 args32->stime.sec = args64->stime.sec;
4557 args32->stime.nsec = args64->stime.nsec;
4558 args32->rtime.sec = args64->rtime.sec;
4559 args32->rtime.nsec = args64->rtime.nsec;
4560 args32->flags = args64->flags;
4561
4562 ret = copy_to_user(arg, args32, sizeof(*args32));
4563 if (ret)
4564 ret = -EFAULT;
4565
4566out:
4567 kfree(args32);
4568 kfree(args64);
4569 return ret;
4570}
4571#endif
4572
4573static long btrfs_ioctl_set_received_subvol(struct file *file,
4574 void __user *arg)
4575{
4576 struct btrfs_ioctl_received_subvol_args *sa = NULL;
4577 int ret = 0;
4578
4579 sa = memdup_user(arg, sizeof(*sa));
4580 if (IS_ERR(sa)) {
4581 ret = PTR_ERR(sa);
4582 sa = NULL;
4583 goto out;
4584 }
4585
4586 ret = _btrfs_ioctl_set_received_subvol(file, sa);
4587
4588 if (ret)
4589 goto out;
4590
4447 ret = copy_to_user(arg, sa, sizeof(*sa)); 4591 ret = copy_to_user(arg, sa, sizeof(*sa));
4448 if (ret) 4592 if (ret)
4449 ret = -EFAULT; 4593 ret = -EFAULT;
4450 4594
4451out: 4595out:
4452 kfree(sa); 4596 kfree(sa);
4453 up_write(&root->fs_info->subvol_sem);
4454 mnt_drop_write_file(file);
4455 return ret; 4597 return ret;
4456} 4598}
4457 4599
@@ -4746,7 +4888,7 @@ long btrfs_ioctl(struct file *file, unsigned int
4746 case BTRFS_IOC_SYNC: { 4888 case BTRFS_IOC_SYNC: {
4747 int ret; 4889 int ret;
4748 4890
4749 ret = btrfs_start_delalloc_roots(root->fs_info, 0); 4891 ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
4750 if (ret) 4892 if (ret)
4751 return ret; 4893 return ret;
4752 ret = btrfs_sync_fs(file->f_dentry->d_sb, 1); 4894 ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
@@ -4770,6 +4912,10 @@ long btrfs_ioctl(struct file *file, unsigned int
4770 return btrfs_ioctl_balance_progress(root, argp); 4912 return btrfs_ioctl_balance_progress(root, argp);
4771 case BTRFS_IOC_SET_RECEIVED_SUBVOL: 4913 case BTRFS_IOC_SET_RECEIVED_SUBVOL:
4772 return btrfs_ioctl_set_received_subvol(file, argp); 4914 return btrfs_ioctl_set_received_subvol(file, argp);
4915#ifdef CONFIG_64BIT
4916 case BTRFS_IOC_SET_RECEIVED_SUBVOL_32:
4917 return btrfs_ioctl_set_received_subvol_32(file, argp);
4918#endif
4773 case BTRFS_IOC_SEND: 4919 case BTRFS_IOC_SEND:
4774 return btrfs_ioctl_send(file, argp); 4920 return btrfs_ioctl_send(file, argp);
4775 case BTRFS_IOC_GET_DEV_STATS: 4921 case BTRFS_IOC_GET_DEV_STATS:
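
The CONFIG_64BIT block near the top of the ioctl.c diff adds packed 32-bit copies of the received-subvol structures because 32-bit userspace lays out the embedded timespec 4 bytes smaller than a 64-bit kernel expects. The program below is only an illustration of that size mismatch: it uses local struct definitions with made-up names (ts_native, ts_packed) rather than the real UAPI headers.

#include <stdio.h>

struct ts_native {               /* 64-bit layout: u64 alignment pads the struct to 16 bytes */
	unsigned long long sec;
	unsigned int nsec;
};

struct ts_packed {               /* 32-bit userspace layout: effectively 12 bytes */
	unsigned long long sec;
	unsigned int nsec;
} __attribute__((__packed__));

int main(void)
{
	/* On x86_64 this prints 16 vs 12, the 4-byte difference the compat ioctl handles. */
	printf("native: %zu bytes, packed: %zu bytes\n",
	       sizeof(struct ts_native), sizeof(struct ts_packed));
	return 0;
}
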
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b16450b840e7..a94b05f72869 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -349,10 +349,13 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
349 if (!uptodate) 349 if (!uptodate)
350 set_bit(BTRFS_ORDERED_IOERR, &entry->flags); 350 set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
351 351
352 if (entry->bytes_left == 0) 352 if (entry->bytes_left == 0) {
353 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 353 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
354 else 354 if (waitqueue_active(&entry->wait))
355 wake_up(&entry->wait);
356 } else {
355 ret = 1; 357 ret = 1;
358 }
356out: 359out:
357 if (!ret && cached && entry) { 360 if (!ret && cached && entry) {
358 *cached = entry; 361 *cached = entry;
@@ -410,10 +413,13 @@ have_entry:
410 if (!uptodate) 413 if (!uptodate)
411 set_bit(BTRFS_ORDERED_IOERR, &entry->flags); 414 set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
412 415
413 if (entry->bytes_left == 0) 416 if (entry->bytes_left == 0) {
414 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 417 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
415 else 418 if (waitqueue_active(&entry->wait))
419 wake_up(&entry->wait);
420 } else {
416 ret = 1; 421 ret = 1;
422 }
417out: 423out:
418 if (!ret && cached && entry) { 424 if (!ret && cached && entry) {
419 *cached = entry; 425 *cached = entry;
@@ -424,27 +430,48 @@ out:
424} 430}
425 431
426/* Needs to either be called under a log transaction or the log_mutex */ 432/* Needs to either be called under a log transaction or the log_mutex */
427void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode) 433void btrfs_get_logged_extents(struct inode *inode,
434 struct list_head *logged_list)
428{ 435{
429 struct btrfs_ordered_inode_tree *tree; 436 struct btrfs_ordered_inode_tree *tree;
430 struct btrfs_ordered_extent *ordered; 437 struct btrfs_ordered_extent *ordered;
431 struct rb_node *n; 438 struct rb_node *n;
432 int index = log->log_transid % 2;
433 439
434 tree = &BTRFS_I(inode)->ordered_tree; 440 tree = &BTRFS_I(inode)->ordered_tree;
435 spin_lock_irq(&tree->lock); 441 spin_lock_irq(&tree->lock);
436 for (n = rb_first(&tree->tree); n; n = rb_next(n)) { 442 for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
437 ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); 443 ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
438 spin_lock(&log->log_extents_lock[index]); 444 if (!list_empty(&ordered->log_list))
439 if (list_empty(&ordered->log_list)) { 445 continue;
440 list_add_tail(&ordered->log_list, &log->logged_list[index]); 446 list_add_tail(&ordered->log_list, logged_list);
441 atomic_inc(&ordered->refs); 447 atomic_inc(&ordered->refs);
442 }
443 spin_unlock(&log->log_extents_lock[index]);
444 } 448 }
445 spin_unlock_irq(&tree->lock); 449 spin_unlock_irq(&tree->lock);
446} 450}
447 451
452void btrfs_put_logged_extents(struct list_head *logged_list)
453{
454 struct btrfs_ordered_extent *ordered;
455
456 while (!list_empty(logged_list)) {
457 ordered = list_first_entry(logged_list,
458 struct btrfs_ordered_extent,
459 log_list);
460 list_del_init(&ordered->log_list);
461 btrfs_put_ordered_extent(ordered);
462 }
463}
464
465void btrfs_submit_logged_extents(struct list_head *logged_list,
466 struct btrfs_root *log)
467{
468 int index = log->log_transid % 2;
469
470 spin_lock_irq(&log->log_extents_lock[index]);
471 list_splice_tail(logged_list, &log->logged_list[index]);
472 spin_unlock_irq(&log->log_extents_lock[index]);
473}
474
448void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) 475void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
449{ 476{
450 struct btrfs_ordered_extent *ordered; 477 struct btrfs_ordered_extent *ordered;
@@ -577,7 +604,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
577 INIT_LIST_HEAD(&splice); 604 INIT_LIST_HEAD(&splice);
578 INIT_LIST_HEAD(&works); 605 INIT_LIST_HEAD(&works);
579 606
580 mutex_lock(&root->fs_info->ordered_operations_mutex); 607 mutex_lock(&root->ordered_extent_mutex);
581 spin_lock(&root->ordered_extent_lock); 608 spin_lock(&root->ordered_extent_lock);
582 list_splice_init(&root->ordered_extents, &splice); 609 list_splice_init(&root->ordered_extents, &splice);
583 while (!list_empty(&splice) && nr) { 610 while (!list_empty(&splice) && nr) {
@@ -588,10 +615,11 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
588 atomic_inc(&ordered->refs); 615 atomic_inc(&ordered->refs);
589 spin_unlock(&root->ordered_extent_lock); 616 spin_unlock(&root->ordered_extent_lock);
590 617
591 ordered->flush_work.func = btrfs_run_ordered_extent_work; 618 btrfs_init_work(&ordered->flush_work,
619 btrfs_run_ordered_extent_work, NULL, NULL);
592 list_add_tail(&ordered->work_list, &works); 620 list_add_tail(&ordered->work_list, &works);
593 btrfs_queue_worker(&root->fs_info->flush_workers, 621 btrfs_queue_work(root->fs_info->flush_workers,
594 &ordered->flush_work); 622 &ordered->flush_work);
595 623
596 cond_resched(); 624 cond_resched();
597 spin_lock(&root->ordered_extent_lock); 625 spin_lock(&root->ordered_extent_lock);
@@ -608,7 +636,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
608 btrfs_put_ordered_extent(ordered); 636 btrfs_put_ordered_extent(ordered);
609 cond_resched(); 637 cond_resched();
610 } 638 }
611 mutex_unlock(&root->fs_info->ordered_operations_mutex); 639 mutex_unlock(&root->ordered_extent_mutex);
612 640
613 return count; 641 return count;
614} 642}
@@ -621,6 +649,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
621 649
622 INIT_LIST_HEAD(&splice); 650 INIT_LIST_HEAD(&splice);
623 651
652 mutex_lock(&fs_info->ordered_operations_mutex);
624 spin_lock(&fs_info->ordered_root_lock); 653 spin_lock(&fs_info->ordered_root_lock);
625 list_splice_init(&fs_info->ordered_roots, &splice); 654 list_splice_init(&fs_info->ordered_roots, &splice);
626 while (!list_empty(&splice) && nr) { 655 while (!list_empty(&splice) && nr) {
@@ -643,6 +672,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
643 } 672 }
644 list_splice_tail(&splice, &fs_info->ordered_roots); 673 list_splice_tail(&splice, &fs_info->ordered_roots);
645 spin_unlock(&fs_info->ordered_root_lock); 674 spin_unlock(&fs_info->ordered_root_lock);
675 mutex_unlock(&fs_info->ordered_operations_mutex);
646} 676}
647 677
648/* 678/*
@@ -704,8 +734,8 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
704 goto out; 734 goto out;
705 } 735 }
706 list_add_tail(&work->list, &works); 736 list_add_tail(&work->list, &works);
707 btrfs_queue_worker(&root->fs_info->flush_workers, 737 btrfs_queue_work(root->fs_info->flush_workers,
708 &work->work); 738 &work->work);
709 739
710 cond_resched(); 740 cond_resched();
711 spin_lock(&root->fs_info->ordered_root_lock); 741 spin_lock(&root->fs_info->ordered_root_lock);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 9b0450f7ac20..246897058efb 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -197,7 +197,11 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
197 struct inode *inode); 197 struct inode *inode);
198int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); 198int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
199void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); 199void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
200void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode); 200void btrfs_get_logged_extents(struct inode *inode,
201 struct list_head *logged_list);
202void btrfs_put_logged_extents(struct list_head *logged_list);
203void btrfs_submit_logged_extents(struct list_head *logged_list,
204 struct btrfs_root *log);
201void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); 205void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
202void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); 206void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
203int __init ordered_data_init(void); 207int __init ordered_data_init(void);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 472302a2d745..2cf905877aaf 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1509,8 +1509,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
1509 ret = qgroup_rescan_init(fs_info, 0, 1); 1509 ret = qgroup_rescan_init(fs_info, 0, 1);
1510 if (!ret) { 1510 if (!ret) {
1511 qgroup_rescan_zero_tracking(fs_info); 1511 qgroup_rescan_zero_tracking(fs_info);
1512 btrfs_queue_worker(&fs_info->qgroup_rescan_workers, 1512 btrfs_queue_work(fs_info->qgroup_rescan_workers,
1513 &fs_info->qgroup_rescan_work); 1513 &fs_info->qgroup_rescan_work);
1514 } 1514 }
1515 ret = 0; 1515 ret = 0;
1516 } 1516 }
@@ -2095,7 +2095,8 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
2095 2095
2096 memset(&fs_info->qgroup_rescan_work, 0, 2096 memset(&fs_info->qgroup_rescan_work, 0,
2097 sizeof(fs_info->qgroup_rescan_work)); 2097 sizeof(fs_info->qgroup_rescan_work));
2098 fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker; 2098 btrfs_init_work(&fs_info->qgroup_rescan_work,
2099 btrfs_qgroup_rescan_worker, NULL, NULL);
2099 2100
2100 if (ret) { 2101 if (ret) {
2101err: 2102err:
@@ -2158,8 +2159,8 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
2158 2159
2159 qgroup_rescan_zero_tracking(fs_info); 2160 qgroup_rescan_zero_tracking(fs_info);
2160 2161
2161 btrfs_queue_worker(&fs_info->qgroup_rescan_workers, 2162 btrfs_queue_work(fs_info->qgroup_rescan_workers,
2162 &fs_info->qgroup_rescan_work); 2163 &fs_info->qgroup_rescan_work);
2163 2164
2164 return 0; 2165 return 0;
2165} 2166}
@@ -2190,6 +2191,6 @@ void
2190btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) 2191btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
2191{ 2192{
2192 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) 2193 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
2193 btrfs_queue_worker(&fs_info->qgroup_rescan_workers, 2194 btrfs_queue_work(fs_info->qgroup_rescan_workers,
2194 &fs_info->qgroup_rescan_work); 2195 &fs_info->qgroup_rescan_work);
2195} 2196}
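The qgroup hunks above, and the raid56, reada and scrub hunks below, all follow the same conversion: the old per-work setup (work.flags = 0; work.func = fn;) plus btrfs_queue_worker(&pool, &work) becomes btrfs_init_work() plus btrfs_queue_work() on a struct btrfs_workqueue pointer. A minimal sketch of the new call pattern, using only the signatures visible in these hunks; my_worker and queue_rescan are illustrative names, not part of the patch:

	/* worker callback: same prototype as before, takes the btrfs_work */
	static void my_worker(struct btrfs_work *work)
	{
		/* ... do the actual work ... */
	}

	static void queue_rescan(struct btrfs_fs_info *fs_info,
				 struct btrfs_work *work)
	{
		/* the two trailing NULLs are the ordered callbacks, unused here */
		btrfs_init_work(work, my_worker, NULL, NULL);
		/* note: the workqueue member is now a pointer, no '&' needed */
		btrfs_queue_work(fs_info->qgroup_rescan_workers, work);
	}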
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 9af0b25d991a..4055291a523e 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1416,20 +1416,18 @@ cleanup:
1416 1416
1417static void async_rmw_stripe(struct btrfs_raid_bio *rbio) 1417static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1418{ 1418{
1419 rbio->work.flags = 0; 1419 btrfs_init_work(&rbio->work, rmw_work, NULL, NULL);
1420 rbio->work.func = rmw_work;
1421 1420
1422 btrfs_queue_worker(&rbio->fs_info->rmw_workers, 1421 btrfs_queue_work(rbio->fs_info->rmw_workers,
1423 &rbio->work); 1422 &rbio->work);
1424} 1423}
1425 1424
1426static void async_read_rebuild(struct btrfs_raid_bio *rbio) 1425static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1427{ 1426{
1428 rbio->work.flags = 0; 1427 btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL);
1429 rbio->work.func = read_rebuild_work;
1430 1428
1431 btrfs_queue_worker(&rbio->fs_info->rmw_workers, 1429 btrfs_queue_work(rbio->fs_info->rmw_workers,
1432 &rbio->work); 1430 &rbio->work);
1433} 1431}
1434 1432
1435/* 1433/*
@@ -1667,10 +1665,9 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1667 plug = container_of(cb, struct btrfs_plug_cb, cb); 1665 plug = container_of(cb, struct btrfs_plug_cb, cb);
1668 1666
1669 if (from_schedule) { 1667 if (from_schedule) {
1670 plug->work.flags = 0; 1668 btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
1671 plug->work.func = unplug_work; 1669 btrfs_queue_work(plug->info->rmw_workers,
1672 btrfs_queue_worker(&plug->info->rmw_workers, 1670 &plug->work);
1673 &plug->work);
1674 return; 1671 return;
1675 } 1672 }
1676 run_plug(plug); 1673 run_plug(plug);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 31c797c48c3e..30947f923620 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -793,10 +793,10 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info)
793 /* FIXME we cannot handle this properly right now */ 793 /* FIXME we cannot handle this properly right now */
794 BUG(); 794 BUG();
795 } 795 }
796 rmw->work.func = reada_start_machine_worker; 796 btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
797 rmw->fs_info = fs_info; 797 rmw->fs_info = fs_info;
798 798
799 btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work); 799 btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
800} 800}
801 801
802#ifdef DEBUG 802#ifdef DEBUG
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 07b3b36f40ee..def428a25b2a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4248,7 +4248,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4248 btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu", 4248 btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu",
4249 rc->block_group->key.objectid, rc->block_group->flags); 4249 rc->block_group->key.objectid, rc->block_group->flags);
4250 4250
4251 ret = btrfs_start_delalloc_roots(fs_info, 0); 4251 ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
4252 if (ret < 0) { 4252 if (ret < 0) {
4253 err = ret; 4253 err = ret;
4254 goto out; 4254 goto out;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 1389b69059de..38bb47e7d6b1 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -16,6 +16,7 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/err.h>
19#include <linux/uuid.h> 20#include <linux/uuid.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "transaction.h" 22#include "transaction.h"
@@ -271,7 +272,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
271 key.offset++; 272 key.offset++;
272 273
273 root = btrfs_read_fs_root(tree_root, &root_key); 274 root = btrfs_read_fs_root(tree_root, &root_key);
274 err = PTR_RET(root); 275 err = PTR_ERR_OR_ZERO(root);
275 if (err && err != -ENOENT) { 276 if (err && err != -ENOENT) {
276 break; 277 break;
277 } else if (err == -ENOENT) { 278 } else if (err == -ENOENT) {
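The root-tree.c change is purely a rename of the helper: PTR_RET() was the old name for PTR_ERR_OR_ZERO() from <linux/err.h>, hence the added include. For reference, a sketch of what the helper amounts to around the call shown above (err and root as in the hunk):

	root = btrfs_read_fs_root(tree_root, &root_key);
	/* PTR_ERR_OR_ZERO(root) is shorthand for: */
	if (IS_ERR(root))
		err = PTR_ERR(root);	/* negative errno encoded in the pointer */
	else
		err = 0;		/* valid pointer, no error */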
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index efba5d1282ee..93e6d7172844 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -315,6 +315,16 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
315 atomic_inc(&fs_info->scrubs_running); 315 atomic_inc(&fs_info->scrubs_running);
316 atomic_inc(&fs_info->scrubs_paused); 316 atomic_inc(&fs_info->scrubs_paused);
317 mutex_unlock(&fs_info->scrub_lock); 317 mutex_unlock(&fs_info->scrub_lock);
318
 319	/*
 320	 * The check of the @scrubs_running == @scrubs_paused condition
 321	 * inside wait_event() is not an atomic operation, which means
 322	 * we may inc/dec @scrubs_running/paused at any time. Wake up
 323	 * @scrub_pause_wait as much as we can so that the transaction
 324	 * commit is blocked for less time.
 325	 */
326 wake_up(&fs_info->scrub_pause_wait);
327
318 atomic_inc(&sctx->workers_pending); 328 atomic_inc(&sctx->workers_pending);
319} 329}
320 330
@@ -418,7 +428,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
418 sbio->index = i; 428 sbio->index = i;
419 sbio->sctx = sctx; 429 sbio->sctx = sctx;
420 sbio->page_count = 0; 430 sbio->page_count = 0;
421 sbio->work.func = scrub_bio_end_io_worker; 431 btrfs_init_work(&sbio->work, scrub_bio_end_io_worker,
432 NULL, NULL);
422 433
423 if (i != SCRUB_BIOS_PER_SCTX - 1) 434 if (i != SCRUB_BIOS_PER_SCTX - 1)
424 sctx->bios[i]->next_free = i + 1; 435 sctx->bios[i]->next_free = i + 1;
@@ -987,9 +998,10 @@ nodatasum_case:
987 fixup_nodatasum->root = fs_info->extent_root; 998 fixup_nodatasum->root = fs_info->extent_root;
988 fixup_nodatasum->mirror_num = failed_mirror_index + 1; 999 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
989 scrub_pending_trans_workers_inc(sctx); 1000 scrub_pending_trans_workers_inc(sctx);
990 fixup_nodatasum->work.func = scrub_fixup_nodatasum; 1001 btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum,
991 btrfs_queue_worker(&fs_info->scrub_workers, 1002 NULL, NULL);
992 &fixup_nodatasum->work); 1003 btrfs_queue_work(fs_info->scrub_workers,
1004 &fixup_nodatasum->work);
993 goto out; 1005 goto out;
994 } 1006 }
995 1007
@@ -1603,8 +1615,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err)
1603 sbio->err = err; 1615 sbio->err = err;
1604 sbio->bio = bio; 1616 sbio->bio = bio;
1605 1617
1606 sbio->work.func = scrub_wr_bio_end_io_worker; 1618 btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
1607 btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work); 1619 btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1608} 1620}
1609 1621
1610static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) 1622static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
@@ -2072,7 +2084,7 @@ static void scrub_bio_end_io(struct bio *bio, int err)
2072 sbio->err = err; 2084 sbio->err = err;
2073 sbio->bio = bio; 2085 sbio->bio = bio;
2074 2086
2075 btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); 2087 btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2076} 2088}
2077 2089
2078static void scrub_bio_end_io_worker(struct btrfs_work *work) 2090static void scrub_bio_end_io_worker(struct btrfs_work *work)
@@ -2686,10 +2698,23 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2686 2698
2687 wait_event(sctx->list_wait, 2699 wait_event(sctx->list_wait,
2688 atomic_read(&sctx->bios_in_flight) == 0); 2700 atomic_read(&sctx->bios_in_flight) == 0);
2689 atomic_set(&sctx->wr_ctx.flush_all_writes, 0); 2701 atomic_inc(&fs_info->scrubs_paused);
2702 wake_up(&fs_info->scrub_pause_wait);
2703
2704 /*
 2705	 * This must be called before we decrease @scrub_paused.
 2706	 * Make sure we don't block transaction commit while
 2707	 * we are waiting for pending workers to finish.
2708 */
2690 wait_event(sctx->list_wait, 2709 wait_event(sctx->list_wait,
2691 atomic_read(&sctx->workers_pending) == 0); 2710 atomic_read(&sctx->workers_pending) == 0);
2692 scrub_blocked_if_needed(fs_info); 2711 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2712
2713 mutex_lock(&fs_info->scrub_lock);
2714 __scrub_blocked_if_needed(fs_info);
2715 atomic_dec(&fs_info->scrubs_paused);
2716 mutex_unlock(&fs_info->scrub_lock);
2717 wake_up(&fs_info->scrub_pause_wait);
2693 2718
2694 btrfs_put_block_group(cache); 2719 btrfs_put_block_group(cache);
2695 if (ret) 2720 if (ret)
@@ -2757,33 +2782,35 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
2757 int is_dev_replace) 2782 int is_dev_replace)
2758{ 2783{
2759 int ret = 0; 2784 int ret = 0;
2785 int flags = WQ_FREEZABLE | WQ_UNBOUND;
2786 int max_active = fs_info->thread_pool_size;
2760 2787
2761 if (fs_info->scrub_workers_refcnt == 0) { 2788 if (fs_info->scrub_workers_refcnt == 0) {
2762 if (is_dev_replace) 2789 if (is_dev_replace)
2763 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1, 2790 fs_info->scrub_workers =
2764 &fs_info->generic_worker); 2791 btrfs_alloc_workqueue("btrfs-scrub", flags,
2792 1, 4);
2765 else 2793 else
2766 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 2794 fs_info->scrub_workers =
2767 fs_info->thread_pool_size, 2795 btrfs_alloc_workqueue("btrfs-scrub", flags,
2768 &fs_info->generic_worker); 2796 max_active, 4);
2769 fs_info->scrub_workers.idle_thresh = 4; 2797 if (!fs_info->scrub_workers) {
2770 ret = btrfs_start_workers(&fs_info->scrub_workers); 2798 ret = -ENOMEM;
2771 if (ret)
2772 goto out; 2799 goto out;
2773 btrfs_init_workers(&fs_info->scrub_wr_completion_workers, 2800 }
2774 "scrubwrc", 2801 fs_info->scrub_wr_completion_workers =
2775 fs_info->thread_pool_size, 2802 btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
2776 &fs_info->generic_worker); 2803 max_active, 2);
2777 fs_info->scrub_wr_completion_workers.idle_thresh = 2; 2804 if (!fs_info->scrub_wr_completion_workers) {
2778 ret = btrfs_start_workers( 2805 ret = -ENOMEM;
2779 &fs_info->scrub_wr_completion_workers);
2780 if (ret)
2781 goto out; 2806 goto out;
2782 btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1, 2807 }
2783 &fs_info->generic_worker); 2808 fs_info->scrub_nocow_workers =
2784 ret = btrfs_start_workers(&fs_info->scrub_nocow_workers); 2809 btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
2785 if (ret) 2810 if (!fs_info->scrub_nocow_workers) {
2811 ret = -ENOMEM;
2786 goto out; 2812 goto out;
2813 }
2787 } 2814 }
2788 ++fs_info->scrub_workers_refcnt; 2815 ++fs_info->scrub_workers_refcnt;
2789out: 2816out:
@@ -2793,9 +2820,9 @@ out:
2793static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) 2820static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
2794{ 2821{
2795 if (--fs_info->scrub_workers_refcnt == 0) { 2822 if (--fs_info->scrub_workers_refcnt == 0) {
2796 btrfs_stop_workers(&fs_info->scrub_workers); 2823 btrfs_destroy_workqueue(fs_info->scrub_workers);
2797 btrfs_stop_workers(&fs_info->scrub_wr_completion_workers); 2824 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
2798 btrfs_stop_workers(&fs_info->scrub_nocow_workers); 2825 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
2799 } 2826 }
2800 WARN_ON(fs_info->scrub_workers_refcnt < 0); 2827 WARN_ON(fs_info->scrub_workers_refcnt < 0);
2801} 2828}
@@ -3106,10 +3133,10 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3106 nocow_ctx->len = len; 3133 nocow_ctx->len = len;
3107 nocow_ctx->mirror_num = mirror_num; 3134 nocow_ctx->mirror_num = mirror_num;
3108 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; 3135 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3109 nocow_ctx->work.func = copy_nocow_pages_worker; 3136 btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL);
3110 INIT_LIST_HEAD(&nocow_ctx->inodes); 3137 INIT_LIST_HEAD(&nocow_ctx->inodes);
3111 btrfs_queue_worker(&fs_info->scrub_nocow_workers, 3138 btrfs_queue_work(fs_info->scrub_nocow_workers,
3112 &nocow_ctx->work); 3139 &nocow_ctx->work);
3113 3140
3114 return 0; 3141 return 0;
3115} 3142}
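The scrub_workers_get()/scrub_workers_put() hunks above also show the new workqueue lifecycle: btrfs_alloc_workqueue() takes a name, WQ_* flags, a max_active limit and an idle threshold, returns NULL on failure, and btrfs_destroy_workqueue() is the matching teardown call. A condensed sketch of that lifecycle with the names used in the hunk (the other two scrub queues are allocated the same way; error handling shortened):

	int flags = WQ_FREEZABLE | WQ_UNBOUND;
	int max_active = fs_info->thread_pool_size;

	fs_info->scrub_workers =
		btrfs_alloc_workqueue("btrfs-scrub", flags, max_active, 4);
	if (!fs_info->scrub_workers)
		return -ENOMEM;		/* allocation failure */

	/* ... scrub_wr_completion_workers / scrub_nocow_workers likewise ... */

	/* on the last scrub_workers_put(): */
	btrfs_destroy_workqueue(fs_info->scrub_workers);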
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 9dde9717c1b9..9b6da9d55f9a 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -51,15 +51,18 @@ struct fs_path {
51 struct { 51 struct {
52 char *start; 52 char *start;
53 char *end; 53 char *end;
54 char *prepared;
55 54
56 char *buf; 55 char *buf;
57 int buf_len; 56 unsigned short buf_len:15;
58 unsigned int reversed:1; 57 unsigned short reversed:1;
59 unsigned int virtual_mem:1;
60 char inline_buf[]; 58 char inline_buf[];
61 }; 59 };
62 char pad[PAGE_SIZE]; 60 /*
 61	 * Average path length does not exceed 200 bytes, so we'll have
 62	 * better packing in the slab and a higher chance to satisfy
 63	 * an allocation later during send.
64 */
65 char pad[256];
63 }; 66 };
64}; 67};
65#define FS_PATH_INLINE_SIZE \ 68#define FS_PATH_INLINE_SIZE \
@@ -109,6 +112,7 @@ struct send_ctx {
109 int cur_inode_deleted; 112 int cur_inode_deleted;
110 u64 cur_inode_size; 113 u64 cur_inode_size;
111 u64 cur_inode_mode; 114 u64 cur_inode_mode;
115 u64 cur_inode_rdev;
112 u64 cur_inode_last_extent; 116 u64 cur_inode_last_extent;
113 117
114 u64 send_progress; 118 u64 send_progress;
@@ -120,6 +124,8 @@ struct send_ctx {
120 struct list_head name_cache_list; 124 struct list_head name_cache_list;
121 int name_cache_size; 125 int name_cache_size;
122 126
127 struct file_ra_state ra;
128
123 char *read_buf; 129 char *read_buf;
124 130
125 /* 131 /*
@@ -175,6 +181,47 @@ struct send_ctx {
175 * own move/rename can be performed. 181 * own move/rename can be performed.
176 */ 182 */
177 struct rb_root waiting_dir_moves; 183 struct rb_root waiting_dir_moves;
184
185 /*
186 * A directory that is going to be rm'ed might have a child directory
187 * which is in the pending directory moves index above. In this case,
188 * the directory can only be removed after the move/rename of its child
189 * is performed. Example:
190 *
191 * Parent snapshot:
192 *
193 * . (ino 256)
194 * |-- a/ (ino 257)
195 * |-- b/ (ino 258)
196 * |-- c/ (ino 259)
197 * | |-- x/ (ino 260)
198 * |
199 * |-- y/ (ino 261)
200 *
201 * Send snapshot:
202 *
203 * . (ino 256)
204 * |-- a/ (ino 257)
205 * |-- b/ (ino 258)
206 * |-- YY/ (ino 261)
207 * |-- x/ (ino 260)
208 *
209 * Sequence of steps that lead to the send snapshot:
210 * rm -f /a/b/c/foo.txt
211 * mv /a/b/y /a/b/YY
212 * mv /a/b/c/x /a/b/YY
213 * rmdir /a/b/c
214 *
215 * When the child is processed, its move/rename is delayed until its
216 * parent is processed (as explained above), but all other operations
217 * like update utimes, chown, chgrp, etc, are performed and the paths
218 * that it uses for those operations must use the orphanized name of
219 * its parent (the directory we're going to rm later), so we need to
220 * memorize that name.
221 *
222 * Indexed by the inode number of the directory to be deleted.
223 */
224 struct rb_root orphan_dirs;
178}; 225};
179 226
180struct pending_dir_move { 227struct pending_dir_move {
@@ -189,6 +236,18 @@ struct pending_dir_move {
189struct waiting_dir_move { 236struct waiting_dir_move {
190 struct rb_node node; 237 struct rb_node node;
191 u64 ino; 238 u64 ino;
239 /*
240 * There might be some directory that could not be removed because it
241 * was waiting for this directory inode to be moved first. Therefore
 242	 * after this directory is moved, we can try to rmdir the inode rmdir_ino.
243 */
244 u64 rmdir_ino;
245};
246
247struct orphan_dir_info {
248 struct rb_node node;
249 u64 ino;
250 u64 gen;
192}; 251};
193 252
194struct name_cache_entry { 253struct name_cache_entry {
@@ -214,6 +273,11 @@ struct name_cache_entry {
214 273
215static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); 274static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
216 275
276static struct waiting_dir_move *
277get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
278
279static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino);
280
217static int need_send_hole(struct send_ctx *sctx) 281static int need_send_hole(struct send_ctx *sctx)
218{ 282{
219 return (sctx->parent_root && !sctx->cur_inode_new && 283 return (sctx->parent_root && !sctx->cur_inode_new &&
@@ -242,7 +306,6 @@ static struct fs_path *fs_path_alloc(void)
242 if (!p) 306 if (!p)
243 return NULL; 307 return NULL;
244 p->reversed = 0; 308 p->reversed = 0;
245 p->virtual_mem = 0;
246 p->buf = p->inline_buf; 309 p->buf = p->inline_buf;
247 p->buf_len = FS_PATH_INLINE_SIZE; 310 p->buf_len = FS_PATH_INLINE_SIZE;
248 fs_path_reset(p); 311 fs_path_reset(p);
@@ -265,12 +328,8 @@ static void fs_path_free(struct fs_path *p)
265{ 328{
266 if (!p) 329 if (!p)
267 return; 330 return;
268 if (p->buf != p->inline_buf) { 331 if (p->buf != p->inline_buf)
269 if (p->virtual_mem) 332 kfree(p->buf);
270 vfree(p->buf);
271 else
272 kfree(p->buf);
273 }
274 kfree(p); 333 kfree(p);
275} 334}
276 335
@@ -292,40 +351,23 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
292 351
293 path_len = p->end - p->start; 352 path_len = p->end - p->start;
294 old_buf_len = p->buf_len; 353 old_buf_len = p->buf_len;
295 len = PAGE_ALIGN(len); 354
296 355 /*
297 if (p->buf == p->inline_buf) { 356 * First time the inline_buf does not suffice
298 tmp_buf = kmalloc(len, GFP_NOFS | __GFP_NOWARN); 357 */
299 if (!tmp_buf) { 358 if (p->buf == p->inline_buf)
300 tmp_buf = vmalloc(len); 359 tmp_buf = kmalloc(len, GFP_NOFS);
301 if (!tmp_buf) 360 else
302 return -ENOMEM; 361 tmp_buf = krealloc(p->buf, len, GFP_NOFS);
303 p->virtual_mem = 1; 362 if (!tmp_buf)
304 } 363 return -ENOMEM;
305 memcpy(tmp_buf, p->buf, p->buf_len); 364 p->buf = tmp_buf;
306 p->buf = tmp_buf; 365 /*
307 p->buf_len = len; 366 * The real size of the buffer is bigger, this will let the fast path
308 } else { 367 * happen most of the time
309 if (p->virtual_mem) { 368 */
310 tmp_buf = vmalloc(len); 369 p->buf_len = ksize(p->buf);
311 if (!tmp_buf) 370
312 return -ENOMEM;
313 memcpy(tmp_buf, p->buf, p->buf_len);
314 vfree(p->buf);
315 } else {
316 tmp_buf = krealloc(p->buf, len, GFP_NOFS);
317 if (!tmp_buf) {
318 tmp_buf = vmalloc(len);
319 if (!tmp_buf)
320 return -ENOMEM;
321 memcpy(tmp_buf, p->buf, p->buf_len);
322 kfree(p->buf);
323 p->virtual_mem = 1;
324 }
325 }
326 p->buf = tmp_buf;
327 p->buf_len = len;
328 }
329 if (p->reversed) { 371 if (p->reversed) {
330 tmp_buf = p->buf + old_buf_len - path_len - 1; 372 tmp_buf = p->buf + old_buf_len - path_len - 1;
331 p->end = p->buf + p->buf_len - 1; 373 p->end = p->buf + p->buf_len - 1;
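The rewritten fs_path_ensure_buf() above drops the vmalloc fallback entirely and grows the buffer with kmalloc()/krealloc(), then records ksize() of the result: the slab object is usually larger than what was asked for, so remembering its real size lets later small growths take the fast path. A sketch of that allocation pattern in isolation (p stands for the fs_path being grown, as in the hunk):

	char *tmp_buf = krealloc(p->buf, len, GFP_NOFS);
	if (!tmp_buf)
		return -ENOMEM;
	p->buf = tmp_buf;
	/*
	 * ksize() reports the usable size of the slab allocation, which may
	 * exceed the requested len; caching it avoids another krealloc() for
	 * small follow-up growths.
	 */
	p->buf_len = ksize(p->buf);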
@@ -338,7 +380,8 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
338 return 0; 380 return 0;
339} 381}
340 382
341static int fs_path_prepare_for_add(struct fs_path *p, int name_len) 383static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
384 char **prepared)
342{ 385{
343 int ret; 386 int ret;
344 int new_len; 387 int new_len;
@@ -354,11 +397,11 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len)
354 if (p->start != p->end) 397 if (p->start != p->end)
355 *--p->start = '/'; 398 *--p->start = '/';
356 p->start -= name_len; 399 p->start -= name_len;
357 p->prepared = p->start; 400 *prepared = p->start;
358 } else { 401 } else {
359 if (p->start != p->end) 402 if (p->start != p->end)
360 *p->end++ = '/'; 403 *p->end++ = '/';
361 p->prepared = p->end; 404 *prepared = p->end;
362 p->end += name_len; 405 p->end += name_len;
363 *p->end = 0; 406 *p->end = 0;
364 } 407 }
@@ -370,12 +413,12 @@ out:
370static int fs_path_add(struct fs_path *p, const char *name, int name_len) 413static int fs_path_add(struct fs_path *p, const char *name, int name_len)
371{ 414{
372 int ret; 415 int ret;
416 char *prepared;
373 417
374 ret = fs_path_prepare_for_add(p, name_len); 418 ret = fs_path_prepare_for_add(p, name_len, &prepared);
375 if (ret < 0) 419 if (ret < 0)
376 goto out; 420 goto out;
377 memcpy(p->prepared, name, name_len); 421 memcpy(prepared, name, name_len);
378 p->prepared = NULL;
379 422
380out: 423out:
381 return ret; 424 return ret;
@@ -384,12 +427,12 @@ out:
384static int fs_path_add_path(struct fs_path *p, struct fs_path *p2) 427static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
385{ 428{
386 int ret; 429 int ret;
430 char *prepared;
387 431
388 ret = fs_path_prepare_for_add(p, p2->end - p2->start); 432 ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
389 if (ret < 0) 433 if (ret < 0)
390 goto out; 434 goto out;
391 memcpy(p->prepared, p2->start, p2->end - p2->start); 435 memcpy(prepared, p2->start, p2->end - p2->start);
392 p->prepared = NULL;
393 436
394out: 437out:
395 return ret; 438 return ret;
@@ -400,13 +443,13 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p,
400 unsigned long off, int len) 443 unsigned long off, int len)
401{ 444{
402 int ret; 445 int ret;
446 char *prepared;
403 447
404 ret = fs_path_prepare_for_add(p, len); 448 ret = fs_path_prepare_for_add(p, len, &prepared);
405 if (ret < 0) 449 if (ret < 0)
406 goto out; 450 goto out;
407 451
408 read_extent_buffer(eb, p->prepared, off, len); 452 read_extent_buffer(eb, prepared, off, len);
409 p->prepared = NULL;
410 453
411out: 454out:
412 return ret; 455 return ret;
@@ -915,9 +958,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
915 struct btrfs_dir_item *di; 958 struct btrfs_dir_item *di;
916 struct btrfs_key di_key; 959 struct btrfs_key di_key;
917 char *buf = NULL; 960 char *buf = NULL;
918 char *buf2 = NULL; 961 const int buf_len = PATH_MAX;
919 int buf_len;
920 int buf_virtual = 0;
921 u32 name_len; 962 u32 name_len;
922 u32 data_len; 963 u32 data_len;
923 u32 cur; 964 u32 cur;
@@ -927,7 +968,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
927 int num; 968 int num;
928 u8 type; 969 u8 type;
929 970
930 buf_len = PAGE_SIZE;
931 buf = kmalloc(buf_len, GFP_NOFS); 971 buf = kmalloc(buf_len, GFP_NOFS);
932 if (!buf) { 972 if (!buf) {
933 ret = -ENOMEM; 973 ret = -ENOMEM;
@@ -949,30 +989,12 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
949 type = btrfs_dir_type(eb, di); 989 type = btrfs_dir_type(eb, di);
950 btrfs_dir_item_key_to_cpu(eb, di, &di_key); 990 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
951 991
992 /*
993 * Path too long
994 */
952 if (name_len + data_len > buf_len) { 995 if (name_len + data_len > buf_len) {
953 buf_len = PAGE_ALIGN(name_len + data_len); 996 ret = -ENAMETOOLONG;
954 if (buf_virtual) { 997 goto out;
955 buf2 = vmalloc(buf_len);
956 if (!buf2) {
957 ret = -ENOMEM;
958 goto out;
959 }
960 vfree(buf);
961 } else {
962 buf2 = krealloc(buf, buf_len, GFP_NOFS);
963 if (!buf2) {
964 buf2 = vmalloc(buf_len);
965 if (!buf2) {
966 ret = -ENOMEM;
967 goto out;
968 }
969 kfree(buf);
970 buf_virtual = 1;
971 }
972 }
973
974 buf = buf2;
975 buf2 = NULL;
976 } 998 }
977 999
978 read_extent_buffer(eb, buf, (unsigned long)(di + 1), 1000 read_extent_buffer(eb, buf, (unsigned long)(di + 1),
@@ -995,10 +1017,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
995 } 1017 }
996 1018
997out: 1019out:
998 if (buf_virtual) 1020 kfree(buf);
999 vfree(buf);
1000 else
1001 kfree(buf);
1002 return ret; 1021 return ret;
1003} 1022}
1004 1023
@@ -1292,8 +1311,6 @@ static int find_extent_clone(struct send_ctx *sctx,
1292 extent_item_pos = logical - found_key.objectid; 1311 extent_item_pos = logical - found_key.objectid;
1293 else 1312 else
1294 extent_item_pos = 0; 1313 extent_item_pos = 0;
1295
1296 extent_item_pos = logical - found_key.objectid;
1297 ret = iterate_extent_inodes(sctx->send_root->fs_info, 1314 ret = iterate_extent_inodes(sctx->send_root->fs_info,
1298 found_key.objectid, extent_item_pos, 1, 1315 found_key.objectid, extent_item_pos, 1,
1299 __iterate_backrefs, backref_ctx); 1316 __iterate_backrefs, backref_ctx);
@@ -1418,11 +1435,7 @@ static int gen_unique_name(struct send_ctx *sctx,
1418 while (1) { 1435 while (1) {
1419 len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", 1436 len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
1420 ino, gen, idx); 1437 ino, gen, idx);
1421 if (len >= sizeof(tmp)) { 1438 ASSERT(len < sizeof(tmp));
1422 /* should really not happen */
1423 ret = -EOVERFLOW;
1424 goto out;
1425 }
1426 1439
1427 di = btrfs_lookup_dir_item(NULL, sctx->send_root, 1440 di = btrfs_lookup_dir_item(NULL, sctx->send_root,
1428 path, BTRFS_FIRST_FREE_OBJECTID, 1441 path, BTRFS_FIRST_FREE_OBJECTID,
@@ -1898,13 +1911,20 @@ static void name_cache_delete(struct send_ctx *sctx,
1898 1911
1899 nce_head = radix_tree_lookup(&sctx->name_cache, 1912 nce_head = radix_tree_lookup(&sctx->name_cache,
1900 (unsigned long)nce->ino); 1913 (unsigned long)nce->ino);
1901 BUG_ON(!nce_head); 1914 if (!nce_head) {
1915 btrfs_err(sctx->send_root->fs_info,
1916 "name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
1917 nce->ino, sctx->name_cache_size);
1918 }
1902 1919
1903 list_del(&nce->radix_list); 1920 list_del(&nce->radix_list);
1904 list_del(&nce->list); 1921 list_del(&nce->list);
1905 sctx->name_cache_size--; 1922 sctx->name_cache_size--;
1906 1923
1907 if (list_empty(nce_head)) { 1924 /*
1925 * We may not get to the final release of nce_head if the lookup fails
1926 */
1927 if (nce_head && list_empty(nce_head)) {
1908 radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); 1928 radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
1909 kfree(nce_head); 1929 kfree(nce_head);
1910 } 1930 }
@@ -1977,7 +1997,6 @@ static void name_cache_free(struct send_ctx *sctx)
1977 */ 1997 */
1978static int __get_cur_name_and_parent(struct send_ctx *sctx, 1998static int __get_cur_name_and_parent(struct send_ctx *sctx,
1979 u64 ino, u64 gen, 1999 u64 ino, u64 gen,
1980 int skip_name_cache,
1981 u64 *parent_ino, 2000 u64 *parent_ino,
1982 u64 *parent_gen, 2001 u64 *parent_gen,
1983 struct fs_path *dest) 2002 struct fs_path *dest)
@@ -1987,8 +2006,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1987 struct btrfs_path *path = NULL; 2006 struct btrfs_path *path = NULL;
1988 struct name_cache_entry *nce = NULL; 2007 struct name_cache_entry *nce = NULL;
1989 2008
1990 if (skip_name_cache)
1991 goto get_ref;
1992 /* 2009 /*
1993 * First check if we already did a call to this function with the same 2010 * First check if we already did a call to this function with the same
1994 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes 2011 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
@@ -2033,12 +2050,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
2033 goto out_cache; 2050 goto out_cache;
2034 } 2051 }
2035 2052
2036get_ref:
2037 /* 2053 /*
2038 * Depending on whether the inode was already processed or not, use 2054 * Depending on whether the inode was already processed or not, use
2039 * send_root or parent_root for ref lookup. 2055 * send_root or parent_root for ref lookup.
2040 */ 2056 */
2041 if (ino < sctx->send_progress && !skip_name_cache) 2057 if (ino < sctx->send_progress)
2042 ret = get_first_ref(sctx->send_root, ino, 2058 ret = get_first_ref(sctx->send_root, ino,
2043 parent_ino, parent_gen, dest); 2059 parent_ino, parent_gen, dest);
2044 else 2060 else
@@ -2062,8 +2078,6 @@ get_ref:
2062 goto out; 2078 goto out;
2063 ret = 1; 2079 ret = 1;
2064 } 2080 }
2065 if (skip_name_cache)
2066 goto out;
2067 2081
2068out_cache: 2082out_cache:
2069 /* 2083 /*
@@ -2131,9 +2145,6 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2131 u64 parent_inode = 0; 2145 u64 parent_inode = 0;
2132 u64 parent_gen = 0; 2146 u64 parent_gen = 0;
2133 int stop = 0; 2147 int stop = 0;
2134 u64 start_ino = ino;
2135 u64 start_gen = gen;
2136 int skip_name_cache = 0;
2137 2148
2138 name = fs_path_alloc(); 2149 name = fs_path_alloc();
2139 if (!name) { 2150 if (!name) {
@@ -2141,31 +2152,33 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2141 goto out; 2152 goto out;
2142 } 2153 }
2143 2154
2144 if (is_waiting_for_move(sctx, ino))
2145 skip_name_cache = 1;
2146
2147again:
2148 dest->reversed = 1; 2155 dest->reversed = 1;
2149 fs_path_reset(dest); 2156 fs_path_reset(dest);
2150 2157
2151 while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { 2158 while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
2152 fs_path_reset(name); 2159 fs_path_reset(name);
2153 2160
2154 ret = __get_cur_name_and_parent(sctx, ino, gen, skip_name_cache, 2161 if (is_waiting_for_rm(sctx, ino)) {
2155 &parent_inode, &parent_gen, name); 2162 ret = gen_unique_name(sctx, ino, gen, name);
2163 if (ret < 0)
2164 goto out;
2165 ret = fs_path_add_path(dest, name);
2166 break;
2167 }
2168
2169 if (is_waiting_for_move(sctx, ino)) {
2170 ret = get_first_ref(sctx->parent_root, ino,
2171 &parent_inode, &parent_gen, name);
2172 } else {
2173 ret = __get_cur_name_and_parent(sctx, ino, gen,
2174 &parent_inode,
2175 &parent_gen, name);
2176 if (ret)
2177 stop = 1;
2178 }
2179
2156 if (ret < 0) 2180 if (ret < 0)
2157 goto out; 2181 goto out;
2158 if (ret)
2159 stop = 1;
2160
2161 if (!skip_name_cache &&
2162 is_waiting_for_move(sctx, parent_inode)) {
2163 ino = start_ino;
2164 gen = start_gen;
2165 stop = 0;
2166 skip_name_cache = 1;
2167 goto again;
2168 }
2169 2182
2170 ret = fs_path_add_path(dest, name); 2183 ret = fs_path_add_path(dest, name);
2171 if (ret < 0) 2184 if (ret < 0)
@@ -2429,10 +2442,16 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
2429 if (!p) 2442 if (!p)
2430 return -ENOMEM; 2443 return -ENOMEM;
2431 2444
2432 ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL, 2445 if (ino != sctx->cur_ino) {
2433 NULL, &rdev); 2446 ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
2434 if (ret < 0) 2447 NULL, NULL, &rdev);
2435 goto out; 2448 if (ret < 0)
2449 goto out;
2450 } else {
2451 gen = sctx->cur_inode_gen;
2452 mode = sctx->cur_inode_mode;
2453 rdev = sctx->cur_inode_rdev;
2454 }
2436 2455
2437 if (S_ISREG(mode)) { 2456 if (S_ISREG(mode)) {
2438 cmd = BTRFS_SEND_C_MKFILE; 2457 cmd = BTRFS_SEND_C_MKFILE;
@@ -2512,17 +2531,26 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
2512 key.objectid = dir; 2531 key.objectid = dir;
2513 key.type = BTRFS_DIR_INDEX_KEY; 2532 key.type = BTRFS_DIR_INDEX_KEY;
2514 key.offset = 0; 2533 key.offset = 0;
2534 ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
2535 if (ret < 0)
2536 goto out;
2537
2515 while (1) { 2538 while (1) {
2516 ret = btrfs_search_slot_for_read(sctx->send_root, &key, path, 2539 eb = path->nodes[0];
2517 1, 0); 2540 slot = path->slots[0];
2518 if (ret < 0) 2541 if (slot >= btrfs_header_nritems(eb)) {
2519 goto out; 2542 ret = btrfs_next_leaf(sctx->send_root, path);
2520 if (!ret) { 2543 if (ret < 0) {
2521 eb = path->nodes[0]; 2544 goto out;
2522 slot = path->slots[0]; 2545 } else if (ret > 0) {
2523 btrfs_item_key_to_cpu(eb, &found_key, slot); 2546 ret = 0;
2547 break;
2548 }
2549 continue;
2524 } 2550 }
2525 if (ret || found_key.objectid != key.objectid || 2551
2552 btrfs_item_key_to_cpu(eb, &found_key, slot);
2553 if (found_key.objectid != key.objectid ||
2526 found_key.type != key.type) { 2554 found_key.type != key.type) {
2527 ret = 0; 2555 ret = 0;
2528 goto out; 2556 goto out;
@@ -2537,8 +2565,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
2537 goto out; 2565 goto out;
2538 } 2566 }
2539 2567
2540 key.offset = found_key.offset + 1; 2568 path->slots[0]++;
2541 btrfs_release_path(path);
2542 } 2569 }
2543 2570
2544out: 2571out:
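did_create_dir() above, and can_rmdir(), process_all_refs() and process_all_new_xattrs() further down, are all converted from re-searching the tree for every item (btrfs_search_slot_for_read() plus key.offset = found_key.offset + 1 and a path release) to a single btrfs_search_slot() followed by walking the leaves in place. The common shape of the new iteration, as used in these hunks (item processing and error paths trimmed):

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	while (1) {
		eb = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(eb)) {
			ret = btrfs_next_leaf(root, path);	/* advance to next leaf */
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;				/* no more items */
			continue;
		}
		btrfs_item_key_to_cpu(eb, &found_key, slot);
		if (found_key.objectid != key.objectid ||
		    found_key.type != key.type)
			break;
		/* ... handle the item at (eb, slot) ... */
		path->slots[0]++;
	}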
@@ -2590,7 +2617,7 @@ struct recorded_ref {
2590 * everything mixed. So we first record all refs and later process them. 2617 * everything mixed. So we first record all refs and later process them.
2591 * This function is a helper to record one ref. 2618 * This function is a helper to record one ref.
2592 */ 2619 */
2593static int record_ref(struct list_head *head, u64 dir, 2620static int __record_ref(struct list_head *head, u64 dir,
2594 u64 dir_gen, struct fs_path *path) 2621 u64 dir_gen, struct fs_path *path)
2595{ 2622{
2596 struct recorded_ref *ref; 2623 struct recorded_ref *ref;
@@ -2676,12 +2703,78 @@ out:
2676 return ret; 2703 return ret;
2677} 2704}
2678 2705
2706static struct orphan_dir_info *
2707add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
2708{
2709 struct rb_node **p = &sctx->orphan_dirs.rb_node;
2710 struct rb_node *parent = NULL;
2711 struct orphan_dir_info *entry, *odi;
2712
2713 odi = kmalloc(sizeof(*odi), GFP_NOFS);
2714 if (!odi)
2715 return ERR_PTR(-ENOMEM);
2716 odi->ino = dir_ino;
2717 odi->gen = 0;
2718
2719 while (*p) {
2720 parent = *p;
2721 entry = rb_entry(parent, struct orphan_dir_info, node);
2722 if (dir_ino < entry->ino) {
2723 p = &(*p)->rb_left;
2724 } else if (dir_ino > entry->ino) {
2725 p = &(*p)->rb_right;
2726 } else {
2727 kfree(odi);
2728 return entry;
2729 }
2730 }
2731
2732 rb_link_node(&odi->node, parent, p);
2733 rb_insert_color(&odi->node, &sctx->orphan_dirs);
2734 return odi;
2735}
2736
2737static struct orphan_dir_info *
2738get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
2739{
2740 struct rb_node *n = sctx->orphan_dirs.rb_node;
2741 struct orphan_dir_info *entry;
2742
2743 while (n) {
2744 entry = rb_entry(n, struct orphan_dir_info, node);
2745 if (dir_ino < entry->ino)
2746 n = n->rb_left;
2747 else if (dir_ino > entry->ino)
2748 n = n->rb_right;
2749 else
2750 return entry;
2751 }
2752 return NULL;
2753}
2754
2755static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino)
2756{
2757 struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino);
2758
2759 return odi != NULL;
2760}
2761
2762static void free_orphan_dir_info(struct send_ctx *sctx,
2763 struct orphan_dir_info *odi)
2764{
2765 if (!odi)
2766 return;
2767 rb_erase(&odi->node, &sctx->orphan_dirs);
2768 kfree(odi);
2769}
2770
2679/* 2771/*
2680 * Returns 1 if a directory can be removed at this point in time. 2772 * Returns 1 if a directory can be removed at this point in time.
2681 * We check this by iterating all dir items and checking if the inode behind 2773 * We check this by iterating all dir items and checking if the inode behind
2682 * the dir item was already processed. 2774 * the dir item was already processed.
2683 */ 2775 */
2684static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress) 2776static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
2777 u64 send_progress)
2685{ 2778{
2686 int ret = 0; 2779 int ret = 0;
2687 struct btrfs_root *root = sctx->parent_root; 2780 struct btrfs_root *root = sctx->parent_root;
@@ -2704,31 +2797,52 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
2704 key.objectid = dir; 2797 key.objectid = dir;
2705 key.type = BTRFS_DIR_INDEX_KEY; 2798 key.type = BTRFS_DIR_INDEX_KEY;
2706 key.offset = 0; 2799 key.offset = 0;
2800 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2801 if (ret < 0)
2802 goto out;
2707 2803
2708 while (1) { 2804 while (1) {
2709 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 2805 struct waiting_dir_move *dm;
2710 if (ret < 0) 2806
2711 goto out; 2807 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2712 if (!ret) { 2808 ret = btrfs_next_leaf(root, path);
2713 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2809 if (ret < 0)
2714 path->slots[0]); 2810 goto out;
2811 else if (ret > 0)
2812 break;
2813 continue;
2715 } 2814 }
2716 if (ret || found_key.objectid != key.objectid || 2815 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2717 found_key.type != key.type) { 2816 path->slots[0]);
2817 if (found_key.objectid != key.objectid ||
2818 found_key.type != key.type)
2718 break; 2819 break;
2719 }
2720 2820
2721 di = btrfs_item_ptr(path->nodes[0], path->slots[0], 2821 di = btrfs_item_ptr(path->nodes[0], path->slots[0],
2722 struct btrfs_dir_item); 2822 struct btrfs_dir_item);
2723 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); 2823 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
2724 2824
2825 dm = get_waiting_dir_move(sctx, loc.objectid);
2826 if (dm) {
2827 struct orphan_dir_info *odi;
2828
2829 odi = add_orphan_dir_info(sctx, dir);
2830 if (IS_ERR(odi)) {
2831 ret = PTR_ERR(odi);
2832 goto out;
2833 }
2834 odi->gen = dir_gen;
2835 dm->rmdir_ino = dir;
2836 ret = 0;
2837 goto out;
2838 }
2839
2725 if (loc.objectid > send_progress) { 2840 if (loc.objectid > send_progress) {
2726 ret = 0; 2841 ret = 0;
2727 goto out; 2842 goto out;
2728 } 2843 }
2729 2844
2730 btrfs_release_path(path); 2845 path->slots[0]++;
2731 key.offset = found_key.offset + 1;
2732 } 2846 }
2733 2847
2734 ret = 1; 2848 ret = 1;
@@ -2740,19 +2854,9 @@ out:
2740 2854
2741static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) 2855static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
2742{ 2856{
2743 struct rb_node *n = sctx->waiting_dir_moves.rb_node; 2857 struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);
2744 struct waiting_dir_move *entry;
2745 2858
2746 while (n) { 2859 return entry != NULL;
2747 entry = rb_entry(n, struct waiting_dir_move, node);
2748 if (ino < entry->ino)
2749 n = n->rb_left;
2750 else if (ino > entry->ino)
2751 n = n->rb_right;
2752 else
2753 return 1;
2754 }
2755 return 0;
2756} 2860}
2757 2861
2758static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) 2862static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
@@ -2765,6 +2869,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
2765 if (!dm) 2869 if (!dm)
2766 return -ENOMEM; 2870 return -ENOMEM;
2767 dm->ino = ino; 2871 dm->ino = ino;
2872 dm->rmdir_ino = 0;
2768 2873
2769 while (*p) { 2874 while (*p) {
2770 parent = *p; 2875 parent = *p;
@@ -2784,31 +2889,41 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
2784 return 0; 2889 return 0;
2785} 2890}
2786 2891
2787static int del_waiting_dir_move(struct send_ctx *sctx, u64 ino) 2892static struct waiting_dir_move *
2893get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
2788{ 2894{
2789 struct rb_node *n = sctx->waiting_dir_moves.rb_node; 2895 struct rb_node *n = sctx->waiting_dir_moves.rb_node;
2790 struct waiting_dir_move *entry; 2896 struct waiting_dir_move *entry;
2791 2897
2792 while (n) { 2898 while (n) {
2793 entry = rb_entry(n, struct waiting_dir_move, node); 2899 entry = rb_entry(n, struct waiting_dir_move, node);
2794 if (ino < entry->ino) { 2900 if (ino < entry->ino)
2795 n = n->rb_left; 2901 n = n->rb_left;
2796 } else if (ino > entry->ino) { 2902 else if (ino > entry->ino)
2797 n = n->rb_right; 2903 n = n->rb_right;
2798 } else { 2904 else
2799 rb_erase(&entry->node, &sctx->waiting_dir_moves); 2905 return entry;
2800 kfree(entry);
2801 return 0;
2802 }
2803 } 2906 }
2804 return -ENOENT; 2907 return NULL;
2908}
2909
2910static void free_waiting_dir_move(struct send_ctx *sctx,
2911 struct waiting_dir_move *dm)
2912{
2913 if (!dm)
2914 return;
2915 rb_erase(&dm->node, &sctx->waiting_dir_moves);
2916 kfree(dm);
2805} 2917}
2806 2918
2807static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino) 2919static int add_pending_dir_move(struct send_ctx *sctx,
2920 u64 ino,
2921 u64 ino_gen,
2922 u64 parent_ino)
2808{ 2923{
2809 struct rb_node **p = &sctx->pending_dir_moves.rb_node; 2924 struct rb_node **p = &sctx->pending_dir_moves.rb_node;
2810 struct rb_node *parent = NULL; 2925 struct rb_node *parent = NULL;
2811 struct pending_dir_move *entry, *pm; 2926 struct pending_dir_move *entry = NULL, *pm;
2812 struct recorded_ref *cur; 2927 struct recorded_ref *cur;
2813 int exists = 0; 2928 int exists = 0;
2814 int ret; 2929 int ret;
@@ -2817,8 +2932,8 @@ static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino)
2817 if (!pm) 2932 if (!pm)
2818 return -ENOMEM; 2933 return -ENOMEM;
2819 pm->parent_ino = parent_ino; 2934 pm->parent_ino = parent_ino;
2820 pm->ino = sctx->cur_ino; 2935 pm->ino = ino;
2821 pm->gen = sctx->cur_inode_gen; 2936 pm->gen = ino_gen;
2822 INIT_LIST_HEAD(&pm->list); 2937 INIT_LIST_HEAD(&pm->list);
2823 INIT_LIST_HEAD(&pm->update_refs); 2938 INIT_LIST_HEAD(&pm->update_refs);
2824 RB_CLEAR_NODE(&pm->node); 2939 RB_CLEAR_NODE(&pm->node);
@@ -2888,19 +3003,52 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2888{ 3003{
2889 struct fs_path *from_path = NULL; 3004 struct fs_path *from_path = NULL;
2890 struct fs_path *to_path = NULL; 3005 struct fs_path *to_path = NULL;
3006 struct fs_path *name = NULL;
2891 u64 orig_progress = sctx->send_progress; 3007 u64 orig_progress = sctx->send_progress;
2892 struct recorded_ref *cur; 3008 struct recorded_ref *cur;
3009 u64 parent_ino, parent_gen;
3010 struct waiting_dir_move *dm = NULL;
3011 u64 rmdir_ino = 0;
2893 int ret; 3012 int ret;
2894 3013
3014 name = fs_path_alloc();
2895 from_path = fs_path_alloc(); 3015 from_path = fs_path_alloc();
2896 if (!from_path) 3016 if (!name || !from_path) {
2897 return -ENOMEM; 3017 ret = -ENOMEM;
3018 goto out;
3019 }
2898 3020
2899 sctx->send_progress = pm->ino; 3021 dm = get_waiting_dir_move(sctx, pm->ino);
2900 ret = get_cur_path(sctx, pm->ino, pm->gen, from_path); 3022 ASSERT(dm);
3023 rmdir_ino = dm->rmdir_ino;
3024 free_waiting_dir_move(sctx, dm);
3025
3026 ret = get_first_ref(sctx->parent_root, pm->ino,
3027 &parent_ino, &parent_gen, name);
2901 if (ret < 0) 3028 if (ret < 0)
2902 goto out; 3029 goto out;
2903 3030
3031 if (parent_ino == sctx->cur_ino) {
3032 /* child only renamed, not moved */
3033 ASSERT(parent_gen == sctx->cur_inode_gen);
3034 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
3035 from_path);
3036 if (ret < 0)
3037 goto out;
3038 ret = fs_path_add_path(from_path, name);
3039 if (ret < 0)
3040 goto out;
3041 } else {
3042 /* child moved and maybe renamed too */
3043 sctx->send_progress = pm->ino;
3044 ret = get_cur_path(sctx, pm->ino, pm->gen, from_path);
3045 if (ret < 0)
3046 goto out;
3047 }
3048
3049 fs_path_free(name);
3050 name = NULL;
3051
2904 to_path = fs_path_alloc(); 3052 to_path = fs_path_alloc();
2905 if (!to_path) { 3053 if (!to_path) {
2906 ret = -ENOMEM; 3054 ret = -ENOMEM;
@@ -2908,9 +3056,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2908 } 3056 }
2909 3057
2910 sctx->send_progress = sctx->cur_ino + 1; 3058 sctx->send_progress = sctx->cur_ino + 1;
2911 ret = del_waiting_dir_move(sctx, pm->ino);
2912 ASSERT(ret == 0);
2913
2914 ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); 3059 ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
2915 if (ret < 0) 3060 if (ret < 0)
2916 goto out; 3061 goto out;
@@ -2919,6 +3064,35 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2919 if (ret < 0) 3064 if (ret < 0)
2920 goto out; 3065 goto out;
2921 3066
3067 if (rmdir_ino) {
3068 struct orphan_dir_info *odi;
3069
3070 odi = get_orphan_dir_info(sctx, rmdir_ino);
3071 if (!odi) {
3072 /* already deleted */
3073 goto finish;
3074 }
3075 ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1);
3076 if (ret < 0)
3077 goto out;
3078 if (!ret)
3079 goto finish;
3080
3081 name = fs_path_alloc();
3082 if (!name) {
3083 ret = -ENOMEM;
3084 goto out;
3085 }
3086 ret = get_cur_path(sctx, rmdir_ino, odi->gen, name);
3087 if (ret < 0)
3088 goto out;
3089 ret = send_rmdir(sctx, name);
3090 if (ret < 0)
3091 goto out;
3092 free_orphan_dir_info(sctx, odi);
3093 }
3094
3095finish:
2922 ret = send_utimes(sctx, pm->ino, pm->gen); 3096 ret = send_utimes(sctx, pm->ino, pm->gen);
2923 if (ret < 0) 3097 if (ret < 0)
2924 goto out; 3098 goto out;
@@ -2928,12 +3102,15 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2928 * and old parent(s). 3102 * and old parent(s).
2929 */ 3103 */
2930 list_for_each_entry(cur, &pm->update_refs, list) { 3104 list_for_each_entry(cur, &pm->update_refs, list) {
3105 if (cur->dir == rmdir_ino)
3106 continue;
2931 ret = send_utimes(sctx, cur->dir, cur->dir_gen); 3107 ret = send_utimes(sctx, cur->dir, cur->dir_gen);
2932 if (ret < 0) 3108 if (ret < 0)
2933 goto out; 3109 goto out;
2934 } 3110 }
2935 3111
2936out: 3112out:
3113 fs_path_free(name);
2937 fs_path_free(from_path); 3114 fs_path_free(from_path);
2938 fs_path_free(to_path); 3115 fs_path_free(to_path);
2939 sctx->send_progress = orig_progress; 3116 sctx->send_progress = orig_progress;
@@ -3005,17 +3182,19 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3005 int ret; 3182 int ret;
3006 u64 ino = parent_ref->dir; 3183 u64 ino = parent_ref->dir;
3007 u64 parent_ino_before, parent_ino_after; 3184 u64 parent_ino_before, parent_ino_after;
3008 u64 new_gen, old_gen; 3185 u64 old_gen;
3009 struct fs_path *path_before = NULL; 3186 struct fs_path *path_before = NULL;
3010 struct fs_path *path_after = NULL; 3187 struct fs_path *path_after = NULL;
3011 int len1, len2; 3188 int len1, len2;
3012 3189 int register_upper_dirs;
3013 if (parent_ref->dir <= sctx->cur_ino) 3190 u64 gen;
3014 return 0;
3015 3191
3016 if (is_waiting_for_move(sctx, ino)) 3192 if (is_waiting_for_move(sctx, ino))
3017 return 1; 3193 return 1;
3018 3194
3195 if (parent_ref->dir <= sctx->cur_ino)
3196 return 0;
3197
3019 ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen, 3198 ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen,
3020 NULL, NULL, NULL, NULL); 3199 NULL, NULL, NULL, NULL);
3021 if (ret == -ENOENT) 3200 if (ret == -ENOENT)
@@ -3023,12 +3202,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3023 else if (ret < 0) 3202 else if (ret < 0)
3024 return ret; 3203 return ret;
3025 3204
3026 ret = get_inode_info(sctx->send_root, ino, NULL, &new_gen, 3205 if (parent_ref->dir_gen != old_gen)
3027 NULL, NULL, NULL, NULL);
3028 if (ret < 0)
3029 return ret;
3030
3031 if (new_gen != old_gen)
3032 return 0; 3206 return 0;
3033 3207
3034 path_before = fs_path_alloc(); 3208 path_before = fs_path_alloc();
@@ -3051,7 +3225,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3051 } 3225 }
3052 3226
3053 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, 3227 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
3054 NULL, path_after); 3228 &gen, path_after);
3055 if (ret == -ENOENT) { 3229 if (ret == -ENOENT) {
3056 ret = 0; 3230 ret = 0;
3057 goto out; 3231 goto out;
@@ -3061,13 +3235,67 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3061 3235
3062 len1 = fs_path_len(path_before); 3236 len1 = fs_path_len(path_before);
3063 len2 = fs_path_len(path_after); 3237 len2 = fs_path_len(path_after);
3064 if ((parent_ino_before != parent_ino_after) && (len1 != len2 || 3238 if (parent_ino_before != parent_ino_after || len1 != len2 ||
3065 memcmp(path_before->start, path_after->start, len1))) { 3239 memcmp(path_before->start, path_after->start, len1)) {
3066 ret = 1; 3240 ret = 1;
3067 goto out; 3241 goto out;
3068 } 3242 }
3069 ret = 0; 3243 ret = 0;
3070 3244
3245 /*
3246 * Ok, our new most direct ancestor has a higher inode number but
3247 * wasn't moved/renamed. So maybe some of the new ancestors higher in
 3248	 * the hierarchy have a higher inode number too *and* were renamed
3249 * or moved - in this case we need to wait for the ancestor's rename
3250 * or move operation before we can do the move/rename for the current
3251 * inode.
3252 */
3253 register_upper_dirs = 0;
3254 ino = parent_ino_after;
3255again:
3256 while ((ret == 0 || register_upper_dirs) && ino > sctx->cur_ino) {
3257 u64 parent_gen;
3258
3259 fs_path_reset(path_before);
3260 fs_path_reset(path_after);
3261
3262 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
3263 &parent_gen, path_after);
3264 if (ret < 0)
3265 goto out;
3266 ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
3267 NULL, path_before);
3268 if (ret == -ENOENT) {
3269 ret = 0;
3270 break;
3271 } else if (ret < 0) {
3272 goto out;
3273 }
3274
3275 len1 = fs_path_len(path_before);
3276 len2 = fs_path_len(path_after);
3277 if (parent_ino_before != parent_ino_after || len1 != len2 ||
3278 memcmp(path_before->start, path_after->start, len1)) {
3279 ret = 1;
3280 if (register_upper_dirs) {
3281 break;
3282 } else {
3283 register_upper_dirs = 1;
3284 ino = parent_ref->dir;
3285 gen = parent_ref->dir_gen;
3286 goto again;
3287 }
3288 } else if (register_upper_dirs) {
3289 ret = add_pending_dir_move(sctx, ino, gen,
3290 parent_ino_after);
3291 if (ret < 0 && ret != -EEXIST)
3292 goto out;
3293 }
3294
3295 ino = parent_ino_after;
3296 gen = parent_gen;
3297 }
3298
3071out: 3299out:
3072 fs_path_free(path_before); 3300 fs_path_free(path_before);
3073 fs_path_free(path_after); 3301 fs_path_free(path_after);
@@ -3089,6 +3317,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
3089 u64 ow_gen; 3317 u64 ow_gen;
3090 int did_overwrite = 0; 3318 int did_overwrite = 0;
3091 int is_orphan = 0; 3319 int is_orphan = 0;
3320 u64 last_dir_ino_rm = 0;
3092 3321
3093verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); 3322verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3094 3323
@@ -3227,9 +3456,14 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3227 * dirs, we always have one new and one deleted 3456 * dirs, we always have one new and one deleted
3228 * ref. The deleted ref is ignored later. 3457 * ref. The deleted ref is ignored later.
3229 */ 3458 */
3230 if (wait_for_parent_move(sctx, cur)) { 3459 ret = wait_for_parent_move(sctx, cur);
3460 if (ret < 0)
3461 goto out;
3462 if (ret) {
3231 ret = add_pending_dir_move(sctx, 3463 ret = add_pending_dir_move(sctx,
3232 cur->dir); 3464 sctx->cur_ino,
3465 sctx->cur_inode_gen,
3466 cur->dir);
3233 *pending_move = 1; 3467 *pending_move = 1;
3234 } else { 3468 } else {
3235 ret = send_rename(sctx, valid_path, 3469 ret = send_rename(sctx, valid_path,
@@ -3259,7 +3493,8 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3259 * later, we do this check again and rmdir it then if possible. 3493 * later, we do this check again and rmdir it then if possible.
3260 * See the use of check_dirs for more details. 3494 * See the use of check_dirs for more details.
3261 */ 3495 */
3262 ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_ino); 3496 ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen,
3497 sctx->cur_ino);
3263 if (ret < 0) 3498 if (ret < 0)
3264 goto out; 3499 goto out;
3265 if (ret) { 3500 if (ret) {
@@ -3350,8 +3585,10 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3350 ret = send_utimes(sctx, cur->dir, cur->dir_gen); 3585 ret = send_utimes(sctx, cur->dir, cur->dir_gen);
3351 if (ret < 0) 3586 if (ret < 0)
3352 goto out; 3587 goto out;
3353 } else if (ret == inode_state_did_delete) { 3588 } else if (ret == inode_state_did_delete &&
3354 ret = can_rmdir(sctx, cur->dir, sctx->cur_ino); 3589 cur->dir != last_dir_ino_rm) {
3590 ret = can_rmdir(sctx, cur->dir, cur->dir_gen,
3591 sctx->cur_ino);
3355 if (ret < 0) 3592 if (ret < 0)
3356 goto out; 3593 goto out;
3357 if (ret) { 3594 if (ret) {
@@ -3362,6 +3599,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3362 ret = send_rmdir(sctx, valid_path); 3599 ret = send_rmdir(sctx, valid_path);
3363 if (ret < 0) 3600 if (ret < 0)
3364 goto out; 3601 goto out;
3602 last_dir_ino_rm = cur->dir;
3365 } 3603 }
3366 } 3604 }
3367 } 3605 }
@@ -3375,9 +3613,8 @@ out:
3375 return ret; 3613 return ret;
3376} 3614}
3377 3615
3378static int __record_new_ref(int num, u64 dir, int index, 3616static int record_ref(struct btrfs_root *root, int num, u64 dir, int index,
3379 struct fs_path *name, 3617 struct fs_path *name, void *ctx, struct list_head *refs)
3380 void *ctx)
3381{ 3618{
3382 int ret = 0; 3619 int ret = 0;
3383 struct send_ctx *sctx = ctx; 3620 struct send_ctx *sctx = ctx;
@@ -3388,7 +3625,7 @@ static int __record_new_ref(int num, u64 dir, int index,
3388 if (!p) 3625 if (!p)
3389 return -ENOMEM; 3626 return -ENOMEM;
3390 3627
3391 ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL, 3628 ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL,
3392 NULL, NULL); 3629 NULL, NULL);
3393 if (ret < 0) 3630 if (ret < 0)
3394 goto out; 3631 goto out;
@@ -3400,7 +3637,7 @@ static int __record_new_ref(int num, u64 dir, int index,
3400 if (ret < 0) 3637 if (ret < 0)
3401 goto out; 3638 goto out;
3402 3639
3403 ret = record_ref(&sctx->new_refs, dir, gen, p); 3640 ret = __record_ref(refs, dir, gen, p);
3404 3641
3405out: 3642out:
3406 if (ret) 3643 if (ret)
@@ -3408,37 +3645,23 @@ out:
3408 return ret; 3645 return ret;
3409} 3646}
3410 3647
3648static int __record_new_ref(int num, u64 dir, int index,
3649 struct fs_path *name,
3650 void *ctx)
3651{
3652 struct send_ctx *sctx = ctx;
3653 return record_ref(sctx->send_root, num, dir, index, name,
3654 ctx, &sctx->new_refs);
3655}
3656
3657
3411static int __record_deleted_ref(int num, u64 dir, int index, 3658static int __record_deleted_ref(int num, u64 dir, int index,
3412 struct fs_path *name, 3659 struct fs_path *name,
3413 void *ctx) 3660 void *ctx)
3414{ 3661{
3415 int ret = 0;
3416 struct send_ctx *sctx = ctx; 3662 struct send_ctx *sctx = ctx;
3417 struct fs_path *p; 3663 return record_ref(sctx->parent_root, num, dir, index, name,
3418 u64 gen; 3664 ctx, &sctx->deleted_refs);
3419
3420 p = fs_path_alloc();
3421 if (!p)
3422 return -ENOMEM;
3423
3424 ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL,
3425 NULL, NULL);
3426 if (ret < 0)
3427 goto out;
3428
3429 ret = get_cur_path(sctx, dir, gen, p);
3430 if (ret < 0)
3431 goto out;
3432 ret = fs_path_add_path(p, name);
3433 if (ret < 0)
3434 goto out;
3435
3436 ret = record_ref(&sctx->deleted_refs, dir, gen, p);
3437
3438out:
3439 if (ret)
3440 fs_path_free(p);
3441 return ret;
3442} 3665}
3443 3666
3444static int record_new_ref(struct send_ctx *sctx) 3667static int record_new_ref(struct send_ctx *sctx)
@@ -3619,21 +3842,31 @@ static int process_all_refs(struct send_ctx *sctx,
3619 root = sctx->parent_root; 3842 root = sctx->parent_root;
3620 cb = __record_deleted_ref; 3843 cb = __record_deleted_ref;
3621 } else { 3844 } else {
3622 BUG(); 3845 btrfs_err(sctx->send_root->fs_info,
3846 "Wrong command %d in process_all_refs", cmd);
3847 ret = -EINVAL;
3848 goto out;
3623 } 3849 }
3624 3850
3625 key.objectid = sctx->cmp_key->objectid; 3851 key.objectid = sctx->cmp_key->objectid;
3626 key.type = BTRFS_INODE_REF_KEY; 3852 key.type = BTRFS_INODE_REF_KEY;
3627 key.offset = 0; 3853 key.offset = 0;
3628 while (1) { 3854 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3629 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 3855 if (ret < 0)
3630 if (ret < 0) 3856 goto out;
3631 goto out;
3632 if (ret)
3633 break;
3634 3857
3858 while (1) {
3635 eb = path->nodes[0]; 3859 eb = path->nodes[0];
3636 slot = path->slots[0]; 3860 slot = path->slots[0];
3861 if (slot >= btrfs_header_nritems(eb)) {
3862 ret = btrfs_next_leaf(root, path);
3863 if (ret < 0)
3864 goto out;
3865 else if (ret > 0)
3866 break;
3867 continue;
3868 }
3869
3637 btrfs_item_key_to_cpu(eb, &found_key, slot); 3870 btrfs_item_key_to_cpu(eb, &found_key, slot);
3638 3871
3639 if (found_key.objectid != key.objectid || 3872 if (found_key.objectid != key.objectid ||
@@ -3642,11 +3875,10 @@ static int process_all_refs(struct send_ctx *sctx,
3642 break; 3875 break;
3643 3876
3644 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); 3877 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
3645 btrfs_release_path(path);
3646 if (ret < 0) 3878 if (ret < 0)
3647 goto out; 3879 goto out;
3648 3880
3649 key.offset = found_key.offset + 1; 3881 path->slots[0]++;
3650 } 3882 }
3651 btrfs_release_path(path); 3883 btrfs_release_path(path);
3652 3884
@@ -3927,19 +4159,25 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
3927 key.objectid = sctx->cmp_key->objectid; 4159 key.objectid = sctx->cmp_key->objectid;
3928 key.type = BTRFS_XATTR_ITEM_KEY; 4160 key.type = BTRFS_XATTR_ITEM_KEY;
3929 key.offset = 0; 4161 key.offset = 0;
3930 while (1) { 4162 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3931 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 4163 if (ret < 0)
3932 if (ret < 0) 4164 goto out;
3933 goto out;
3934 if (ret) {
3935 ret = 0;
3936 goto out;
3937 }
3938 4165
4166 while (1) {
3939 eb = path->nodes[0]; 4167 eb = path->nodes[0];
3940 slot = path->slots[0]; 4168 slot = path->slots[0];
3941 btrfs_item_key_to_cpu(eb, &found_key, slot); 4169 if (slot >= btrfs_header_nritems(eb)) {
4170 ret = btrfs_next_leaf(root, path);
4171 if (ret < 0) {
4172 goto out;
4173 } else if (ret > 0) {
4174 ret = 0;
4175 break;
4176 }
4177 continue;
4178 }
3942 4179
4180 btrfs_item_key_to_cpu(eb, &found_key, slot);
3943 if (found_key.objectid != key.objectid || 4181 if (found_key.objectid != key.objectid ||
3944 found_key.type != key.type) { 4182 found_key.type != key.type) {
3945 ret = 0; 4183 ret = 0;
@@ -3951,8 +4189,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
3951 if (ret < 0) 4189 if (ret < 0)
3952 goto out; 4190 goto out;
3953 4191
3954 btrfs_release_path(path); 4192 path->slots[0]++;
3955 key.offset = found_key.offset + 1;
3956 } 4193 }
3957 4194
3958out: 4195out:
@@ -3991,6 +4228,13 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
3991 goto out; 4228 goto out;
3992 4229
3993 last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT; 4230 last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT;
4231
4232 /* initial readahead */
4233 memset(&sctx->ra, 0, sizeof(struct file_ra_state));
4234 file_ra_state_init(&sctx->ra, inode->i_mapping);
4235 btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index,
4236 last_index - index + 1);
4237
3994 while (index <= last_index) { 4238 while (index <= last_index) {
3995 unsigned cur_len = min_t(unsigned, len, 4239 unsigned cur_len = min_t(unsigned, len,
3996 PAGE_CACHE_SIZE - pg_offset); 4240 PAGE_CACHE_SIZE - pg_offset);
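
fill_read_buf() now primes readahead across the whole requested range before pulling pages one at a time, instead of relying on per-page heuristics. A small sketch of that setup; the two calls are the ones the hunk adds, the wrapper function is illustrative:

static void prime_readahead_sketch(struct inode *inode,
				   struct file_ra_state *ra,
				   pgoff_t index, pgoff_t last_index)
{
	/* start from a clean readahead state tied to this mapping */
	memset(ra, 0, sizeof(*ra));
	file_ra_state_init(ra, inode->i_mapping);

	/* kick off readahead for the whole [index, last_index] window */
	btrfs_force_ra(inode->i_mapping, ra, NULL, index,
		       last_index - index + 1);
}
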
@@ -4763,18 +5007,19 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
4763 ret = apply_children_dir_moves(sctx); 5007 ret = apply_children_dir_moves(sctx);
4764 if (ret) 5008 if (ret)
4765 goto out; 5009 goto out;
5010 /*
5011 * Need to send that every time, no matter if it actually
5012 * changed between the two trees as we have done changes to
5013 * the inode before. If our inode is a directory and it's
5014 * waiting to be moved/renamed, we will send its utimes when
5015 * it's moved/renamed, therefore we don't need to do it here.
5016 */
5017 sctx->send_progress = sctx->cur_ino + 1;
5018 ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
5019 if (ret < 0)
5020 goto out;
4766 } 5021 }
4767 5022
4768 /*
4769 * Need to send that every time, no matter if it actually
4770 * changed between the two trees as we have done changes to
4771 * the inode before.
4772 */
4773 sctx->send_progress = sctx->cur_ino + 1;
4774 ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
4775 if (ret < 0)
4776 goto out;
4777
4778out: 5023out:
4779 return ret; 5024 return ret;
4780} 5025}
@@ -4840,6 +5085,8 @@ static int changed_inode(struct send_ctx *sctx,
4840 sctx->left_path->nodes[0], left_ii); 5085 sctx->left_path->nodes[0], left_ii);
4841 sctx->cur_inode_mode = btrfs_inode_mode( 5086 sctx->cur_inode_mode = btrfs_inode_mode(
4842 sctx->left_path->nodes[0], left_ii); 5087 sctx->left_path->nodes[0], left_ii);
5088 sctx->cur_inode_rdev = btrfs_inode_rdev(
5089 sctx->left_path->nodes[0], left_ii);
4843 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) 5090 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
4844 ret = send_create_inode_if_needed(sctx); 5091 ret = send_create_inode_if_needed(sctx);
4845 } else if (result == BTRFS_COMPARE_TREE_DELETED) { 5092 } else if (result == BTRFS_COMPARE_TREE_DELETED) {
@@ -4884,6 +5131,8 @@ static int changed_inode(struct send_ctx *sctx,
4884 sctx->left_path->nodes[0], left_ii); 5131 sctx->left_path->nodes[0], left_ii);
4885 sctx->cur_inode_mode = btrfs_inode_mode( 5132 sctx->cur_inode_mode = btrfs_inode_mode(
4886 sctx->left_path->nodes[0], left_ii); 5133 sctx->left_path->nodes[0], left_ii);
5134 sctx->cur_inode_rdev = btrfs_inode_rdev(
5135 sctx->left_path->nodes[0], left_ii);
4887 ret = send_create_inode_if_needed(sctx); 5136 ret = send_create_inode_if_needed(sctx);
4888 if (ret < 0) 5137 if (ret < 0)
4889 goto out; 5138 goto out;
@@ -5118,6 +5367,7 @@ out:
5118static int full_send_tree(struct send_ctx *sctx) 5367static int full_send_tree(struct send_ctx *sctx)
5119{ 5368{
5120 int ret; 5369 int ret;
5370 struct btrfs_trans_handle *trans = NULL;
5121 struct btrfs_root *send_root = sctx->send_root; 5371 struct btrfs_root *send_root = sctx->send_root;
5122 struct btrfs_key key; 5372 struct btrfs_key key;
5123 struct btrfs_key found_key; 5373 struct btrfs_key found_key;
@@ -5139,6 +5389,19 @@ static int full_send_tree(struct send_ctx *sctx)
5139 key.type = BTRFS_INODE_ITEM_KEY; 5389 key.type = BTRFS_INODE_ITEM_KEY;
5140 key.offset = 0; 5390 key.offset = 0;
5141 5391
5392join_trans:
5393 /*
5394 * We need to make sure the transaction does not get committed
5395 * while we do anything on commit roots. Join a transaction to prevent
5396 * this.
5397 */
5398 trans = btrfs_join_transaction(send_root);
5399 if (IS_ERR(trans)) {
5400 ret = PTR_ERR(trans);
5401 trans = NULL;
5402 goto out;
5403 }
5404
5142 /* 5405 /*
5143 * Make sure the tree has not changed after re-joining. We detect this 5406 * Make sure the tree has not changed after re-joining. We detect this
5144 * by comparing start_ctransid and ctransid. They should always match. 5407 * by comparing start_ctransid and ctransid. They should always match.
@@ -5162,6 +5425,19 @@ static int full_send_tree(struct send_ctx *sctx)
5162 goto out_finish; 5425 goto out_finish;
5163 5426
5164 while (1) { 5427 while (1) {
5428 /*
5429 * When someone wants to commit while we iterate, end the
5430 * joined transaction and rejoin.
5431 */
5432 if (btrfs_should_end_transaction(trans, send_root)) {
5433 ret = btrfs_end_transaction(trans, send_root);
5434 trans = NULL;
5435 if (ret < 0)
5436 goto out;
5437 btrfs_release_path(path);
5438 goto join_trans;
5439 }
5440
5165 eb = path->nodes[0]; 5441 eb = path->nodes[0];
5166 slot = path->slots[0]; 5442 slot = path->slots[0];
5167 btrfs_item_key_to_cpu(eb, &found_key, slot); 5443 btrfs_item_key_to_cpu(eb, &found_key, slot);
@@ -5189,6 +5465,12 @@ out_finish:
5189 5465
5190out: 5466out:
5191 btrfs_free_path(path); 5467 btrfs_free_path(path);
5468 if (trans) {
5469 if (!ret)
5470 ret = btrfs_end_transaction(trans, send_root);
5471 else
5472 btrfs_end_transaction(trans, send_root);
5473 }
5192 return ret; 5474 return ret;
5193} 5475}
5194 5476
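
full_send_tree() now keeps a joined transaction open while it walks the commit root and steps aside whenever a commit wants to happen. A reduced sketch of that join/walk/rejoin loop; the tree-walking body is elided and the error handling is simpler than in the patch:

static int walk_commit_root_sketch(struct btrfs_root *root,
				   struct btrfs_path *path)
{
	struct btrfs_trans_handle *trans;
	int ret;

join_trans:
	/* keep the current transaction from committing under us */
	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	while (1) {
		if (btrfs_should_end_transaction(trans, root)) {
			/* someone wants to commit: step aside and rejoin */
			ret = btrfs_end_transaction(trans, root);
			if (ret < 0)
				return ret;
			btrfs_release_path(path);
			goto join_trans;
		}

		/* ... visit path->nodes[0]/path->slots[0], advance, or ... */
		break;	/* ... stop when the walk is finished */
	}

	return btrfs_end_transaction(trans, root);
}

Ending the handle before rejoining is what lets a pending commit make progress instead of stalling behind a long-running send.
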
@@ -5340,6 +5622,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5340 5622
5341 sctx->pending_dir_moves = RB_ROOT; 5623 sctx->pending_dir_moves = RB_ROOT;
5342 sctx->waiting_dir_moves = RB_ROOT; 5624 sctx->waiting_dir_moves = RB_ROOT;
5625 sctx->orphan_dirs = RB_ROOT;
5343 5626
5344 sctx->clone_roots = vzalloc(sizeof(struct clone_root) * 5627 sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
5345 (arg->clone_sources_count + 1)); 5628 (arg->clone_sources_count + 1));
@@ -5477,6 +5760,16 @@ out:
5477 kfree(dm); 5760 kfree(dm);
5478 } 5761 }
5479 5762
5763 WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
5764 while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
5765 struct rb_node *n;
5766 struct orphan_dir_info *odi;
5767
5768 n = rb_first(&sctx->orphan_dirs);
5769 odi = rb_entry(n, struct orphan_dir_info, node);
5770 free_orphan_dir_info(sctx, odi);
5771 }
5772
5480 if (sort_clone_roots) { 5773 if (sort_clone_roots) {
5481 for (i = 0; i < sctx->clone_roots_cnt; i++) 5774 for (i = 0; i < sctx->clone_roots_cnt; i++)
5482 btrfs_root_dec_send_in_progress( 5775 btrfs_root_dec_send_in_progress(
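
The send context now tracks orphan directories in an rbtree (sctx->orphan_dirs), and the ioctl teardown above drains whatever is left in it. As a standalone helper the drain loop would look roughly like this, assuming (as the patch does) that free_orphan_dir_info() unlinks the node from the tree:

static void drain_orphan_dirs_sketch(struct send_ctx *sctx, int ret)
{
	/* on a clean run nothing should be left behind */
	WARN_ON(!ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));

	while (!RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
		struct rb_node *n = rb_first(&sctx->orphan_dirs);
		struct orphan_dir_info *odi;

		odi = rb_entry(n, struct orphan_dir_info, node);
		/* removes odi from the tree and frees it */
		free_orphan_dir_info(sctx, odi);
	}
}
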
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d04db817be5c..d4878ddba87a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1305,13 +1305,6 @@ error_fs_info:
1305 return ERR_PTR(error); 1305 return ERR_PTR(error);
1306} 1306}
1307 1307
1308static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit)
1309{
1310 spin_lock_irq(&workers->lock);
1311 workers->max_workers = new_limit;
1312 spin_unlock_irq(&workers->lock);
1313}
1314
1315static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, 1308static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1316 int new_pool_size, int old_pool_size) 1309 int new_pool_size, int old_pool_size)
1317{ 1310{
@@ -1323,21 +1316,20 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1323 btrfs_info(fs_info, "resize thread pool %d -> %d", 1316 btrfs_info(fs_info, "resize thread pool %d -> %d",
1324 old_pool_size, new_pool_size); 1317 old_pool_size, new_pool_size);
1325 1318
1326 btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size); 1319 btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
1327 btrfs_set_max_workers(&fs_info->workers, new_pool_size); 1320 btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
1328 btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size); 1321 btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size);
1329 btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size); 1322 btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
1330 btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size); 1323 btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
1331 btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size); 1324 btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
1332 btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size); 1325 btrfs_workqueue_set_max(fs_info->endio_meta_write_workers,
1333 btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size); 1326 new_pool_size);
1334 btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size); 1327 btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
1335 btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size); 1328 btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
1336 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); 1329 btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
1337 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); 1330 btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size);
1338 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); 1331 btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
1339 btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers, 1332 new_pool_size);
1340 new_pool_size);
1341} 1333}
1342 1334
1343static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info) 1335static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info)
@@ -1479,6 +1471,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1479 sb->s_flags &= ~MS_RDONLY; 1471 sb->s_flags &= ~MS_RDONLY;
1480 } 1472 }
1481out: 1473out:
1474 wake_up_process(fs_info->transaction_kthread);
1482 btrfs_remount_cleanup(fs_info, old_opts); 1475 btrfs_remount_cleanup(fs_info, old_opts);
1483 return 0; 1476 return 0;
1484 1477
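
With struct btrfs_workers gone, thread-pool resizing is funneled through btrfs_workqueue_set_max() on the new wrapper objects. The wrapper's internals are not part of this hunk, so the following is only a plausible shape, assuming it holds one or two ordinary kernel workqueues and forwards the limit with workqueue_set_max_active():

#include <linux/workqueue.h>

/* assumed layout, for illustration only */
struct btrfs_workqueue_sketch {
	struct workqueue_struct *normal_wq;
	struct workqueue_struct *high_wq;	/* may be NULL */
	int max_active;
};

static void workqueue_set_max_sketch(struct btrfs_workqueue_sketch *wq,
				     int new_limit)
{
	wq->max_active = new_limit;
	workqueue_set_max_active(wq->normal_wq, new_limit);
	if (wq->high_wq)
		workqueue_set_max_active(wq->high_wq, new_limit);
}
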
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 865f4cf9a769..c5eb2143dc66 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -24,6 +24,7 @@
24#include <linux/kobject.h> 24#include <linux/kobject.h>
25#include <linux/bug.h> 25#include <linux/bug.h>
26#include <linux/genhd.h> 26#include <linux/genhd.h>
27#include <linux/debugfs.h>
27 28
28#include "ctree.h" 29#include "ctree.h"
29#include "disk-io.h" 30#include "disk-io.h"
@@ -599,6 +600,12 @@ static int add_device_membership(struct btrfs_fs_info *fs_info)
599/* /sys/fs/btrfs/ entry */ 600/* /sys/fs/btrfs/ entry */
600static struct kset *btrfs_kset; 601static struct kset *btrfs_kset;
601 602
603/* /sys/kernel/debug/btrfs */
604static struct dentry *btrfs_debugfs_root_dentry;
605
606/* Debugging tunables and exported data */
607u64 btrfs_debugfs_test;
608
602int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) 609int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
603{ 610{
604 int error; 611 int error;
@@ -642,27 +649,41 @@ failure:
642 return error; 649 return error;
643} 650}
644 651
652static int btrfs_init_debugfs(void)
653{
654#ifdef CONFIG_DEBUG_FS
655 btrfs_debugfs_root_dentry = debugfs_create_dir("btrfs", NULL);
656 if (!btrfs_debugfs_root_dentry)
657 return -ENOMEM;
658
659 debugfs_create_u64("test", S_IRUGO | S_IWUGO, btrfs_debugfs_root_dentry,
660 &btrfs_debugfs_test);
661#endif
662 return 0;
663}
664
645int btrfs_init_sysfs(void) 665int btrfs_init_sysfs(void)
646{ 666{
647 int ret; 667 int ret;
668
648 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); 669 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
649 if (!btrfs_kset) 670 if (!btrfs_kset)
650 return -ENOMEM; 671 return -ENOMEM;
651 672
652 init_feature_attrs(); 673 ret = btrfs_init_debugfs();
674 if (ret)
675 return ret;
653 676
677 init_feature_attrs();
654 ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); 678 ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
655 if (ret) {
656 kset_unregister(btrfs_kset);
657 return ret;
658 }
659 679
660 return 0; 680 return ret;
661} 681}
662 682
663void btrfs_exit_sysfs(void) 683void btrfs_exit_sysfs(void)
664{ 684{
665 sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); 685 sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
666 kset_unregister(btrfs_kset); 686 kset_unregister(btrfs_kset);
687 debugfs_remove_recursive(btrfs_debugfs_root_dentry);
667} 688}
668 689
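
btrfs_init_sysfs() now also creates a debugfs directory with one u64 tunable, and btrfs_exit_sysfs() tears it down recursively. A self-contained sketch of that pairing using the same standard debugfs calls as the hunk; names prefixed with sketch_ are illustrative:

#include <linux/debugfs.h>

static struct dentry *sketch_debugfs_root;
static u64 sketch_debugfs_test;

/* create /sys/kernel/debug/btrfs/ and a single u64 knob under it */
static int sketch_debugfs_init(void)
{
#ifdef CONFIG_DEBUG_FS
	sketch_debugfs_root = debugfs_create_dir("btrfs", NULL);
	if (!sketch_debugfs_root)
		return -ENOMEM;
	debugfs_create_u64("test", S_IRUGO | S_IWUGO,
			   sketch_debugfs_root, &sketch_debugfs_test);
#endif
	return 0;
}

/* removes the directory and every file created under it; NULL is a no-op */
static void sketch_debugfs_exit(void)
{
	debugfs_remove_recursive(sketch_debugfs_root);
}
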
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index f3cea3710d44..9ab576318a84 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -1,6 +1,11 @@
1#ifndef _BTRFS_SYSFS_H_ 1#ifndef _BTRFS_SYSFS_H_
2#define _BTRFS_SYSFS_H_ 2#define _BTRFS_SYSFS_H_
3 3
4/*
5 * Data exported through sysfs
6 */
7extern u64 btrfs_debugfs_test;
8
4enum btrfs_feature_set { 9enum btrfs_feature_set {
5 FEAT_COMPAT, 10 FEAT_COMPAT,
6 FEAT_COMPAT_RO, 11 FEAT_COMPAT_RO,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 34cd83184c4a..a04707f740d6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -683,7 +683,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
683 int lock = (trans->type != TRANS_JOIN_NOLOCK); 683 int lock = (trans->type != TRANS_JOIN_NOLOCK);
684 int err = 0; 684 int err = 0;
685 685
686 if (--trans->use_count) { 686 if (trans->use_count > 1) {
687 trans->use_count--;
687 trans->block_rsv = trans->orig_rsv; 688 trans->block_rsv = trans->orig_rsv;
688 return 0; 689 return 0;
689 } 690 }
@@ -731,17 +732,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
731 } 732 }
732 733
733 if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) { 734 if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
734 if (throttle) { 735 if (throttle)
735 /*
736 * We may race with somebody else here so end up having
737 * to call end_transaction on ourselves again, so inc
738 * our use_count.
739 */
740 trans->use_count++;
741 return btrfs_commit_transaction(trans, root); 736 return btrfs_commit_transaction(trans, root);
742 } else { 737 else
743 wake_up_process(info->transaction_kthread); 738 wake_up_process(info->transaction_kthread);
744 }
745 } 739 }
746 740
747 if (trans->type & __TRANS_FREEZABLE) 741 if (trans->type & __TRANS_FREEZABLE)
@@ -1578,10 +1572,9 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1578 1572
1579 trace_btrfs_transaction_commit(root); 1573 trace_btrfs_transaction_commit(root);
1580 1574
1581 btrfs_scrub_continue(root);
1582
1583 if (current->journal_info == trans) 1575 if (current->journal_info == trans)
1584 current->journal_info = NULL; 1576 current->journal_info = NULL;
1577 btrfs_scrub_cancel(root->fs_info);
1585 1578
1586 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1579 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1587} 1580}
@@ -1621,7 +1614,7 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1621static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) 1614static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
1622{ 1615{
1623 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) 1616 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
1624 return btrfs_start_delalloc_roots(fs_info, 1); 1617 return btrfs_start_delalloc_roots(fs_info, 1, -1);
1625 return 0; 1618 return 0;
1626} 1619}
1627 1620
@@ -1754,7 +1747,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1754 /* ->aborted might be set after the previous check, so check it */ 1747 /* ->aborted might be set after the previous check, so check it */
1755 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { 1748 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
1756 ret = cur_trans->aborted; 1749 ret = cur_trans->aborted;
1757 goto cleanup_transaction; 1750 goto scrub_continue;
1758 } 1751 }
1759 /* 1752 /*
1760 * the reloc mutex makes sure that we stop 1753 * the reloc mutex makes sure that we stop
@@ -1771,7 +1764,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1771 ret = create_pending_snapshots(trans, root->fs_info); 1764 ret = create_pending_snapshots(trans, root->fs_info);
1772 if (ret) { 1765 if (ret) {
1773 mutex_unlock(&root->fs_info->reloc_mutex); 1766 mutex_unlock(&root->fs_info->reloc_mutex);
1774 goto cleanup_transaction; 1767 goto scrub_continue;
1775 } 1768 }
1776 1769
1777 /* 1770 /*
@@ -1787,13 +1780,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1787 ret = btrfs_run_delayed_items(trans, root); 1780 ret = btrfs_run_delayed_items(trans, root);
1788 if (ret) { 1781 if (ret) {
1789 mutex_unlock(&root->fs_info->reloc_mutex); 1782 mutex_unlock(&root->fs_info->reloc_mutex);
1790 goto cleanup_transaction; 1783 goto scrub_continue;
1791 } 1784 }
1792 1785
1793 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); 1786 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1794 if (ret) { 1787 if (ret) {
1795 mutex_unlock(&root->fs_info->reloc_mutex); 1788 mutex_unlock(&root->fs_info->reloc_mutex);
1796 goto cleanup_transaction; 1789 goto scrub_continue;
1797 } 1790 }
1798 1791
1799 /* 1792 /*
@@ -1823,7 +1816,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1823 if (ret) { 1816 if (ret) {
1824 mutex_unlock(&root->fs_info->tree_log_mutex); 1817 mutex_unlock(&root->fs_info->tree_log_mutex);
1825 mutex_unlock(&root->fs_info->reloc_mutex); 1818 mutex_unlock(&root->fs_info->reloc_mutex);
1826 goto cleanup_transaction; 1819 goto scrub_continue;
1827 } 1820 }
1828 1821
1829 /* 1822 /*
@@ -1844,7 +1837,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1844 if (ret) { 1837 if (ret) {
1845 mutex_unlock(&root->fs_info->tree_log_mutex); 1838 mutex_unlock(&root->fs_info->tree_log_mutex);
1846 mutex_unlock(&root->fs_info->reloc_mutex); 1839 mutex_unlock(&root->fs_info->reloc_mutex);
1847 goto cleanup_transaction; 1840 goto scrub_continue;
1848 } 1841 }
1849 1842
1850 /* 1843 /*
@@ -1855,7 +1848,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1855 ret = cur_trans->aborted; 1848 ret = cur_trans->aborted;
1856 mutex_unlock(&root->fs_info->tree_log_mutex); 1849 mutex_unlock(&root->fs_info->tree_log_mutex);
1857 mutex_unlock(&root->fs_info->reloc_mutex); 1850 mutex_unlock(&root->fs_info->reloc_mutex);
1858 goto cleanup_transaction; 1851 goto scrub_continue;
1859 } 1852 }
1860 1853
1861 btrfs_prepare_extent_commit(trans, root); 1854 btrfs_prepare_extent_commit(trans, root);
@@ -1891,13 +1884,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1891 btrfs_error(root->fs_info, ret, 1884 btrfs_error(root->fs_info, ret,
1892 "Error while writing out transaction"); 1885 "Error while writing out transaction");
1893 mutex_unlock(&root->fs_info->tree_log_mutex); 1886 mutex_unlock(&root->fs_info->tree_log_mutex);
1894 goto cleanup_transaction; 1887 goto scrub_continue;
1895 } 1888 }
1896 1889
1897 ret = write_ctree_super(trans, root, 0); 1890 ret = write_ctree_super(trans, root, 0);
1898 if (ret) { 1891 if (ret) {
1899 mutex_unlock(&root->fs_info->tree_log_mutex); 1892 mutex_unlock(&root->fs_info->tree_log_mutex);
1900 goto cleanup_transaction; 1893 goto scrub_continue;
1901 } 1894 }
1902 1895
1903 /* 1896 /*
@@ -1940,6 +1933,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1940 1933
1941 return ret; 1934 return ret;
1942 1935
1936scrub_continue:
1937 btrfs_scrub_continue(root);
1943cleanup_transaction: 1938cleanup_transaction:
1944 btrfs_trans_release_metadata(trans, root); 1939 btrfs_trans_release_metadata(trans, root);
1945 trans->block_rsv = NULL; 1940 trans->block_rsv = NULL;
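
The later commit error paths now unwind through a scrub_continue label because cleanup_transaction() no longer resumes scrub itself (it cancels it instead). The resulting layering, sketched with placeholder steps; only btrfs_scrub_pause() and btrfs_scrub_continue() are real helpers here, and where exactly the pause happens in the real function is an assumption:

static int commit_shape_sketch(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	int ret = 0;

	/* ... work done before scrub is paused (placeholder) ... */
	if (ret)
		goto cleanup;		/* scrub untouched: plain cleanup */

	btrfs_scrub_pause(root);

	/* ... commit-critical work (placeholder) ... */
	if (ret)
		goto scrub_continue;	/* must let scrub resume first */

	btrfs_scrub_continue(root);
	return 0;

scrub_continue:
	btrfs_scrub_continue(root);
cleanup:
	/* reservations released, scrub cancelled, handle freed here */
	return ret;
}
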
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 39d83da03e03..e2f45fc02610 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -136,13 +136,20 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
136 * syncing the tree wait for us to finish 136 * syncing the tree wait for us to finish
137 */ 137 */
138static int start_log_trans(struct btrfs_trans_handle *trans, 138static int start_log_trans(struct btrfs_trans_handle *trans,
139 struct btrfs_root *root) 139 struct btrfs_root *root,
140 struct btrfs_log_ctx *ctx)
140{ 141{
142 int index;
141 int ret; 143 int ret;
142 int err = 0;
143 144
144 mutex_lock(&root->log_mutex); 145 mutex_lock(&root->log_mutex);
145 if (root->log_root) { 146 if (root->log_root) {
147 if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
148 trans->transid) {
149 ret = -EAGAIN;
150 goto out;
151 }
152
146 if (!root->log_start_pid) { 153 if (!root->log_start_pid) {
147 root->log_start_pid = current->pid; 154 root->log_start_pid = current->pid;
148 root->log_multiple_pids = false; 155 root->log_multiple_pids = false;
@@ -152,27 +159,40 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
152 159
153 atomic_inc(&root->log_batch); 160 atomic_inc(&root->log_batch);
154 atomic_inc(&root->log_writers); 161 atomic_inc(&root->log_writers);
162 if (ctx) {
163 index = root->log_transid % 2;
164 list_add_tail(&ctx->list, &root->log_ctxs[index]);
165 ctx->log_transid = root->log_transid;
166 }
155 mutex_unlock(&root->log_mutex); 167 mutex_unlock(&root->log_mutex);
156 return 0; 168 return 0;
157 } 169 }
158 root->log_multiple_pids = false; 170
159 root->log_start_pid = current->pid; 171 ret = 0;
160 mutex_lock(&root->fs_info->tree_log_mutex); 172 mutex_lock(&root->fs_info->tree_log_mutex);
161 if (!root->fs_info->log_root_tree) { 173 if (!root->fs_info->log_root_tree)
162 ret = btrfs_init_log_root_tree(trans, root->fs_info); 174 ret = btrfs_init_log_root_tree(trans, root->fs_info);
163 if (ret) 175 mutex_unlock(&root->fs_info->tree_log_mutex);
164 err = ret; 176 if (ret)
165 } 177 goto out;
166 if (err == 0 && !root->log_root) { 178
179 if (!root->log_root) {
167 ret = btrfs_add_log_tree(trans, root); 180 ret = btrfs_add_log_tree(trans, root);
168 if (ret) 181 if (ret)
169 err = ret; 182 goto out;
170 } 183 }
171 mutex_unlock(&root->fs_info->tree_log_mutex); 184 root->log_multiple_pids = false;
185 root->log_start_pid = current->pid;
172 atomic_inc(&root->log_batch); 186 atomic_inc(&root->log_batch);
173 atomic_inc(&root->log_writers); 187 atomic_inc(&root->log_writers);
188 if (ctx) {
189 index = root->log_transid % 2;
190 list_add_tail(&ctx->list, &root->log_ctxs[index]);
191 ctx->log_transid = root->log_transid;
192 }
193out:
174 mutex_unlock(&root->log_mutex); 194 mutex_unlock(&root->log_mutex);
175 return err; 195 return ret;
176} 196}
177 197
178/* 198/*
@@ -2359,8 +2379,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
2359 return ret; 2379 return ret;
2360} 2380}
2361 2381
2362static int wait_log_commit(struct btrfs_trans_handle *trans, 2382static void wait_log_commit(struct btrfs_trans_handle *trans,
2363 struct btrfs_root *root, unsigned long transid) 2383 struct btrfs_root *root, int transid)
2364{ 2384{
2365 DEFINE_WAIT(wait); 2385 DEFINE_WAIT(wait);
2366 int index = transid % 2; 2386 int index = transid % 2;
@@ -2375,36 +2395,63 @@ static int wait_log_commit(struct btrfs_trans_handle *trans,
2375 &wait, TASK_UNINTERRUPTIBLE); 2395 &wait, TASK_UNINTERRUPTIBLE);
2376 mutex_unlock(&root->log_mutex); 2396 mutex_unlock(&root->log_mutex);
2377 2397
2378 if (root->fs_info->last_trans_log_full_commit != 2398 if (root->log_transid_committed < transid &&
2379 trans->transid && root->log_transid < transid + 2 &&
2380 atomic_read(&root->log_commit[index])) 2399 atomic_read(&root->log_commit[index]))
2381 schedule(); 2400 schedule();
2382 2401
2383 finish_wait(&root->log_commit_wait[index], &wait); 2402 finish_wait(&root->log_commit_wait[index], &wait);
2384 mutex_lock(&root->log_mutex); 2403 mutex_lock(&root->log_mutex);
2385 } while (root->fs_info->last_trans_log_full_commit != 2404 } while (root->log_transid_committed < transid &&
2386 trans->transid && root->log_transid < transid + 2 &&
2387 atomic_read(&root->log_commit[index])); 2405 atomic_read(&root->log_commit[index]));
2388 return 0;
2389} 2406}
2390 2407
2391static void wait_for_writer(struct btrfs_trans_handle *trans, 2408static void wait_for_writer(struct btrfs_trans_handle *trans,
2392 struct btrfs_root *root) 2409 struct btrfs_root *root)
2393{ 2410{
2394 DEFINE_WAIT(wait); 2411 DEFINE_WAIT(wait);
2395 while (root->fs_info->last_trans_log_full_commit != 2412
2396 trans->transid && atomic_read(&root->log_writers)) { 2413 while (atomic_read(&root->log_writers)) {
2397 prepare_to_wait(&root->log_writer_wait, 2414 prepare_to_wait(&root->log_writer_wait,
2398 &wait, TASK_UNINTERRUPTIBLE); 2415 &wait, TASK_UNINTERRUPTIBLE);
2399 mutex_unlock(&root->log_mutex); 2416 mutex_unlock(&root->log_mutex);
2400 if (root->fs_info->last_trans_log_full_commit != 2417 if (atomic_read(&root->log_writers))
2401 trans->transid && atomic_read(&root->log_writers))
2402 schedule(); 2418 schedule();
2403 mutex_lock(&root->log_mutex); 2419 mutex_lock(&root->log_mutex);
2404 finish_wait(&root->log_writer_wait, &wait); 2420 finish_wait(&root->log_writer_wait, &wait);
2405 } 2421 }
2406} 2422}
2407 2423
2424static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2425 struct btrfs_log_ctx *ctx)
2426{
2427 if (!ctx)
2428 return;
2429
2430 mutex_lock(&root->log_mutex);
2431 list_del_init(&ctx->list);
2432 mutex_unlock(&root->log_mutex);
2433}
2434
2435/*
2436 * Invoked in log mutex context, or when we are sure no other task
2437 * can access the list.
2438 */
2439static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2440 int index, int error)
2441{
2442 struct btrfs_log_ctx *ctx;
2443
2444 if (!error) {
2445 INIT_LIST_HEAD(&root->log_ctxs[index]);
2446 return;
2447 }
2448
2449 list_for_each_entry(ctx, &root->log_ctxs[index], list)
2450 ctx->log_ret = error;
2451
2452 INIT_LIST_HEAD(&root->log_ctxs[index]);
2453}
2454
2408/* 2455/*
2409 * btrfs_sync_log sends a given tree log down to the disk and 2456 * btrfs_sync_log sends a given tree log down to the disk and
2410 * updates the super blocks to record it. When this call is done, 2457 * updates the super blocks to record it. When this call is done,
@@ -2418,7 +2465,7 @@ static void wait_for_writer(struct btrfs_trans_handle *trans,
2418 * that has happened. 2465 * that has happened.
2419 */ 2466 */
2420int btrfs_sync_log(struct btrfs_trans_handle *trans, 2467int btrfs_sync_log(struct btrfs_trans_handle *trans,
2421 struct btrfs_root *root) 2468 struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2422{ 2469{
2423 int index1; 2470 int index1;
2424 int index2; 2471 int index2;
@@ -2426,22 +2473,30 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2426 int ret; 2473 int ret;
2427 struct btrfs_root *log = root->log_root; 2474 struct btrfs_root *log = root->log_root;
2428 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 2475 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
2429 unsigned long log_transid = 0; 2476 int log_transid = 0;
2477 struct btrfs_log_ctx root_log_ctx;
2430 struct blk_plug plug; 2478 struct blk_plug plug;
2431 2479
2432 mutex_lock(&root->log_mutex); 2480 mutex_lock(&root->log_mutex);
2433 log_transid = root->log_transid; 2481 log_transid = ctx->log_transid;
2434 index1 = root->log_transid % 2; 2482 if (root->log_transid_committed >= log_transid) {
2483 mutex_unlock(&root->log_mutex);
2484 return ctx->log_ret;
2485 }
2486
2487 index1 = log_transid % 2;
2435 if (atomic_read(&root->log_commit[index1])) { 2488 if (atomic_read(&root->log_commit[index1])) {
2436 wait_log_commit(trans, root, root->log_transid); 2489 wait_log_commit(trans, root, log_transid);
2437 mutex_unlock(&root->log_mutex); 2490 mutex_unlock(&root->log_mutex);
2438 return 0; 2491 return ctx->log_ret;
2439 } 2492 }
2493 ASSERT(log_transid == root->log_transid);
2440 atomic_set(&root->log_commit[index1], 1); 2494 atomic_set(&root->log_commit[index1], 1);
2441 2495
2442 /* wait for previous tree log sync to complete */ 2496 /* wait for previous tree log sync to complete */
2443 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2497 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2444 wait_log_commit(trans, root, root->log_transid - 1); 2498 wait_log_commit(trans, root, log_transid - 1);
2499
2445 while (1) { 2500 while (1) {
2446 int batch = atomic_read(&root->log_batch); 2501 int batch = atomic_read(&root->log_batch);
2447 /* when we're on an ssd, just kick the log commit out */ 2502 /* when we're on an ssd, just kick the log commit out */
@@ -2456,7 +2511,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2456 } 2511 }
2457 2512
2458 /* bail out if we need to do a full commit */ 2513 /* bail out if we need to do a full commit */
2459 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2514 if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
2515 trans->transid) {
2460 ret = -EAGAIN; 2516 ret = -EAGAIN;
2461 btrfs_free_logged_extents(log, log_transid); 2517 btrfs_free_logged_extents(log, log_transid);
2462 mutex_unlock(&root->log_mutex); 2518 mutex_unlock(&root->log_mutex);
@@ -2477,6 +2533,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2477 blk_finish_plug(&plug); 2533 blk_finish_plug(&plug);
2478 btrfs_abort_transaction(trans, root, ret); 2534 btrfs_abort_transaction(trans, root, ret);
2479 btrfs_free_logged_extents(log, log_transid); 2535 btrfs_free_logged_extents(log, log_transid);
2536 ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
2537 trans->transid;
2480 mutex_unlock(&root->log_mutex); 2538 mutex_unlock(&root->log_mutex);
2481 goto out; 2539 goto out;
2482 } 2540 }
@@ -2486,7 +2544,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2486 root->log_transid++; 2544 root->log_transid++;
2487 log->log_transid = root->log_transid; 2545 log->log_transid = root->log_transid;
2488 root->log_start_pid = 0; 2546 root->log_start_pid = 0;
2489 smp_mb();
2490 /* 2547 /*
2491 * IO has been started, blocks of the log tree have WRITTEN flag set 2548 * IO has been started, blocks of the log tree have WRITTEN flag set
2492 * in their headers. new modifications of the log will be written to 2549 * in their headers. new modifications of the log will be written to
@@ -2494,9 +2551,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2494 */ 2551 */
2495 mutex_unlock(&root->log_mutex); 2552 mutex_unlock(&root->log_mutex);
2496 2553
2554 btrfs_init_log_ctx(&root_log_ctx);
2555
2497 mutex_lock(&log_root_tree->log_mutex); 2556 mutex_lock(&log_root_tree->log_mutex);
2498 atomic_inc(&log_root_tree->log_batch); 2557 atomic_inc(&log_root_tree->log_batch);
2499 atomic_inc(&log_root_tree->log_writers); 2558 atomic_inc(&log_root_tree->log_writers);
2559
2560 index2 = log_root_tree->log_transid % 2;
2561 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
2562 root_log_ctx.log_transid = log_root_tree->log_transid;
2563
2500 mutex_unlock(&log_root_tree->log_mutex); 2564 mutex_unlock(&log_root_tree->log_mutex);
2501 2565
2502 ret = update_log_root(trans, log); 2566 ret = update_log_root(trans, log);
@@ -2509,13 +2573,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2509 } 2573 }
2510 2574
2511 if (ret) { 2575 if (ret) {
2576 if (!list_empty(&root_log_ctx.list))
2577 list_del_init(&root_log_ctx.list);
2578
2512 blk_finish_plug(&plug); 2579 blk_finish_plug(&plug);
2580 ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
2581 trans->transid;
2513 if (ret != -ENOSPC) { 2582 if (ret != -ENOSPC) {
2514 btrfs_abort_transaction(trans, root, ret); 2583 btrfs_abort_transaction(trans, root, ret);
2515 mutex_unlock(&log_root_tree->log_mutex); 2584 mutex_unlock(&log_root_tree->log_mutex);
2516 goto out; 2585 goto out;
2517 } 2586 }
2518 root->fs_info->last_trans_log_full_commit = trans->transid;
2519 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2587 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2520 btrfs_free_logged_extents(log, log_transid); 2588 btrfs_free_logged_extents(log, log_transid);
2521 mutex_unlock(&log_root_tree->log_mutex); 2589 mutex_unlock(&log_root_tree->log_mutex);
@@ -2523,22 +2591,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2523 goto out; 2591 goto out;
2524 } 2592 }
2525 2593
2526 index2 = log_root_tree->log_transid % 2; 2594 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
2595 mutex_unlock(&log_root_tree->log_mutex);
2596 ret = root_log_ctx.log_ret;
2597 goto out;
2598 }
2599
2600 index2 = root_log_ctx.log_transid % 2;
2527 if (atomic_read(&log_root_tree->log_commit[index2])) { 2601 if (atomic_read(&log_root_tree->log_commit[index2])) {
2528 blk_finish_plug(&plug); 2602 blk_finish_plug(&plug);
2529 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2603 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2530 wait_log_commit(trans, log_root_tree, 2604 wait_log_commit(trans, log_root_tree,
2531 log_root_tree->log_transid); 2605 root_log_ctx.log_transid);
2532 btrfs_free_logged_extents(log, log_transid); 2606 btrfs_free_logged_extents(log, log_transid);
2533 mutex_unlock(&log_root_tree->log_mutex); 2607 mutex_unlock(&log_root_tree->log_mutex);
2534 ret = 0; 2608 ret = root_log_ctx.log_ret;
2535 goto out; 2609 goto out;
2536 } 2610 }
2611 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
2537 atomic_set(&log_root_tree->log_commit[index2], 1); 2612 atomic_set(&log_root_tree->log_commit[index2], 1);
2538 2613
2539 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 2614 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
2540 wait_log_commit(trans, log_root_tree, 2615 wait_log_commit(trans, log_root_tree,
2541 log_root_tree->log_transid - 1); 2616 root_log_ctx.log_transid - 1);
2542 } 2617 }
2543 2618
2544 wait_for_writer(trans, log_root_tree); 2619 wait_for_writer(trans, log_root_tree);
@@ -2547,7 +2622,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2547 * now that we've moved on to the tree of log tree roots, 2622 * now that we've moved on to the tree of log tree roots,
2548 * check the full commit flag again 2623 * check the full commit flag again
2549 */ 2624 */
2550 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2625 if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
2626 trans->transid) {
2551 blk_finish_plug(&plug); 2627 blk_finish_plug(&plug);
2552 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2628 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2553 btrfs_free_logged_extents(log, log_transid); 2629 btrfs_free_logged_extents(log, log_transid);
@@ -2561,6 +2637,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2561 EXTENT_DIRTY | EXTENT_NEW); 2637 EXTENT_DIRTY | EXTENT_NEW);
2562 blk_finish_plug(&plug); 2638 blk_finish_plug(&plug);
2563 if (ret) { 2639 if (ret) {
2640 ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
2641 trans->transid;
2564 btrfs_abort_transaction(trans, root, ret); 2642 btrfs_abort_transaction(trans, root, ret);
2565 btrfs_free_logged_extents(log, log_transid); 2643 btrfs_free_logged_extents(log, log_transid);
2566 mutex_unlock(&log_root_tree->log_mutex); 2644 mutex_unlock(&log_root_tree->log_mutex);
@@ -2578,8 +2656,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2578 btrfs_header_level(log_root_tree->node)); 2656 btrfs_header_level(log_root_tree->node));
2579 2657
2580 log_root_tree->log_transid++; 2658 log_root_tree->log_transid++;
2581 smp_mb();
2582
2583 mutex_unlock(&log_root_tree->log_mutex); 2659 mutex_unlock(&log_root_tree->log_mutex);
2584 2660
2585 /* 2661 /*
@@ -2591,6 +2667,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2591 */ 2667 */
2592 ret = write_ctree_super(trans, root->fs_info->tree_root, 1); 2668 ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
2593 if (ret) { 2669 if (ret) {
2670 ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
2671 trans->transid;
2594 btrfs_abort_transaction(trans, root, ret); 2672 btrfs_abort_transaction(trans, root, ret);
2595 goto out_wake_log_root; 2673 goto out_wake_log_root;
2596 } 2674 }
@@ -2601,13 +2679,28 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2601 mutex_unlock(&root->log_mutex); 2679 mutex_unlock(&root->log_mutex);
2602 2680
2603out_wake_log_root: 2681out_wake_log_root:
2682 /*
2683 * We needn't get log_mutex here because we are sure all
2684 * the other tasks are blocked.
2685 */
2686 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
2687
2688 mutex_lock(&log_root_tree->log_mutex);
2689 log_root_tree->log_transid_committed++;
2604 atomic_set(&log_root_tree->log_commit[index2], 0); 2690 atomic_set(&log_root_tree->log_commit[index2], 0);
2605 smp_mb(); 2691 mutex_unlock(&log_root_tree->log_mutex);
2692
2606 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2693 if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
2607 wake_up(&log_root_tree->log_commit_wait[index2]); 2694 wake_up(&log_root_tree->log_commit_wait[index2]);
2608out: 2695out:
2696 /* See above. */
2697 btrfs_remove_all_log_ctxs(root, index1, ret);
2698
2699 mutex_lock(&root->log_mutex);
2700 root->log_transid_committed++;
2609 atomic_set(&root->log_commit[index1], 0); 2701 atomic_set(&root->log_commit[index1], 0);
2610 smp_mb(); 2702 mutex_unlock(&root->log_mutex);
2703
2611 if (waitqueue_active(&root->log_commit_wait[index1])) 2704 if (waitqueue_active(&root->log_commit_wait[index1]))
2612 wake_up(&root->log_commit_wait[index1]); 2705 wake_up(&root->log_commit_wait[index1]);
2613 return ret; 2706 return ret;
@@ -3479,7 +3572,8 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3479 3572
3480static int log_one_extent(struct btrfs_trans_handle *trans, 3573static int log_one_extent(struct btrfs_trans_handle *trans,
3481 struct inode *inode, struct btrfs_root *root, 3574 struct inode *inode, struct btrfs_root *root,
3482 struct extent_map *em, struct btrfs_path *path) 3575 struct extent_map *em, struct btrfs_path *path,
3576 struct list_head *logged_list)
3483{ 3577{
3484 struct btrfs_root *log = root->log_root; 3578 struct btrfs_root *log = root->log_root;
3485 struct btrfs_file_extent_item *fi; 3579 struct btrfs_file_extent_item *fi;
@@ -3495,7 +3589,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3495 u64 extent_offset = em->start - em->orig_start; 3589 u64 extent_offset = em->start - em->orig_start;
3496 u64 block_len; 3590 u64 block_len;
3497 int ret; 3591 int ret;
3498 int index = log->log_transid % 2;
3499 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3592 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3500 int extent_inserted = 0; 3593 int extent_inserted = 0;
3501 3594
@@ -3579,17 +3672,12 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3579 * First check and see if our csums are on our outstanding ordered 3672 * First check and see if our csums are on our outstanding ordered
3580 * extents. 3673 * extents.
3581 */ 3674 */
3582again: 3675 list_for_each_entry(ordered, logged_list, log_list) {
3583 spin_lock_irq(&log->log_extents_lock[index]);
3584 list_for_each_entry(ordered, &log->logged_list[index], log_list) {
3585 struct btrfs_ordered_sum *sum; 3676 struct btrfs_ordered_sum *sum;
3586 3677
3587 if (!mod_len) 3678 if (!mod_len)
3588 break; 3679 break;
3589 3680
3590 if (ordered->inode != inode)
3591 continue;
3592
3593 if (ordered->file_offset + ordered->len <= mod_start || 3681 if (ordered->file_offset + ordered->len <= mod_start ||
3594 mod_start + mod_len <= ordered->file_offset) 3682 mod_start + mod_len <= ordered->file_offset)
3595 continue; 3683 continue;
@@ -3632,12 +3720,6 @@ again:
3632 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, 3720 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
3633 &ordered->flags)) 3721 &ordered->flags))
3634 continue; 3722 continue;
3635 atomic_inc(&ordered->refs);
3636 spin_unlock_irq(&log->log_extents_lock[index]);
3637 /*
3638 * we've dropped the lock, we must either break or
3639 * start over after this.
3640 */
3641 3723
3642 if (ordered->csum_bytes_left) { 3724 if (ordered->csum_bytes_left) {
3643 btrfs_start_ordered_extent(inode, ordered, 0); 3725 btrfs_start_ordered_extent(inode, ordered, 0);
@@ -3647,16 +3729,11 @@ again:
3647 3729
3648 list_for_each_entry(sum, &ordered->list, list) { 3730 list_for_each_entry(sum, &ordered->list, list) {
3649 ret = btrfs_csum_file_blocks(trans, log, sum); 3731 ret = btrfs_csum_file_blocks(trans, log, sum);
3650 if (ret) { 3732 if (ret)
3651 btrfs_put_ordered_extent(ordered);
3652 goto unlocked; 3733 goto unlocked;
3653 }
3654 } 3734 }
3655 btrfs_put_ordered_extent(ordered);
3656 goto again;
3657 3735
3658 } 3736 }
3659 spin_unlock_irq(&log->log_extents_lock[index]);
3660unlocked: 3737unlocked:
3661 3738
3662 if (!mod_len || ret) 3739 if (!mod_len || ret)
@@ -3694,7 +3771,8 @@ unlocked:
3694static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 3771static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3695 struct btrfs_root *root, 3772 struct btrfs_root *root,
3696 struct inode *inode, 3773 struct inode *inode,
3697 struct btrfs_path *path) 3774 struct btrfs_path *path,
3775 struct list_head *logged_list)
3698{ 3776{
3699 struct extent_map *em, *n; 3777 struct extent_map *em, *n;
3700 struct list_head extents; 3778 struct list_head extents;
@@ -3752,7 +3830,7 @@ process:
3752 3830
3753 write_unlock(&tree->lock); 3831 write_unlock(&tree->lock);
3754 3832
3755 ret = log_one_extent(trans, inode, root, em, path); 3833 ret = log_one_extent(trans, inode, root, em, path, logged_list);
3756 write_lock(&tree->lock); 3834 write_lock(&tree->lock);
3757 clear_em_logging(tree, em); 3835 clear_em_logging(tree, em);
3758 free_extent_map(em); 3836 free_extent_map(em);
@@ -3788,6 +3866,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3788 struct btrfs_key max_key; 3866 struct btrfs_key max_key;
3789 struct btrfs_root *log = root->log_root; 3867 struct btrfs_root *log = root->log_root;
3790 struct extent_buffer *src = NULL; 3868 struct extent_buffer *src = NULL;
3869 LIST_HEAD(logged_list);
3791 u64 last_extent = 0; 3870 u64 last_extent = 0;
3792 int err = 0; 3871 int err = 0;
3793 int ret; 3872 int ret;
@@ -3836,7 +3915,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3836 3915
3837 mutex_lock(&BTRFS_I(inode)->log_mutex); 3916 mutex_lock(&BTRFS_I(inode)->log_mutex);
3838 3917
3839 btrfs_get_logged_extents(log, inode); 3918 btrfs_get_logged_extents(inode, &logged_list);
3840 3919
3841 /* 3920 /*
3842 * a brute force approach to making sure we get the most uptodate 3921 * a brute force approach to making sure we get the most uptodate
@@ -3962,7 +4041,8 @@ log_extents:
3962 btrfs_release_path(path); 4041 btrfs_release_path(path);
3963 btrfs_release_path(dst_path); 4042 btrfs_release_path(dst_path);
3964 if (fast_search) { 4043 if (fast_search) {
3965 ret = btrfs_log_changed_extents(trans, root, inode, dst_path); 4044 ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
4045 &logged_list);
3966 if (ret) { 4046 if (ret) {
3967 err = ret; 4047 err = ret;
3968 goto out_unlock; 4048 goto out_unlock;
@@ -3987,8 +4067,10 @@ log_extents:
3987 BTRFS_I(inode)->logged_trans = trans->transid; 4067 BTRFS_I(inode)->logged_trans = trans->transid;
3988 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; 4068 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
3989out_unlock: 4069out_unlock:
3990 if (err) 4070 if (unlikely(err))
3991 btrfs_free_logged_extents(log, log->log_transid); 4071 btrfs_put_logged_extents(&logged_list);
4072 else
4073 btrfs_submit_logged_extents(&logged_list, log);
3992 mutex_unlock(&BTRFS_I(inode)->log_mutex); 4074 mutex_unlock(&BTRFS_I(inode)->log_mutex);
3993 4075
3994 btrfs_free_path(path); 4076 btrfs_free_path(path);
@@ -4079,7 +4161,8 @@ out:
4079 */ 4161 */
4080static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 4162static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4081 struct btrfs_root *root, struct inode *inode, 4163 struct btrfs_root *root, struct inode *inode,
4082 struct dentry *parent, int exists_only) 4164 struct dentry *parent, int exists_only,
4165 struct btrfs_log_ctx *ctx)
4083{ 4166{
4084 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 4167 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
4085 struct super_block *sb; 4168 struct super_block *sb;
@@ -4116,9 +4199,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4116 goto end_no_trans; 4199 goto end_no_trans;
4117 } 4200 }
4118 4201
4119 ret = start_log_trans(trans, root); 4202 ret = start_log_trans(trans, root, ctx);
4120 if (ret) 4203 if (ret)
4121 goto end_trans; 4204 goto end_no_trans;
4122 4205
4123 ret = btrfs_log_inode(trans, root, inode, inode_only); 4206 ret = btrfs_log_inode(trans, root, inode, inode_only);
4124 if (ret) 4207 if (ret)
@@ -4166,6 +4249,9 @@ end_trans:
4166 root->fs_info->last_trans_log_full_commit = trans->transid; 4249 root->fs_info->last_trans_log_full_commit = trans->transid;
4167 ret = 1; 4250 ret = 1;
4168 } 4251 }
4252
4253 if (ret)
4254 btrfs_remove_log_ctx(root, ctx);
4169 btrfs_end_log_trans(root); 4255 btrfs_end_log_trans(root);
4170end_no_trans: 4256end_no_trans:
4171 return ret; 4257 return ret;
@@ -4178,12 +4264,14 @@ end_no_trans:
4178 * data on disk. 4264 * data on disk.
4179 */ 4265 */
4180int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 4266int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
4181 struct btrfs_root *root, struct dentry *dentry) 4267 struct btrfs_root *root, struct dentry *dentry,
4268 struct btrfs_log_ctx *ctx)
4182{ 4269{
4183 struct dentry *parent = dget_parent(dentry); 4270 struct dentry *parent = dget_parent(dentry);
4184 int ret; 4271 int ret;
4185 4272
4186 ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0); 4273 ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
4274 0, ctx);
4187 dput(parent); 4275 dput(parent);
4188 4276
4189 return ret; 4277 return ret;
@@ -4420,6 +4508,6 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
4420 root->fs_info->last_trans_committed)) 4508 root->fs_info->last_trans_committed))
4421 return 0; 4509 return 0;
4422 4510
4423 return btrfs_log_inode_parent(trans, root, inode, parent, 1); 4511 return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL);
4424} 4512}
4425 4513
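
The extent-logging hunks above move from the log root's global logged_list[index] (guarded by log_extents_lock) to a private list that btrfs_log_inode() collects up front and disposes of when it is done. Reduced to just that list lifetime, the flow is roughly the following; the wrapper function is illustrative, the four helpers are the ones called in the hunks:

static int log_inode_extents_sketch(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct inode *inode,
				    struct btrfs_path *dst_path)
{
	struct btrfs_root *log = root->log_root;
	LIST_HEAD(logged_list);
	int err;

	/* snapshot this inode's ordered extents into a private list */
	btrfs_get_logged_extents(inode, &logged_list);

	err = btrfs_log_changed_extents(trans, root, inode, dst_path,
					&logged_list);

	if (err)
		btrfs_put_logged_extents(&logged_list);	/* just drop the refs */
	else
		btrfs_submit_logged_extents(&logged_list, log);
	return err;
}

Because the list is private to the call, log_one_extent() no longer needs the lock/unlock-and-retry dance the old shared list required.
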
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 1d4ae0d15a70..91b145fce333 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -22,14 +22,28 @@
22/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ 22/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
23#define BTRFS_NO_LOG_SYNC 256 23#define BTRFS_NO_LOG_SYNC 256
24 24
25struct btrfs_log_ctx {
26 int log_ret;
27 int log_transid;
28 struct list_head list;
29};
30
31static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
32{
33 ctx->log_ret = 0;
34 ctx->log_transid = 0;
35 INIT_LIST_HEAD(&ctx->list);
36}
37
25int btrfs_sync_log(struct btrfs_trans_handle *trans, 38int btrfs_sync_log(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root); 39 struct btrfs_root *root, struct btrfs_log_ctx *ctx);
27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 40int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
28int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 41int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
29 struct btrfs_fs_info *fs_info); 42 struct btrfs_fs_info *fs_info);
30int btrfs_recover_log_trees(struct btrfs_root *tree_root); 43int btrfs_recover_log_trees(struct btrfs_root *tree_root);
31int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 44int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
32 struct btrfs_root *root, struct dentry *dentry); 45 struct btrfs_root *root, struct dentry *dentry,
46 struct btrfs_log_ctx *ctx);
33int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 47int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
34 struct btrfs_root *root, 48 struct btrfs_root *root,
35 const char *name, int name_len, 49 const char *name, int name_len,
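
The new btrfs_log_ctx lives on the caller's stack and is threaded through both the logging and the sync step, so each waiter learns the result of the log commit it actually queued up for. A hedged sketch of a caller (in the patch the real user is the fsync path); fallback-to-full-commit and BTRFS_NO_LOG_SYNC handling are omitted:

static int log_and_sync_sketch(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct dentry *dentry)
{
	struct btrfs_log_ctx ctx;
	int ret;

	btrfs_init_log_ctx(&ctx);

	/* queues ctx on the root's log_ctxs list for this log transid */
	ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
	if (ret)
		return ret;

	/* waits for, or performs, the log commit for ctx.log_transid */
	return btrfs_sync_log(trans, root, &ctx);
}
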
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bab0b84d8f80..d241130a32fd 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -415,7 +415,8 @@ loop_lock:
415 device->running_pending = 1; 415 device->running_pending = 1;
416 416
417 spin_unlock(&device->io_lock); 417 spin_unlock(&device->io_lock);
418 btrfs_requeue_work(&device->work); 418 btrfs_queue_work(fs_info->submit_workers,
419 &device->work);
419 goto done; 420 goto done;
420 } 421 }
421 /* unplug every 64 requests just for good measure */ 422 /* unplug every 64 requests just for good measure */
@@ -5263,6 +5264,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
5263static void btrfs_end_bio(struct bio *bio, int err) 5264static void btrfs_end_bio(struct bio *bio, int err)
5264{ 5265{
5265 struct btrfs_bio *bbio = bio->bi_private; 5266 struct btrfs_bio *bbio = bio->bi_private;
5267 struct btrfs_device *dev = bbio->stripes[0].dev;
5266 int is_orig_bio = 0; 5268 int is_orig_bio = 0;
5267 5269
5268 if (err) { 5270 if (err) {
@@ -5270,7 +5272,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
5270 if (err == -EIO || err == -EREMOTEIO) { 5272 if (err == -EIO || err == -EREMOTEIO) {
5271 unsigned int stripe_index = 5273 unsigned int stripe_index =
5272 btrfs_io_bio(bio)->stripe_index; 5274 btrfs_io_bio(bio)->stripe_index;
5273 struct btrfs_device *dev;
5274 5275
5275 BUG_ON(stripe_index >= bbio->num_stripes); 5276 BUG_ON(stripe_index >= bbio->num_stripes);
5276 dev = bbio->stripes[stripe_index].dev; 5277 dev = bbio->stripes[stripe_index].dev;
@@ -5292,6 +5293,8 @@ static void btrfs_end_bio(struct bio *bio, int err)
5292 if (bio == bbio->orig_bio) 5293 if (bio == bbio->orig_bio)
5293 is_orig_bio = 1; 5294 is_orig_bio = 1;
5294 5295
5296 btrfs_bio_counter_dec(bbio->fs_info);
5297
5295 if (atomic_dec_and_test(&bbio->stripes_pending)) { 5298 if (atomic_dec_and_test(&bbio->stripes_pending)) {
5296 if (!is_orig_bio) { 5299 if (!is_orig_bio) {
5297 bio_put(bio); 5300 bio_put(bio);
@@ -5328,13 +5331,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
5328 } 5331 }
5329} 5332}
5330 5333
5331struct async_sched {
5332 struct bio *bio;
5333 int rw;
5334 struct btrfs_fs_info *info;
5335 struct btrfs_work work;
5336};
5337
5338/* 5334/*
5339 * see run_scheduled_bios for a description of why bios are collected for 5335 * see run_scheduled_bios for a description of why bios are collected for
5340 * async submit. 5336 * async submit.
@@ -5391,8 +5387,8 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
5391 spin_unlock(&device->io_lock); 5387 spin_unlock(&device->io_lock);
5392 5388
5393 if (should_queue) 5389 if (should_queue)
5394 btrfs_queue_worker(&root->fs_info->submit_workers, 5390 btrfs_queue_work(root->fs_info->submit_workers,
5395 &device->work); 5391 &device->work);
5396} 5392}
5397 5393
5398static int bio_size_ok(struct block_device *bdev, struct bio *bio, 5394static int bio_size_ok(struct block_device *bdev, struct bio *bio,
@@ -5447,6 +5443,9 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
5447 } 5443 }
5448#endif 5444#endif
5449 bio->bi_bdev = dev->bdev; 5445 bio->bi_bdev = dev->bdev;
5446
5447 btrfs_bio_counter_inc_noblocked(root->fs_info);
5448
5450 if (async) 5449 if (async)
5451 btrfs_schedule_bio(root, dev, rw, bio); 5450 btrfs_schedule_bio(root, dev, rw, bio);
5452 else 5451 else
@@ -5515,28 +5514,38 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5515 length = bio->bi_iter.bi_size; 5514 length = bio->bi_iter.bi_size;
5516 map_length = length; 5515 map_length = length;
5517 5516
5517 btrfs_bio_counter_inc_blocked(root->fs_info);
5518 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 5518 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
5519 mirror_num, &raid_map); 5519 mirror_num, &raid_map);
5520 if (ret) /* -ENOMEM */ 5520 if (ret) {
5521 btrfs_bio_counter_dec(root->fs_info);
5521 return ret; 5522 return ret;
5523 }
5522 5524
5523 total_devs = bbio->num_stripes; 5525 total_devs = bbio->num_stripes;
5524 bbio->orig_bio = first_bio; 5526 bbio->orig_bio = first_bio;
5525 bbio->private = first_bio->bi_private; 5527 bbio->private = first_bio->bi_private;
5526 bbio->end_io = first_bio->bi_end_io; 5528 bbio->end_io = first_bio->bi_end_io;
5529 bbio->fs_info = root->fs_info;
5527 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 5530 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
5528 5531
5529 if (raid_map) { 5532 if (raid_map) {
5530 /* In this case, map_length has been set to the length of 5533 /* In this case, map_length has been set to the length of
5531 a single stripe; not the whole write */ 5534 a single stripe; not the whole write */
5532 if (rw & WRITE) { 5535 if (rw & WRITE) {
5533 return raid56_parity_write(root, bio, bbio, 5536 ret = raid56_parity_write(root, bio, bbio,
5534 raid_map, map_length); 5537 raid_map, map_length);
5535 } else { 5538 } else {
5536 return raid56_parity_recover(root, bio, bbio, 5539 ret = raid56_parity_recover(root, bio, bbio,
5537 raid_map, map_length, 5540 raid_map, map_length,
5538 mirror_num); 5541 mirror_num);
5539 } 5542 }
5543 /*
5544 * FIXME, replace doesn't support raid56 yet, please fix
5545 * it in the future.
5546 */
5547 btrfs_bio_counter_dec(root->fs_info);
5548 return ret;
5540 } 5549 }
5541 5550
5542 if (map_length < length) { 5551 if (map_length < length) {
@@ -5578,6 +5587,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5578 async_submit); 5587 async_submit);
5579 dev_nr++; 5588 dev_nr++;
5580 } 5589 }
5590 btrfs_bio_counter_dec(root->fs_info);
5581 return 0; 5591 return 0;
5582} 5592}
5583 5593
@@ -5666,7 +5676,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
5666 else 5676 else
5667 generate_random_uuid(dev->uuid); 5677 generate_random_uuid(dev->uuid);
5668 5678
5669 dev->work.func = pending_bios_fn; 5679 btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL);
5670 5680
5671 return dev; 5681 return dev;
5672} 5682}
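
volumes.c now brackets every mapped bio with a counter on fs_info: btrfs_map_bio() holds a blocking reference while it maps and submits, each stripe bio takes its own non-blocking reference that btrfs_end_bio() drops, and the mapper drops its own reference before returning. Compressed into one illustrative function; only the counter helpers are from the patch, the submit path is a placeholder:

static int submit_counted_sketch(struct btrfs_root *root, int rw,
				 struct bio *bio)
{
	/* _blocked variant: may wait if new bios are temporarily held off */
	btrfs_bio_counter_inc_blocked(root->fs_info);

	/* one reference per stripe bio, dropped in btrfs_end_bio() */
	btrfs_bio_counter_inc_noblocked(root->fs_info);
	submit_bio(rw, bio);		/* placeholder for the real submit path */

	/* the mapper's own reference is dropped once everything is queued */
	btrfs_bio_counter_dec(root->fs_info);
	return 0;
}
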
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 8b3cd142b373..80754f9dd3df 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -192,6 +192,7 @@ typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
192 192
193struct btrfs_bio { 193struct btrfs_bio {
194 atomic_t stripes_pending; 194 atomic_t stripes_pending;
195 struct btrfs_fs_info *fs_info;
195 bio_end_io_t *end_io; 196 bio_end_io_t *end_io;
196 struct bio *orig_bio; 197 struct bio *orig_bio;
197 void *private; 198 void *private;
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 3176cdc32937..4ee4e30d26d9 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -21,6 +21,8 @@ struct btrfs_block_group_cache;
21struct btrfs_free_cluster; 21struct btrfs_free_cluster;
22struct map_lookup; 22struct map_lookup;
23struct extent_buffer; 23struct extent_buffer;
24struct btrfs_work;
25struct __btrfs_workqueue;
24 26
25#define show_ref_type(type) \ 27#define show_ref_type(type) \
26 __print_symbolic(type, \ 28 __print_symbolic(type, \
@@ -982,6 +984,141 @@ TRACE_EVENT(free_extent_state,
982 (void *)__entry->ip) 984 (void *)__entry->ip)
983); 985);
984 986
987DECLARE_EVENT_CLASS(btrfs__work,
988
989 TP_PROTO(struct btrfs_work *work),
990
991 TP_ARGS(work),
992
993 TP_STRUCT__entry(
994 __field( void *, work )
995 __field( void *, wq )
996 __field( void *, func )
997 __field( void *, ordered_func )
998 __field( void *, ordered_free )
999 ),
1000
1001 TP_fast_assign(
1002 __entry->work = work;
1003 __entry->wq = work->wq;
1004 __entry->func = work->func;
1005 __entry->ordered_func = work->ordered_func;
1006 __entry->ordered_free = work->ordered_free;
1007 ),
1008
1009 TP_printk("work=%p, wq=%p, func=%p, ordered_func=%p, ordered_free=%p",
1010 __entry->work, __entry->wq, __entry->func,
1011 __entry->ordered_func, __entry->ordered_free)
1012);
1013
1014/* For situations where the work is freed */
1015DECLARE_EVENT_CLASS(btrfs__work__done,
1016
1017 TP_PROTO(struct btrfs_work *work),
1018
1019 TP_ARGS(work),
1020
1021 TP_STRUCT__entry(
1022 __field( void *, work )
1023 ),
1024
1025 TP_fast_assign(
1026 __entry->work = work;
1027 ),
1028
1029 TP_printk("work->%p", __entry->work)
1030);
1031
1032DEFINE_EVENT(btrfs__work, btrfs_work_queued,
1033
1034 TP_PROTO(struct btrfs_work *work),
1035
1036 TP_ARGS(work)
1037);
1038
1039DEFINE_EVENT(btrfs__work, btrfs_work_sched,
1040
1041 TP_PROTO(struct btrfs_work *work),
1042
1043 TP_ARGS(work)
1044);
1045
1046DEFINE_EVENT(btrfs__work, btrfs_normal_work_done,
1047
1048 TP_PROTO(struct btrfs_work *work),
1049
1050 TP_ARGS(work)
1051);
1052
1053DEFINE_EVENT(btrfs__work__done, btrfs_all_work_done,
1054
1055 TP_PROTO(struct btrfs_work *work),
1056
1057 TP_ARGS(work)
1058);
1059
1060DEFINE_EVENT(btrfs__work, btrfs_ordered_sched,
1061
1062 TP_PROTO(struct btrfs_work *work),
1063
1064 TP_ARGS(work)
1065);
1066
1067DECLARE_EVENT_CLASS(btrfs__workqueue,
1068
1069 TP_PROTO(struct __btrfs_workqueue *wq, const char *name, int high),
1070
1071 TP_ARGS(wq, name, high),
1072
1073 TP_STRUCT__entry(
1074 __field( void *, wq )
1075 __string( name, name )
1076 __field( int , high )
1077 ),
1078
1079 TP_fast_assign(
1080 __entry->wq = wq;
1081 __assign_str(name, name);
1082 __entry->high = high;
1083 ),
1084
1085 TP_printk("name=%s%s, wq=%p", __get_str(name),
1086 __print_flags(__entry->high, "",
1087 {(WQ_HIGHPRI), "-high"}),
1088 __entry->wq)
1089);
1090
1091DEFINE_EVENT(btrfs__workqueue, btrfs_workqueue_alloc,
1092
1093 TP_PROTO(struct __btrfs_workqueue *wq, const char *name, int high),
1094
1095 TP_ARGS(wq, name, high)
1096);
1097
1098DECLARE_EVENT_CLASS(btrfs__workqueue_done,
1099
1100 TP_PROTO(struct __btrfs_workqueue *wq),
1101
1102 TP_ARGS(wq),
1103
1104 TP_STRUCT__entry(
1105 __field( void *, wq )
1106 ),
1107
1108 TP_fast_assign(
1109 __entry->wq = wq;
1110 ),
1111
1112 TP_printk("wq=%p", __entry->wq)
1113);
1114
1115DEFINE_EVENT(btrfs__workqueue_done, btrfs_workqueue_destroy,
1116
1117 TP_PROTO(struct __btrfs_workqueue *wq),
1118
1119 TP_ARGS(wq)
1120);
1121
985#endif /* _TRACE_BTRFS_H */ 1122#endif /* _TRACE_BTRFS_H */
986 1123
987/* This part must be outside protection */ 1124/* This part must be outside protection */
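
These events are meant to be emitted from the btrfs workqueue code; a sketch of plausible call sites follows. The trace_<event>() wrappers are generated from the definitions above; the surrounding functions, and where exactly they hook in, are assumptions:

/* illustrative hooks only */
static void queue_work_traced_sketch(struct __btrfs_workqueue *wq,
				     struct btrfs_work *work)
{
	/* ... insert the work item on wq ... */
	trace_btrfs_work_queued(work);
}

static void destroy_workqueue_traced_sketch(struct __btrfs_workqueue *wq)
{
	trace_btrfs_workqueue_destroy(wq);
	/* ... flush and free the underlying queues ... */
}
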