Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/async-thread.c    848
-rw-r--r--  fs/btrfs/async-thread.h    121
-rw-r--r--  fs/btrfs/backref.c          84
-rw-r--r--  fs/btrfs/btrfs_inode.h      14
-rw-r--r--  fs/btrfs/compression.c       2
-rw-r--r--  fs/btrfs/ctree.c            11
-rw-r--r--  fs/btrfs/ctree.h            73
-rw-r--r--  fs/btrfs/delayed-inode.c     6
-rw-r--r--  fs/btrfs/delayed-ref.c      29
-rw-r--r--  fs/btrfs/dev-replace.c      79
-rw-r--r--  fs/btrfs/disk-io.c         281
-rw-r--r--  fs/btrfs/extent-tree.c      58
-rw-r--r--  fs/btrfs/extent_io.c        15
-rw-r--r--  fs/btrfs/extent_map.c       56
-rw-r--r--  fs/btrfs/extent_map.h       10
-rw-r--r--  fs/btrfs/file.c            160
-rw-r--r--  fs/btrfs/inode.c           123
-rw-r--r--  fs/btrfs/ioctl.c           210
-rw-r--r--  fs/btrfs/ordered-data.c     68
-rw-r--r--  fs/btrfs/ordered-data.h      6
-rw-r--r--  fs/btrfs/qgroup.c           15
-rw-r--r--  fs/btrfs/raid56.c           21
-rw-r--r--  fs/btrfs/reada.c             4
-rw-r--r--  fs/btrfs/relocation.c        2
-rw-r--r--  fs/btrfs/root-tree.c         3
-rw-r--r--  fs/btrfs/scrub.c            97
-rw-r--r--  fs/btrfs/send.c            821
-rw-r--r--  fs/btrfs/super.c            38
-rw-r--r--  fs/btrfs/sysfs.c            33
-rw-r--r--  fs/btrfs/sysfs.h             5
-rw-r--r--  fs/btrfs/transaction.c      39
-rw-r--r--  fs/btrfs/tree-log.c        236
-rw-r--r--  fs/btrfs/tree-log.h         18
-rw-r--r--  fs/btrfs/volumes.c          46
-rw-r--r--  fs/btrfs/volumes.h           1
35 files changed, 2004 insertions(+), 1629 deletions(-)
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c1e0b0caf9cc..ecb5832c0967 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2007 Oracle.  All rights reserved.
+ * Copyright (C) 2014 Fujitsu.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
@@ -21,708 +22,313 @@
21#include <linux/list.h> 22#include <linux/list.h>
22#include <linux/spinlock.h> 23#include <linux/spinlock.h>
23#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/workqueue.h>
24#include "async-thread.h" 26#include "async-thread.h"
27#include "ctree.h"
28
29#define WORK_DONE_BIT 0
30#define WORK_ORDER_DONE_BIT 1
31#define WORK_HIGH_PRIO_BIT 2
32
33#define NO_THRESHOLD (-1)
34#define DFT_THRESHOLD (32)
35
36struct __btrfs_workqueue {
37 struct workqueue_struct *normal_wq;
38 /* List head pointing to ordered work list */
39 struct list_head ordered_list;
40
41 /* Spinlock for ordered_list */
42 spinlock_t list_lock;
43
44 /* Thresholding related variables */
45 atomic_t pending;
46 int max_active;
47 int current_max;
48 int thresh;
49 unsigned int count;
50 spinlock_t thres_lock;
51};
25 52
26#define WORK_QUEUED_BIT 0 53struct btrfs_workqueue {
27#define WORK_DONE_BIT 1 54 struct __btrfs_workqueue *normal;
28#define WORK_ORDER_DONE_BIT 2 55 struct __btrfs_workqueue *high;
29#define WORK_HIGH_PRIO_BIT 3 56};
30
31/*
32 * container for the kthread task pointer and the list of pending work
33 * One of these is allocated per thread.
34 */
35struct btrfs_worker_thread {
36 /* pool we belong to */
37 struct btrfs_workers *workers;
38
39 /* list of struct btrfs_work that are waiting for service */
40 struct list_head pending;
41 struct list_head prio_pending;
42
43 /* list of worker threads from struct btrfs_workers */
44 struct list_head worker_list;
45
46 /* kthread */
47 struct task_struct *task;
48 57
49 /* number of things on the pending list */ 58static inline struct __btrfs_workqueue
50 atomic_t num_pending; 59*__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
60 int thresh)
61{
62 struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
51 63
52 /* reference counter for this struct */ 64 if (unlikely(!ret))
53 atomic_t refs; 65 return NULL;
54 66
55 unsigned long sequence; 67 ret->max_active = max_active;
68 atomic_set(&ret->pending, 0);
69 if (thresh == 0)
70 thresh = DFT_THRESHOLD;
71 /* For low threshold, disabling threshold is a better choice */
72 if (thresh < DFT_THRESHOLD) {
73 ret->current_max = max_active;
74 ret->thresh = NO_THRESHOLD;
75 } else {
76 ret->current_max = 1;
77 ret->thresh = thresh;
78 }
56 79
57 /* protects the pending list. */ 80 if (flags & WQ_HIGHPRI)
58 spinlock_t lock; 81 ret->normal_wq = alloc_workqueue("%s-%s-high", flags,
82 ret->max_active,
83 "btrfs", name);
84 else
85 ret->normal_wq = alloc_workqueue("%s-%s", flags,
86 ret->max_active, "btrfs",
87 name);
88 if (unlikely(!ret->normal_wq)) {
89 kfree(ret);
90 return NULL;
91 }
59 92
60 /* set to non-zero when this thread is already awake and kicking */ 93 INIT_LIST_HEAD(&ret->ordered_list);
61 int working; 94 spin_lock_init(&ret->list_lock);
95 spin_lock_init(&ret->thres_lock);
96 trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI);
97 return ret;
98}
62 99
63 /* are we currently idle */ 100static inline void
64 int idle; 101__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
65};
66 102
67static int __btrfs_start_workers(struct btrfs_workers *workers); 103struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
104 int flags,
105 int max_active,
106 int thresh)
107{
108 struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
68 109
69/* 110 if (unlikely(!ret))
70 * btrfs_start_workers uses kthread_run, which can block waiting for memory 111 return NULL;
71 * for a very long time. It will actually throttle on page writeback,
72 * and so it may not make progress until after our btrfs worker threads
73 * process all of the pending work structs in their queue
74 *
75 * This means we can't use btrfs_start_workers from inside a btrfs worker
76 * thread that is used as part of cleaning dirty memory, which pretty much
77 * involves all of the worker threads.
78 *
79 * Instead we have a helper queue who never has more than one thread
80 * where we scheduler thread start operations. This worker_start struct
81 * is used to contain the work and hold a pointer to the queue that needs
82 * another worker.
83 */
84struct worker_start {
85 struct btrfs_work work;
86 struct btrfs_workers *queue;
87};
88 112
89static void start_new_worker_func(struct btrfs_work *work) 113 ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
90{ 114 max_active, thresh);
91 struct worker_start *start; 115 if (unlikely(!ret->normal)) {
92 start = container_of(work, struct worker_start, work); 116 kfree(ret);
93 __btrfs_start_workers(start->queue); 117 return NULL;
94 kfree(start); 118 }
95}
96 119
97/* 120 if (flags & WQ_HIGHPRI) {
98 * helper function to move a thread onto the idle list after it 121 ret->high = __btrfs_alloc_workqueue(name, flags, max_active,
99 * has finished some requests. 122 thresh);
100 */ 123 if (unlikely(!ret->high)) {
101static void check_idle_worker(struct btrfs_worker_thread *worker) 124 __btrfs_destroy_workqueue(ret->normal);
102{ 125 kfree(ret);
103 if (!worker->idle && atomic_read(&worker->num_pending) < 126 return NULL;
104 worker->workers->idle_thresh / 2) {
105 unsigned long flags;
106 spin_lock_irqsave(&worker->workers->lock, flags);
107 worker->idle = 1;
108
109 /* the list may be empty if the worker is just starting */
110 if (!list_empty(&worker->worker_list) &&
111 !worker->workers->stopping) {
112 list_move(&worker->worker_list,
113 &worker->workers->idle_list);
114 } 127 }
115 spin_unlock_irqrestore(&worker->workers->lock, flags);
116 } 128 }
129 return ret;
117} 130}
118 131
119/* 132/*
120 * helper function to move a thread off the idle list after new 133 * Hook for threshold which will be called in btrfs_queue_work.
121 * pending work is added. 134 * This hook WILL be called in IRQ handler context,
135 * so workqueue_set_max_active MUST NOT be called in this hook
122 */ 136 */
123static void check_busy_worker(struct btrfs_worker_thread *worker) 137static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
124{ 138{
125 if (worker->idle && atomic_read(&worker->num_pending) >= 139 if (wq->thresh == NO_THRESHOLD)
126 worker->workers->idle_thresh) { 140 return;
127 unsigned long flags; 141 atomic_inc(&wq->pending);
128 spin_lock_irqsave(&worker->workers->lock, flags);
129 worker->idle = 0;
130
131 if (!list_empty(&worker->worker_list) &&
132 !worker->workers->stopping) {
133 list_move_tail(&worker->worker_list,
134 &worker->workers->worker_list);
135 }
136 spin_unlock_irqrestore(&worker->workers->lock, flags);
137 }
138} 142}
139 143
140static void check_pending_worker_creates(struct btrfs_worker_thread *worker) 144/*
145 * Hook for threshold which will be called before executing the work.
146 * This hook is called in kthread context, so it is safe to
147 * call workqueue_set_max_active here.
148 */
149static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
141{ 150{
142 struct btrfs_workers *workers = worker->workers; 151 int new_max_active;
143 struct worker_start *start; 152 long pending;
144 unsigned long flags; 153 int need_change = 0;
145 154
146 rmb(); 155 if (wq->thresh == NO_THRESHOLD)
147 if (!workers->atomic_start_pending)
148 return; 156 return;
149 157
150 start = kzalloc(sizeof(*start), GFP_NOFS); 158 atomic_dec(&wq->pending);
151 if (!start) 159 spin_lock(&wq->thres_lock);
152 return; 160 /*
153 161 * Use wq->count to limit the calling frequency of
154 start->work.func = start_new_worker_func; 162 * workqueue_set_max_active.
155 start->queue = workers; 163 */
156 164 wq->count++;
157 spin_lock_irqsave(&workers->lock, flags); 165 wq->count %= (wq->thresh / 4);
158 if (!workers->atomic_start_pending) 166 if (!wq->count)
159 goto out; 167 goto out;
160 168 new_max_active = wq->current_max;
161 workers->atomic_start_pending = 0;
162 if (workers->num_workers + workers->num_workers_starting >=
163 workers->max_workers)
164 goto out;
165
166 workers->num_workers_starting += 1;
167 spin_unlock_irqrestore(&workers->lock, flags);
168 btrfs_queue_worker(workers->atomic_worker_start, &start->work);
169 return;
170 169
170 /*
171 * pending may be changed later, but it's OK since we really
172 * don't need it to be accurate to calculate new_max_active.
173 */
174 pending = atomic_read(&wq->pending);
175 if (pending > wq->thresh)
176 new_max_active++;
177 if (pending < wq->thresh / 2)
178 new_max_active--;
179 new_max_active = clamp_val(new_max_active, 1, wq->max_active);
180 if (new_max_active != wq->current_max) {
181 need_change = 1;
182 wq->current_max = new_max_active;
183 }
171out: 184out:
172 kfree(start); 185 spin_unlock(&wq->thres_lock);
173 spin_unlock_irqrestore(&workers->lock, flags); 186
187 if (need_change) {
188 workqueue_set_max_active(wq->normal_wq, wq->current_max);
189 }
174} 190}
175 191
176static noinline void run_ordered_completions(struct btrfs_workers *workers, 192static void run_ordered_work(struct __btrfs_workqueue *wq)
177 struct btrfs_work *work)
178{ 193{
179 if (!workers->ordered) 194 struct list_head *list = &wq->ordered_list;
180 return; 195 struct btrfs_work *work;
181 196 spinlock_t *lock = &wq->list_lock;
182 set_bit(WORK_DONE_BIT, &work->flags); 197 unsigned long flags;
183
184 spin_lock(&workers->order_lock);
185 198
186 while (1) { 199 while (1) {
187 if (!list_empty(&workers->prio_order_list)) { 200 spin_lock_irqsave(lock, flags);
188 work = list_entry(workers->prio_order_list.next, 201 if (list_empty(list))
189 struct btrfs_work, order_list);
190 } else if (!list_empty(&workers->order_list)) {
191 work = list_entry(workers->order_list.next,
192 struct btrfs_work, order_list);
193 } else {
194 break; 202 break;
195 } 203 work = list_entry(list->next, struct btrfs_work,
204 ordered_list);
196 if (!test_bit(WORK_DONE_BIT, &work->flags)) 205 if (!test_bit(WORK_DONE_BIT, &work->flags))
197 break; 206 break;
198 207
199 /* we are going to call the ordered done function, but 208 /*
209 * we are going to call the ordered done function, but
200 * we leave the work item on the list as a barrier so 210 * we leave the work item on the list as a barrier so
201 * that later work items that are done don't have their 211 * that later work items that are done don't have their
202 * functions called before this one returns 212 * functions called before this one returns
203 */ 213 */
204 if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) 214 if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
205 break; 215 break;
206 216 trace_btrfs_ordered_sched(work);
207 spin_unlock(&workers->order_lock); 217 spin_unlock_irqrestore(lock, flags);
208
209 work->ordered_func(work); 218 work->ordered_func(work);
210 219
211 /* now take the lock again and drop our item from the list */ 220 /* now take the lock again and drop our item from the list */
212 spin_lock(&workers->order_lock); 221 spin_lock_irqsave(lock, flags);
213 list_del(&work->order_list); 222 list_del(&work->ordered_list);
214 spin_unlock(&workers->order_lock); 223 spin_unlock_irqrestore(lock, flags);
215 224
216 /* 225 /*
217 * we don't want to call the ordered free functions 226 * we don't want to call the ordered free functions
218 * with the lock held though 227 * with the lock held though
219 */ 228 */
220 work->ordered_free(work); 229 work->ordered_free(work);
221 spin_lock(&workers->order_lock); 230 trace_btrfs_all_work_done(work);
222 }
223
224 spin_unlock(&workers->order_lock);
225}
226
227static void put_worker(struct btrfs_worker_thread *worker)
228{
229 if (atomic_dec_and_test(&worker->refs))
230 kfree(worker);
231}
232
233static int try_worker_shutdown(struct btrfs_worker_thread *worker)
234{
235 int freeit = 0;
236
237 spin_lock_irq(&worker->lock);
238 spin_lock(&worker->workers->lock);
239 if (worker->workers->num_workers > 1 &&
240 worker->idle &&
241 !worker->working &&
242 !list_empty(&worker->worker_list) &&
243 list_empty(&worker->prio_pending) &&
244 list_empty(&worker->pending) &&
245 atomic_read(&worker->num_pending) == 0) {
246 freeit = 1;
247 list_del_init(&worker->worker_list);
248 worker->workers->num_workers--;
249 } 231 }
250 spin_unlock(&worker->workers->lock); 232 spin_unlock_irqrestore(lock, flags);
251 spin_unlock_irq(&worker->lock);
252
253 if (freeit)
254 put_worker(worker);
255 return freeit;
256} 233}
257 234
258static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker, 235static void normal_work_helper(struct work_struct *arg)
259 struct list_head *prio_head,
260 struct list_head *head)
261{
262 struct btrfs_work *work = NULL;
263 struct list_head *cur = NULL;
264
265 if (!list_empty(prio_head))
266 cur = prio_head->next;
267
268 smp_mb();
269 if (!list_empty(&worker->prio_pending))
270 goto refill;
271
272 if (!list_empty(head))
273 cur = head->next;
274
275 if (cur)
276 goto out;
277
278refill:
279 spin_lock_irq(&worker->lock);
280 list_splice_tail_init(&worker->prio_pending, prio_head);
281 list_splice_tail_init(&worker->pending, head);
282
283 if (!list_empty(prio_head))
284 cur = prio_head->next;
285 else if (!list_empty(head))
286 cur = head->next;
287 spin_unlock_irq(&worker->lock);
288
289 if (!cur)
290 goto out_fail;
291
292out:
293 work = list_entry(cur, struct btrfs_work, list);
294
295out_fail:
296 return work;
297}
298
299/*
300 * main loop for servicing work items
301 */
302static int worker_loop(void *arg)
303{ 236{
304 struct btrfs_worker_thread *worker = arg;
305 struct list_head head;
306 struct list_head prio_head;
307 struct btrfs_work *work; 237 struct btrfs_work *work;
238 struct __btrfs_workqueue *wq;
239 int need_order = 0;
308 240
309 INIT_LIST_HEAD(&head); 241 work = container_of(arg, struct btrfs_work, normal_work);
310 INIT_LIST_HEAD(&prio_head); 242 /*
311 243 * We should not touch things inside work in the following cases:
312 do { 244 * 1) after work->func() if it has no ordered_free
313again: 245 * Since the struct is freed in work->func().
314 while (1) { 246 * 2) after setting WORK_DONE_BIT
315 247 * The work may be freed in other threads almost instantly.
316 248 * So we save the needed things here.
317 work = get_next_work(worker, &prio_head, &head); 249 */
318 if (!work) 250 if (work->ordered_func)
319 break; 251 need_order = 1;
320 252 wq = work->wq;
321 list_del(&work->list); 253
322 clear_bit(WORK_QUEUED_BIT, &work->flags); 254 trace_btrfs_work_sched(work);
323 255 thresh_exec_hook(wq);
324 work->worker = worker; 256 work->func(work);
325 257 if (need_order) {
326 work->func(work); 258 set_bit(WORK_DONE_BIT, &work->flags);
327 259 run_ordered_work(wq);
328 atomic_dec(&worker->num_pending);
329 /*
330 * unless this is an ordered work queue,
331 * 'work' was probably freed by func above.
332 */
333 run_ordered_completions(worker->workers, work);
334
335 check_pending_worker_creates(worker);
336 cond_resched();
337 }
338
339 spin_lock_irq(&worker->lock);
340 check_idle_worker(worker);
341
342 if (freezing(current)) {
343 worker->working = 0;
344 spin_unlock_irq(&worker->lock);
345 try_to_freeze();
346 } else {
347 spin_unlock_irq(&worker->lock);
348 if (!kthread_should_stop()) {
349 cpu_relax();
350 /*
351 * we've dropped the lock, did someone else
352 * jump_in?
353 */
354 smp_mb();
355 if (!list_empty(&worker->pending) ||
356 !list_empty(&worker->prio_pending))
357 continue;
358
359 /*
360 * this short schedule allows more work to
361 * come in without the queue functions
362 * needing to go through wake_up_process()
363 *
364 * worker->working is still 1, so nobody
365 * is going to try and wake us up
366 */
367 schedule_timeout(1);
368 smp_mb();
369 if (!list_empty(&worker->pending) ||
370 !list_empty(&worker->prio_pending))
371 continue;
372
373 if (kthread_should_stop())
374 break;
375
376 /* still no more work?, sleep for real */
377 spin_lock_irq(&worker->lock);
378 set_current_state(TASK_INTERRUPTIBLE);
379 if (!list_empty(&worker->pending) ||
380 !list_empty(&worker->prio_pending)) {
381 spin_unlock_irq(&worker->lock);
382 set_current_state(TASK_RUNNING);
383 goto again;
384 }
385
386 /*
387 * this makes sure we get a wakeup when someone
388 * adds something new to the queue
389 */
390 worker->working = 0;
391 spin_unlock_irq(&worker->lock);
392
393 if (!kthread_should_stop()) {
394 schedule_timeout(HZ * 120);
395 if (!worker->working &&
396 try_worker_shutdown(worker)) {
397 return 0;
398 }
399 }
400 }
401 __set_current_state(TASK_RUNNING);
402 }
403 } while (!kthread_should_stop());
404 return 0;
405}
406
407/*
408 * this will wait for all the worker threads to shutdown
409 */
410void btrfs_stop_workers(struct btrfs_workers *workers)
411{
412 struct list_head *cur;
413 struct btrfs_worker_thread *worker;
414 int can_stop;
415
416 spin_lock_irq(&workers->lock);
417 workers->stopping = 1;
418 list_splice_init(&workers->idle_list, &workers->worker_list);
419 while (!list_empty(&workers->worker_list)) {
420 cur = workers->worker_list.next;
421 worker = list_entry(cur, struct btrfs_worker_thread,
422 worker_list);
423
424 atomic_inc(&worker->refs);
425 workers->num_workers -= 1;
426 if (!list_empty(&worker->worker_list)) {
427 list_del_init(&worker->worker_list);
428 put_worker(worker);
429 can_stop = 1;
430 } else
431 can_stop = 0;
432 spin_unlock_irq(&workers->lock);
433 if (can_stop)
434 kthread_stop(worker->task);
435 spin_lock_irq(&workers->lock);
436 put_worker(worker);
437 } 260 }
438 spin_unlock_irq(&workers->lock); 261 if (!need_order)
262 trace_btrfs_all_work_done(work);
439} 263}
440 264
441/* 265void btrfs_init_work(struct btrfs_work *work,
442 * simple init on struct btrfs_workers 266 btrfs_func_t func,
443 */ 267 btrfs_func_t ordered_func,
444void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, 268 btrfs_func_t ordered_free)
445 struct btrfs_workers *async_helper)
446{ 269{
447 workers->num_workers = 0; 270 work->func = func;
448 workers->num_workers_starting = 0; 271 work->ordered_func = ordered_func;
449 INIT_LIST_HEAD(&workers->worker_list); 272 work->ordered_free = ordered_free;
450 INIT_LIST_HEAD(&workers->idle_list); 273 INIT_WORK(&work->normal_work, normal_work_helper);
451 INIT_LIST_HEAD(&workers->order_list); 274 INIT_LIST_HEAD(&work->ordered_list);
452 INIT_LIST_HEAD(&workers->prio_order_list); 275 work->flags = 0;
453 spin_lock_init(&workers->lock);
454 spin_lock_init(&workers->order_lock);
455 workers->max_workers = max;
456 workers->idle_thresh = 32;
457 workers->name = name;
458 workers->ordered = 0;
459 workers->atomic_start_pending = 0;
460 workers->atomic_worker_start = async_helper;
461 workers->stopping = 0;
462} 276}
463 277
464/* 278static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
465 * starts new worker threads. This does not enforce the max worker 279 struct btrfs_work *work)
466 * count in case you need to temporarily go past it.
467 */
468static int __btrfs_start_workers(struct btrfs_workers *workers)
469{ 280{
470 struct btrfs_worker_thread *worker; 281 unsigned long flags;
471 int ret = 0;
472
473 worker = kzalloc(sizeof(*worker), GFP_NOFS);
474 if (!worker) {
475 ret = -ENOMEM;
476 goto fail;
477 }
478
479 INIT_LIST_HEAD(&worker->pending);
480 INIT_LIST_HEAD(&worker->prio_pending);
481 INIT_LIST_HEAD(&worker->worker_list);
482 spin_lock_init(&worker->lock);
483
484 atomic_set(&worker->num_pending, 0);
485 atomic_set(&worker->refs, 1);
486 worker->workers = workers;
487 worker->task = kthread_create(worker_loop, worker,
488 "btrfs-%s-%d", workers->name,
489 workers->num_workers + 1);
490 if (IS_ERR(worker->task)) {
491 ret = PTR_ERR(worker->task);
492 goto fail;
493 }
494 282
495 spin_lock_irq(&workers->lock); 283 work->wq = wq;
496 if (workers->stopping) { 284 thresh_queue_hook(wq);
497 spin_unlock_irq(&workers->lock); 285 if (work->ordered_func) {
498 ret = -EINVAL; 286 spin_lock_irqsave(&wq->list_lock, flags);
499 goto fail_kthread; 287 list_add_tail(&work->ordered_list, &wq->ordered_list);
288 spin_unlock_irqrestore(&wq->list_lock, flags);
500 } 289 }
501 list_add_tail(&worker->worker_list, &workers->idle_list); 290 queue_work(wq->normal_wq, &work->normal_work);
502 worker->idle = 1; 291 trace_btrfs_work_queued(work);
503 workers->num_workers++;
504 workers->num_workers_starting--;
505 WARN_ON(workers->num_workers_starting < 0);
506 spin_unlock_irq(&workers->lock);
507
508 wake_up_process(worker->task);
509 return 0;
510
511fail_kthread:
512 kthread_stop(worker->task);
513fail:
514 kfree(worker);
515 spin_lock_irq(&workers->lock);
516 workers->num_workers_starting--;
517 spin_unlock_irq(&workers->lock);
518 return ret;
519} 292}
520 293
521int btrfs_start_workers(struct btrfs_workers *workers) 294void btrfs_queue_work(struct btrfs_workqueue *wq,
295 struct btrfs_work *work)
522{ 296{
523 spin_lock_irq(&workers->lock); 297 struct __btrfs_workqueue *dest_wq;
524 workers->num_workers_starting++;
525 spin_unlock_irq(&workers->lock);
526 return __btrfs_start_workers(workers);
527}
528
529/*
530 * run through the list and find a worker thread that doesn't have a lot
531 * to do right now. This can return null if we aren't yet at the thread
532 * count limit and all of the threads are busy.
533 */
534static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
535{
536 struct btrfs_worker_thread *worker;
537 struct list_head *next;
538 int enforce_min;
539
540 enforce_min = (workers->num_workers + workers->num_workers_starting) <
541 workers->max_workers;
542
543 /*
544 * if we find an idle thread, don't move it to the end of the
545 * idle list. This improves the chance that the next submission
546 * will reuse the same thread, and maybe catch it while it is still
547 * working
548 */
549 if (!list_empty(&workers->idle_list)) {
550 next = workers->idle_list.next;
551 worker = list_entry(next, struct btrfs_worker_thread,
552 worker_list);
553 return worker;
554 }
555 if (enforce_min || list_empty(&workers->worker_list))
556 return NULL;
557
558 /*
559 * if we pick a busy task, move the task to the end of the list.
560 * hopefully this will keep things somewhat evenly balanced.
561 * Do the move in batches based on the sequence number. This groups
562 * requests submitted at roughly the same time onto the same worker.
563 */
564 next = workers->worker_list.next;
565 worker = list_entry(next, struct btrfs_worker_thread, worker_list);
566 worker->sequence++;
567 298
568 if (worker->sequence % workers->idle_thresh == 0) 299 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high)
569 list_move_tail(next, &workers->worker_list); 300 dest_wq = wq->high;
570 return worker; 301 else
302 dest_wq = wq->normal;
303 __btrfs_queue_work(dest_wq, work);
571} 304}
572 305
573/* 306static inline void
574 * selects a worker thread to take the next job. This will either find 307__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq)
575 * an idle worker, start a new worker up to the max count, or just return
576 * one of the existing busy workers.
577 */
578static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
579{ 308{
580 struct btrfs_worker_thread *worker; 309 destroy_workqueue(wq->normal_wq);
581 unsigned long flags; 310 trace_btrfs_workqueue_destroy(wq);
582 struct list_head *fallback; 311 kfree(wq);
583 int ret;
584
585 spin_lock_irqsave(&workers->lock, flags);
586again:
587 worker = next_worker(workers);
588
589 if (!worker) {
590 if (workers->num_workers + workers->num_workers_starting >=
591 workers->max_workers) {
592 goto fallback;
593 } else if (workers->atomic_worker_start) {
594 workers->atomic_start_pending = 1;
595 goto fallback;
596 } else {
597 workers->num_workers_starting++;
598 spin_unlock_irqrestore(&workers->lock, flags);
599 /* we're below the limit, start another worker */
600 ret = __btrfs_start_workers(workers);
601 spin_lock_irqsave(&workers->lock, flags);
602 if (ret)
603 goto fallback;
604 goto again;
605 }
606 }
607 goto found;
608
609fallback:
610 fallback = NULL;
611 /*
612 * we have failed to find any workers, just
613 * return the first one we can find.
614 */
615 if (!list_empty(&workers->worker_list))
616 fallback = workers->worker_list.next;
617 if (!list_empty(&workers->idle_list))
618 fallback = workers->idle_list.next;
619 BUG_ON(!fallback);
620 worker = list_entry(fallback,
621 struct btrfs_worker_thread, worker_list);
622found:
623 /*
624 * this makes sure the worker doesn't exit before it is placed
625 * onto a busy/idle list
626 */
627 atomic_inc(&worker->num_pending);
628 spin_unlock_irqrestore(&workers->lock, flags);
629 return worker;
630} 312}
631 313
632/* 314void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
633 * btrfs_requeue_work just puts the work item back on the tail of the list
634 * it was taken from. It is intended for use with long running work functions
635 * that make some progress and want to give the cpu up for others.
636 */
637void btrfs_requeue_work(struct btrfs_work *work)
638{ 315{
639 struct btrfs_worker_thread *worker = work->worker; 316 if (!wq)
640 unsigned long flags;
641 int wake = 0;
642
643 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
644 return; 317 return;
645 318 if (wq->high)
646 spin_lock_irqsave(&worker->lock, flags); 319 __btrfs_destroy_workqueue(wq->high);
647 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) 320 __btrfs_destroy_workqueue(wq->normal);
648 list_add_tail(&work->list, &worker->prio_pending); 321 kfree(wq);
649 else
650 list_add_tail(&work->list, &worker->pending);
651 atomic_inc(&worker->num_pending);
652
653 /* by definition we're busy, take ourselves off the idle
654 * list
655 */
656 if (worker->idle) {
657 spin_lock(&worker->workers->lock);
658 worker->idle = 0;
659 list_move_tail(&worker->worker_list,
660 &worker->workers->worker_list);
661 spin_unlock(&worker->workers->lock);
662 }
663 if (!worker->working) {
664 wake = 1;
665 worker->working = 1;
666 }
667
668 if (wake)
669 wake_up_process(worker->task);
670 spin_unlock_irqrestore(&worker->lock, flags);
671} 322}
672 323
673void btrfs_set_work_high_prio(struct btrfs_work *work) 324void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max)
674{ 325{
675 set_bit(WORK_HIGH_PRIO_BIT, &work->flags); 326 wq->normal->max_active = max;
327 if (wq->high)
328 wq->high->max_active = max;
676} 329}
677 330
678/* 331void btrfs_set_work_high_priority(struct btrfs_work *work)
679 * places a struct btrfs_work into the pending queue of one of the kthreads
680 */
681void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
682{ 332{
683 struct btrfs_worker_thread *worker; 333 set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
684 unsigned long flags;
685 int wake = 0;
686
687 /* don't requeue something already on a list */
688 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
689 return;
690
691 worker = find_worker(workers);
692 if (workers->ordered) {
693 /*
694 * you're not allowed to do ordered queues from an
695 * interrupt handler
696 */
697 spin_lock(&workers->order_lock);
698 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
699 list_add_tail(&work->order_list,
700 &workers->prio_order_list);
701 } else {
702 list_add_tail(&work->order_list, &workers->order_list);
703 }
704 spin_unlock(&workers->order_lock);
705 } else {
706 INIT_LIST_HEAD(&work->order_list);
707 }
708
709 spin_lock_irqsave(&worker->lock, flags);
710
711 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
712 list_add_tail(&work->list, &worker->prio_pending);
713 else
714 list_add_tail(&work->list, &worker->pending);
715 check_busy_worker(worker);
716
717 /*
718 * avoid calling into wake_up_process if this thread has already
719 * been kicked
720 */
721 if (!worker->working)
722 wake = 1;
723 worker->working = 1;
724
725 if (wake)
726 wake_up_process(worker->task);
727 spin_unlock_irqrestore(&worker->lock, flags);
728} 334}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 1f26792683ed..9c6b66d15fb0 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2007 Oracle.  All rights reserved.
+ * Copyright (C) 2014 Fujitsu.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
@@ -19,103 +20,35 @@
19#ifndef __BTRFS_ASYNC_THREAD_ 20#ifndef __BTRFS_ASYNC_THREAD_
20#define __BTRFS_ASYNC_THREAD_ 21#define __BTRFS_ASYNC_THREAD_
21 22
22struct btrfs_worker_thread; 23struct btrfs_workqueue;
24/* Internal use only */
25struct __btrfs_workqueue;
26struct btrfs_work;
27typedef void (*btrfs_func_t)(struct btrfs_work *arg);
23 28
24/*
25 * This is similar to a workqueue, but it is meant to spread the operations
26 * across all available cpus instead of just the CPU that was used to
27 * queue the work. There is also some batching introduced to try and
28 * cut down on context switches.
29 *
30 * By default threads are added on demand up to 2 * the number of cpus.
31 * Changing struct btrfs_workers->max_workers is one way to prevent
32 * demand creation of kthreads.
33 *
34 * the basic model of these worker threads is to embed a btrfs_work
35 * structure in your own data struct, and use container_of in a
36 * work function to get back to your data struct.
37 */
38struct btrfs_work { 29struct btrfs_work {
39 /* 30 btrfs_func_t func;
40 * func should be set to the function you want called 31 btrfs_func_t ordered_func;
41 * your work struct is passed as the only arg 32 btrfs_func_t ordered_free;
42 * 33
43 * ordered_func must be set for work sent to an ordered work queue, 34 /* Don't touch things below */
44 * and it is called to complete a given work item in the same 35 struct work_struct normal_work;
45 * order they were sent to the queue. 36 struct list_head ordered_list;
46 */ 37 struct __btrfs_workqueue *wq;
47 void (*func)(struct btrfs_work *work);
48 void (*ordered_func)(struct btrfs_work *work);
49 void (*ordered_free)(struct btrfs_work *work);
50
51 /*
52 * flags should be set to zero. It is used to make sure the
53 * struct is only inserted once into the list.
54 */
55 unsigned long flags; 38 unsigned long flags;
56
57 /* don't touch these */
58 struct btrfs_worker_thread *worker;
59 struct list_head list;
60 struct list_head order_list;
61};
62
63struct btrfs_workers {
64 /* current number of running workers */
65 int num_workers;
66
67 int num_workers_starting;
68
69 /* max number of workers allowed. changed by btrfs_start_workers */
70 int max_workers;
71
72 /* once a worker has this many requests or fewer, it is idle */
73 int idle_thresh;
74
75 /* force completions in the order they were queued */
76 int ordered;
77
78 /* more workers required, but in an interrupt handler */
79 int atomic_start_pending;
80
81 /*
82 * are we allowed to sleep while starting workers or are we required
83 * to start them at a later time? If we can't sleep, this indicates
84 * which queue we need to use to schedule thread creation.
85 */
86 struct btrfs_workers *atomic_worker_start;
87
88 /* list with all the work threads. The workers on the idle thread
89 * may be actively servicing jobs, but they haven't yet hit the
90 * idle thresh limit above.
91 */
92 struct list_head worker_list;
93 struct list_head idle_list;
94
95 /*
96 * when operating in ordered mode, this maintains the list
97 * of work items waiting for completion
98 */
99 struct list_head order_list;
100 struct list_head prio_order_list;
101
102 /* lock for finding the next worker thread to queue on */
103 spinlock_t lock;
104
105 /* lock for the ordered lists */
106 spinlock_t order_lock;
107
108 /* extra name for this worker, used for current->name */
109 char *name;
110
111 int stopping;
112}; 39};
113 40
114void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); 41struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
115int btrfs_start_workers(struct btrfs_workers *workers); 42 int flags,
116void btrfs_stop_workers(struct btrfs_workers *workers); 43 int max_active,
117void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, 44 int thresh);
118 struct btrfs_workers *async_starter); 45void btrfs_init_work(struct btrfs_work *work,
119void btrfs_requeue_work(struct btrfs_work *work); 46 btrfs_func_t func,
120void btrfs_set_work_high_prio(struct btrfs_work *work); 47 btrfs_func_t ordered_func,
48 btrfs_func_t ordered_free);
49void btrfs_queue_work(struct btrfs_workqueue *wq,
50 struct btrfs_work *work);
51void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
52void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
53void btrfs_set_work_high_priority(struct btrfs_work *work);
121#endif 54#endif
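
The ordered_func/ordered_free pair declared above is what preserves completion ordering: func may run on any worker in parallel, while run_ordered_work() in async-thread.c calls ordered_func strictly in queueing order and ordered_free once the item leaves the ordered list. A hedged sketch, with hypothetical names:

struct my_ordered_job {
	struct btrfs_work work;
	/* ... per-item payload ... */
};

static void checksum_one_item(struct btrfs_work *work)
{
	/* CPU-heavy part; items run concurrently across workers */
}

static void submit_in_order(struct btrfs_work *work)
{
	/* runs only after every earlier-queued item has finished func */
}

static void free_ordered_job(struct btrfs_work *work)
{
	kfree(container_of(work, struct my_ordered_job, work));
}

static void queue_ordered_job(struct btrfs_fs_info *fs_info,
			      struct my_ordered_job *job)
{
	btrfs_init_work(&job->work, checksum_one_item,
			submit_in_order, free_ordered_job);
	btrfs_queue_work(fs_info->workers, &job->work);
}
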
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index aded3ef3d3d4..aad7201ad11b 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -220,7 +220,8 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
 
 static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 			   struct ulist *parents, struct __prelim_ref *ref,
-			   int level, u64 time_seq, const u64 *extent_item_pos)
+			   int level, u64 time_seq, const u64 *extent_item_pos,
+			   u64 total_refs)
 {
 	int ret = 0;
 	int slot;
@@ -249,7 +250,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
 		ret = btrfs_next_old_leaf(root, path, time_seq);
 
-	while (!ret && count < ref->count) {
+	while (!ret && count < total_refs) {
 		eb = path->nodes[0];
 		slot = path->slots[0];
 
@@ -306,7 +307,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 				  struct btrfs_path *path, u64 time_seq,
 				  struct __prelim_ref *ref,
 				  struct ulist *parents,
-				  const u64 *extent_item_pos)
+				  const u64 *extent_item_pos, u64 total_refs)
 {
 	struct btrfs_root *root;
 	struct btrfs_key root_key;
@@ -361,7 +362,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 	}
 
 	ret = add_all_parents(root, path, parents, ref, level, time_seq,
-			      extent_item_pos);
+			      extent_item_pos, total_refs);
 out:
 	path->lowest_level = 0;
 	btrfs_release_path(path);
@@ -374,7 +375,7 @@ out:
 static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 				   struct btrfs_path *path, u64 time_seq,
 				   struct list_head *head,
-				   const u64 *extent_item_pos)
+				   const u64 *extent_item_pos, u64 total_refs)
 {
 	int err;
 	int ret = 0;
@@ -400,7 +401,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		if (ref->count == 0)
 			continue;
 		err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
-					     parents, extent_item_pos);
+					     parents, extent_item_pos,
+					     total_refs);
 		/*
 		 * we can only tolerate ENOENT,otherwise,we should catch error
 		 * and return directly.
@@ -557,7 +559,7 @@ static void __merge_refs(struct list_head *head, int mode)
  * smaller or equal that seq to the list
  */
 static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
-			      struct list_head *prefs)
+			      struct list_head *prefs, u64 *total_refs)
 {
 	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
 	struct rb_node *n = &head->node.rb_node;
@@ -593,6 +595,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 		default:
 			BUG_ON(1);
 		}
+		*total_refs += (node->ref_mod * sgn);
 		switch (node->type) {
 		case BTRFS_TREE_BLOCK_REF_KEY: {
 			struct btrfs_delayed_tree_ref *ref;
@@ -653,7 +656,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
  */
 static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 			     struct btrfs_path *path, u64 bytenr,
-			     int *info_level, struct list_head *prefs)
+			     int *info_level, struct list_head *prefs,
+			     u64 *total_refs)
 {
 	int ret = 0;
 	int slot;
@@ -677,6 +681,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 
 	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
 	flags = btrfs_extent_flags(leaf, ei);
+	*total_refs += btrfs_extent_refs(leaf, ei);
 	btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
 	ptr = (unsigned long)(ei + 1);
@@ -859,6 +864,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 	struct list_head prefs;
 	struct __prelim_ref *ref;
 	struct extent_inode_elem *eie = NULL;
+	u64 total_refs = 0;
 
 	INIT_LIST_HEAD(&prefs);
 	INIT_LIST_HEAD(&prefs_delayed);
@@ -873,8 +879,10 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-	if (!trans)
+	if (!trans) {
 		path->search_commit_root = 1;
+		path->skip_locking = 1;
+	}
 
 	/*
 	 * grab both a lock on the path and a lock on the delayed ref head.
@@ -915,7 +923,7 @@ again:
 		}
 		spin_unlock(&delayed_refs->lock);
 		ret = __add_delayed_refs(head, time_seq,
-					 &prefs_delayed);
+					 &prefs_delayed, &total_refs);
 		mutex_unlock(&head->mutex);
 		if (ret)
 			goto out;
@@ -936,7 +944,8 @@ again:
 		    (key.type == BTRFS_EXTENT_ITEM_KEY ||
 		     key.type == BTRFS_METADATA_ITEM_KEY)) {
 			ret = __add_inline_refs(fs_info, path, bytenr,
-						&info_level, &prefs);
+						&info_level, &prefs,
+						&total_refs);
 			if (ret)
 				goto out;
 			ret = __add_keyed_refs(fs_info, path, bytenr,
@@ -956,7 +965,7 @@ again:
 	__merge_refs(&prefs, 1);
 
 	ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
-				      extent_item_pos);
+				      extent_item_pos, total_refs);
 	if (ret)
 		goto out;
 
@@ -965,7 +974,7 @@ again:
 	while (!list_empty(&prefs)) {
 		ref = list_first_entry(&prefs, struct __prelim_ref, list);
 		WARN_ON(ref->count < 0);
-		if (ref->count && ref->root_id && ref->parent == 0) {
+		if (roots && ref->count && ref->root_id && ref->parent == 0) {
 			/* no parent == root of tree */
 			ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
 			if (ret < 0)
@@ -1061,22 +1070,14 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
 			 u64 time_seq, struct ulist **leafs,
 			 const u64 *extent_item_pos)
 {
-	struct ulist *tmp;
 	int ret;
 
-	tmp = ulist_alloc(GFP_NOFS);
-	if (!tmp)
-		return -ENOMEM;
 	*leafs = ulist_alloc(GFP_NOFS);
-	if (!*leafs) {
-		ulist_free(tmp);
+	if (!*leafs)
 		return -ENOMEM;
-	}
 
 	ret = find_parent_nodes(trans, fs_info, bytenr,
-				time_seq, *leafs, tmp, extent_item_pos);
-	ulist_free(tmp);
-
+				time_seq, *leafs, NULL, extent_item_pos);
 	if (ret < 0 && ret != -ENOENT) {
 		free_leaf_list(*leafs);
 		return ret;
@@ -1333,38 +1334,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 	if (ret < 0)
 		return ret;
 
-	while (1) {
-		u32 nritems;
-		if (path->slots[0] == 0) {
-			btrfs_set_path_blocking(path);
-			ret = btrfs_prev_leaf(fs_info->extent_root, path);
-			if (ret != 0) {
-				if (ret > 0) {
-					pr_debug("logical %llu is not within "
-						 "any extent\n", logical);
-					ret = -ENOENT;
-				}
-				return ret;
-			}
-		} else {
-			path->slots[0]--;
-		}
-		nritems = btrfs_header_nritems(path->nodes[0]);
-		if (nritems == 0) {
-			pr_debug("logical %llu is not within any extent\n",
-				 logical);
-			return -ENOENT;
-		}
-		if (path->slots[0] == nritems)
-			path->slots[0]--;
-
-		btrfs_item_key_to_cpu(path->nodes[0], found_key,
-				      path->slots[0]);
-		if (found_key->type == BTRFS_EXTENT_ITEM_KEY ||
-		    found_key->type == BTRFS_METADATA_ITEM_KEY)
-			break;
+	ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0);
+	if (ret) {
+		if (ret > 0)
+			ret = -ENOENT;
+		return ret;
 	}
-
+	btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
 	if (found_key->type == BTRFS_METADATA_ITEM_KEY)
 		size = fs_info->extent_root->leafsize;
 	else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
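
The total_refs plumbing above exists so the parent walk can stop early: total_refs accumulates the extent item's on-disk reference count plus any pending delayed-ref modifications, and add_all_parents() gives up scanning file-extent items once that many references have been seen. A simplified sketch of the loop shape (not the real code):

	u64 count = 0;		/* references matched so far */

	while (!ret && count < total_refs) {
		/* examine the next file-extent item; if it points at our
		 * extent, record its parent and increment count */
		ret = btrfs_next_old_item(root, path, time_seq);
	}
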
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 8fed2125689e..c9a24444ec9a 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -109,14 +109,17 @@ struct btrfs_inode {
 	u64 last_trans;
 
 	/*
-	 * log transid when this inode was last modified
+	 * transid that last logged this inode
 	 */
-	u64 last_sub_trans;
+	u64 logged_trans;
 
 	/*
-	 * transid that last logged this inode
+	 * log transid when this inode was last modified
 	 */
-	u64 logged_trans;
+	int last_sub_trans;
+
+	/* a local copy of root's last_log_commit */
+	int last_log_commit;
 
 	/* total number of bytes pending delalloc, used by stat to calc the
 	 * real block usage of the file
@@ -155,9 +158,6 @@ struct btrfs_inode {
 	/* flags field from the on disk inode */
 	u32 flags;
 
-	/* a local copy of root's last_log_commit */
-	unsigned long last_log_commit;
-
 	/*
 	 * Counters to keep track of the number of extent item's we may use due
 	 * to delalloc and such.  outstanding_extents is the number of extent
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b01fb6c527e3..d43c544d3b68 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -472,7 +472,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 		rcu_read_lock();
 		page = radix_tree_lookup(&mapping->page_tree, pg_index);
 		rcu_read_unlock();
-		if (page) {
+		if (page && !radix_tree_exceptional_entry(page)) {
 			misses++;
 			if (misses > 4)
 				break;
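
The one-line compression.c fix above guards readahead expansion against shadow entries in the page-cache radix tree: a bare lookup can return an exceptional entry rather than a page pointer, so it must be filtered before being treated as a struct page. The pattern in isolation:

	rcu_read_lock();
	page = radix_tree_lookup(&mapping->page_tree, pg_index);
	rcu_read_unlock();
	if (page && !radix_tree_exceptional_entry(page)) {
		/* a real, previously cached page: stop expanding readahead */
	}
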
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cbd3a7d6fa68..88d1b1eedc9c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -5376,6 +5376,8 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 	int advance_right;
 	u64 left_blockptr;
 	u64 right_blockptr;
+	u64 left_gen;
+	u64 right_gen;
 	u64 left_start_ctransid;
 	u64 right_start_ctransid;
 	u64 ctransid;
@@ -5640,7 +5642,14 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 			right_blockptr = btrfs_node_blockptr(
 					right_path->nodes[right_level],
 					right_path->slots[right_level]);
-			if (left_blockptr == right_blockptr) {
+			left_gen = btrfs_node_ptr_generation(
+					left_path->nodes[left_level],
+					left_path->slots[left_level]);
+			right_gen = btrfs_node_ptr_generation(
+					right_path->nodes[right_level],
+					right_path->slots[right_level]);
+			if (left_blockptr == right_blockptr &&
+			    left_gen == right_gen) {
 				/*
 				 * As we're on a shared block, don't
 				 * allow to go deeper.
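
The ctree.c change strengthens the sharing test in btrfs_compare_trees(): a matching block pointer alone does not prove both trees share a block, because a freed block can be reallocated at the same bytenr in a later transaction. Requiring the pointer generations to match as well closes that gap:

	if (left_blockptr == right_blockptr &&
	    left_gen == right_gen) {
		/* genuinely the same shared block: no need to descend */
	}
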
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2c1a42ca519f..bc96c03dd259 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -351,6 +351,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 #define BTRFS_FS_STATE_ERROR		0
 #define BTRFS_FS_STATE_REMOUNTING	1
 #define BTRFS_FS_STATE_TRANS_ABORTED	2
+#define BTRFS_FS_STATE_DEV_REPLACING	3
 
 /* Super block flags */
 /* Errors detected */
@@ -1489,6 +1490,7 @@ struct btrfs_fs_info {
 	 */
 	struct list_head ordered_roots;
 
+	struct mutex delalloc_root_mutex;
 	spinlock_t delalloc_root_lock;
 	/* all fs/file tree roots that have delalloc inodes. */
 	struct list_head delalloc_roots;
@@ -1503,28 +1505,27 @@ struct btrfs_fs_info {
 	 * A third pool does submit_bio to avoid deadlocking with the other
 	 * two
 	 */
-	struct btrfs_workers generic_worker;
-	struct btrfs_workers workers;
-	struct btrfs_workers delalloc_workers;
-	struct btrfs_workers flush_workers;
-	struct btrfs_workers endio_workers;
-	struct btrfs_workers endio_meta_workers;
-	struct btrfs_workers endio_raid56_workers;
-	struct btrfs_workers rmw_workers;
-	struct btrfs_workers endio_meta_write_workers;
-	struct btrfs_workers endio_write_workers;
-	struct btrfs_workers endio_freespace_worker;
-	struct btrfs_workers submit_workers;
-	struct btrfs_workers caching_workers;
-	struct btrfs_workers readahead_workers;
+	struct btrfs_workqueue *workers;
+	struct btrfs_workqueue *delalloc_workers;
+	struct btrfs_workqueue *flush_workers;
+	struct btrfs_workqueue *endio_workers;
+	struct btrfs_workqueue *endio_meta_workers;
+	struct btrfs_workqueue *endio_raid56_workers;
+	struct btrfs_workqueue *rmw_workers;
+	struct btrfs_workqueue *endio_meta_write_workers;
+	struct btrfs_workqueue *endio_write_workers;
+	struct btrfs_workqueue *endio_freespace_worker;
+	struct btrfs_workqueue *submit_workers;
+	struct btrfs_workqueue *caching_workers;
+	struct btrfs_workqueue *readahead_workers;
 
 	/*
 	 * fixup workers take dirty pages that didn't properly go through
 	 * the cow mechanism and make them safe to write. It happens
 	 * for the sys_munmap function call path
 	 */
-	struct btrfs_workers fixup_workers;
-	struct btrfs_workers delayed_workers;
+	struct btrfs_workqueue *fixup_workers;
+	struct btrfs_workqueue *delayed_workers;
 	struct task_struct *transaction_kthread;
 	struct task_struct *cleaner_kthread;
 	int thread_pool_size;
@@ -1604,9 +1605,9 @@ struct btrfs_fs_info {
 	atomic_t scrub_cancel_req;
 	wait_queue_head_t scrub_pause_wait;
 	int scrub_workers_refcnt;
-	struct btrfs_workers scrub_workers;
-	struct btrfs_workers scrub_wr_completion_workers;
-	struct btrfs_workers scrub_nocow_workers;
+	struct btrfs_workqueue *scrub_workers;
+	struct btrfs_workqueue *scrub_wr_completion_workers;
+	struct btrfs_workqueue *scrub_nocow_workers;
 
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	u32 check_integrity_print_mask;
@@ -1647,7 +1648,7 @@ struct btrfs_fs_info {
 	/* qgroup rescan items */
 	struct mutex qgroup_rescan_lock; /* protects the progress item */
 	struct btrfs_key qgroup_rescan_progress;
-	struct btrfs_workers qgroup_rescan_workers;
+	struct btrfs_workqueue *qgroup_rescan_workers;
 	struct completion qgroup_rescan_completion;
 	struct btrfs_work qgroup_rescan_work;
 
@@ -1674,10 +1675,18 @@ struct btrfs_fs_info {
 
 	atomic_t mutually_exclusive_operation_running;
 
+	struct percpu_counter bio_counter;
+	wait_queue_head_t replace_wait;
+
 	struct semaphore uuid_tree_rescan_sem;
 	unsigned int update_uuid_tree_gen:1;
 };
 
+struct btrfs_subvolume_writers {
+	struct percpu_counter counter;
+	wait_queue_head_t wait;
+};
+
 /*
  * in ram representation of the tree. extent_root is used for all allocations
  * and for the extent tree extent_root root.
@@ -1714,11 +1723,15 @@ struct btrfs_root {
 	struct mutex log_mutex;
 	wait_queue_head_t log_writer_wait;
 	wait_queue_head_t log_commit_wait[2];
+	struct list_head log_ctxs[2];
 	atomic_t log_writers;
 	atomic_t log_commit[2];
 	atomic_t log_batch;
-	unsigned long log_transid;
-	unsigned long last_log_commit;
+	int log_transid;
+	/* No matter whether the commit succeeds or not */
+	int log_transid_committed;
+	/* Only updated when the commit succeeds */
+	int last_log_commit;
 	pid_t log_start_pid;
 	bool log_multiple_pids;
 
@@ -1793,6 +1806,7 @@ struct btrfs_root {
 	spinlock_t root_item_lock;
 	atomic_t refs;
 
+	struct mutex delalloc_mutex;
 	spinlock_t delalloc_lock;
 	/*
 	 * all of the inodes that have delalloc bytes. It is possible for
@@ -1802,6 +1816,8 @@ struct btrfs_root {
 	struct list_head delalloc_inodes;
 	struct list_head delalloc_root;
 	u64 nr_delalloc_inodes;
+
+	struct mutex ordered_extent_mutex;
 	/*
 	 * this is used by the balancing code to wait for all the pending
 	 * ordered extents
@@ -1822,6 +1838,8 @@ struct btrfs_root {
 	 * manipulation with the read-only status via SUBVOL_SETFLAGS
 	 */
 	int send_in_progress;
+	struct btrfs_subvolume_writers *subv_writers;
+	atomic_t will_be_snapshoted;
 };
 
 struct btrfs_ioctl_defrag_range_args {
@@ -3346,6 +3364,9 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
 					 struct btrfs_fs_info *fs_info);
 int __get_raid_index(u64 flags);
+
+int btrfs_start_nocow_write(struct btrfs_root *root);
+void btrfs_end_nocow_write(struct btrfs_root *root);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
@@ -3723,7 +3744,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3723 u32 min_type); 3744 u32 min_type);
3724 3745
3725int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); 3746int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
3726int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput); 3747int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
3748 int nr);
3727int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 3749int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
3728 struct extent_state **cached_state); 3750 struct extent_state **cached_state);
3729int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 3751int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -4005,6 +4027,11 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
4005int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, 4027int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
4006 struct btrfs_scrub_progress *progress); 4028 struct btrfs_scrub_progress *progress);
4007 4029
4030/* dev-replace.c */
4031void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
4032void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
4033void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info);
4034
4008/* reada.c */ 4035/* reada.c */
4009struct reada_control { 4036struct reada_control {
4010 struct btrfs_root *root; /* tree to prefetch */ 4037 struct btrfs_root *root; /* tree to prefetch */
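
A minimal usage sketch for the bio counter API declared above. The call site below is hypothetical (do_submit_bio is an assumed helper, not a real symbol); the intent is that every bio submission takes a counter reference, so a device replace can drain all in-flight I/O before dropping the source device.

    /* Hypothetical submission path; illustrative only. */
    btrfs_bio_counter_inc_blocked(fs_info);  /* may sleep while a replace commits */
    ret = do_submit_bio(fs_info, bio);       /* assumed helper */
    if (ret)
            btrfs_bio_counter_dec(fs_info);  /* drop the reference on error */
    /* otherwise the completion path calls btrfs_bio_counter_dec() */
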
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 451b00c86f6c..33e561a84013 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1392,11 +1392,11 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
1392 return -ENOMEM; 1392 return -ENOMEM;
1393 1393
1394 async_work->delayed_root = delayed_root; 1394 async_work->delayed_root = delayed_root;
1395 async_work->work.func = btrfs_async_run_delayed_root; 1395 btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root,
1396 async_work->work.flags = 0; 1396 NULL, NULL);
1397 async_work->nr = nr; 1397 async_work->nr = nr;
1398 1398
1399 btrfs_queue_worker(&root->fs_info->delayed_workers, &async_work->work); 1399 btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work);
1400 return 0; 1400 return 0;
1401} 1401}
1402 1402
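
The delayed-inode hunk above shows the mechanical shape of the whole conversion. A side-by-side sketch (some_fn is a placeholder work function, not a real symbol):

    /* Old thread-pool API (removed by this series): */
    work->func = some_fn;
    work->flags = 0;
    btrfs_queue_worker(&fs_info->delayed_workers, work);

    /* New API, backed by kernel workqueues; the fs_info member is now a
     * pointer, so the '&' on the queue argument is gone: */
    btrfs_init_work(work, some_fn, NULL /* ordered_func */, NULL /* ordered_free */);
    btrfs_queue_work(fs_info->delayed_workers, work);
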
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index f3bff89eecf0..31299646024d 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -199,44 +199,31 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
199 */ 199 */
200static struct btrfs_delayed_ref_head * 200static struct btrfs_delayed_ref_head *
201find_ref_head(struct rb_root *root, u64 bytenr, 201find_ref_head(struct rb_root *root, u64 bytenr,
202 struct btrfs_delayed_ref_head **last, int return_bigger) 202 int return_bigger)
203{ 203{
204 struct rb_node *n; 204 struct rb_node *n;
205 struct btrfs_delayed_ref_head *entry; 205 struct btrfs_delayed_ref_head *entry;
206 int cmp = 0;
207 206
208again:
209 n = root->rb_node; 207 n = root->rb_node;
210 entry = NULL; 208 entry = NULL;
211 while (n) { 209 while (n) {
212 entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); 210 entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
213 if (last)
214 *last = entry;
215 211
216 if (bytenr < entry->node.bytenr) 212 if (bytenr < entry->node.bytenr)
217 cmp = -1;
218 else if (bytenr > entry->node.bytenr)
219 cmp = 1;
220 else
221 cmp = 0;
222
223 if (cmp < 0)
224 n = n->rb_left; 213 n = n->rb_left;
225 else if (cmp > 0) 214 else if (bytenr > entry->node.bytenr)
226 n = n->rb_right; 215 n = n->rb_right;
227 else 216 else
228 return entry; 217 return entry;
229 } 218 }
230 if (entry && return_bigger) { 219 if (entry && return_bigger) {
231 if (cmp > 0) { 220 if (bytenr > entry->node.bytenr) {
232 n = rb_next(&entry->href_node); 221 n = rb_next(&entry->href_node);
233 if (!n) 222 if (!n)
234 n = rb_first(root); 223 n = rb_first(root);
235 entry = rb_entry(n, struct btrfs_delayed_ref_head, 224 entry = rb_entry(n, struct btrfs_delayed_ref_head,
236 href_node); 225 href_node);
237 bytenr = entry->node.bytenr; 226 return entry;
238 return_bigger = 0;
239 goto again;
240 } 227 }
241 return entry; 228 return entry;
242 } 229 }
@@ -415,12 +402,12 @@ btrfs_select_ref_head(struct btrfs_trans_handle *trans)
415 402
416again: 403again:
417 start = delayed_refs->run_delayed_start; 404 start = delayed_refs->run_delayed_start;
418 head = find_ref_head(&delayed_refs->href_root, start, NULL, 1); 405 head = find_ref_head(&delayed_refs->href_root, start, 1);
419 if (!head && !loop) { 406 if (!head && !loop) {
420 delayed_refs->run_delayed_start = 0; 407 delayed_refs->run_delayed_start = 0;
421 start = 0; 408 start = 0;
422 loop = true; 409 loop = true;
423 head = find_ref_head(&delayed_refs->href_root, start, NULL, 1); 410 head = find_ref_head(&delayed_refs->href_root, start, 1);
424 if (!head) 411 if (!head)
425 return NULL; 412 return NULL;
426 } else if (!head && loop) { 413 } else if (!head && loop) {
@@ -508,6 +495,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
508 ref = btrfs_delayed_node_to_head(update); 495 ref = btrfs_delayed_node_to_head(update);
509 BUG_ON(existing_ref->is_data != ref->is_data); 496 BUG_ON(existing_ref->is_data != ref->is_data);
510 497
498 spin_lock(&existing_ref->lock);
511 if (ref->must_insert_reserved) { 499 if (ref->must_insert_reserved) {
512 /* if the extent was freed and then 500 /* if the extent was freed and then
513 * reallocated before the delayed ref 501 * reallocated before the delayed ref
@@ -549,7 +537,6 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
549 * only need the lock for this case cause we could be processing it 537 * only need the lock for this case cause we could be processing it
550 * currently, for refs we just added we know we're a-ok. 538 * currently, for refs we just added we know we're a-ok.
551 */ 539 */
552 spin_lock(&existing_ref->lock);
553 existing->ref_mod += update->ref_mod; 540 existing->ref_mod += update->ref_mod;
554 spin_unlock(&existing_ref->lock); 541 spin_unlock(&existing_ref->lock);
555} 542}
@@ -898,7 +885,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
898 struct btrfs_delayed_ref_root *delayed_refs; 885 struct btrfs_delayed_ref_root *delayed_refs;
899 886
900 delayed_refs = &trans->transaction->delayed_refs; 887 delayed_refs = &trans->transaction->delayed_refs;
901 return find_ref_head(&delayed_refs->href_root, bytenr, NULL, 0); 888 return find_ref_head(&delayed_refs->href_root, bytenr, 0);
902} 889}
903 890
904void btrfs_delayed_ref_exit(void) 891void btrfs_delayed_ref_exit(void)
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 564c92638b20..9f2290509aca 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -431,6 +431,35 @@ leave_no_lock:
431 return ret; 431 return ret;
432} 432}
433 433
434/*
435 * Block new bios and wait until all in-flight bios have finished.
436 */
437static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
438{
439 s64 writers;
440 DEFINE_WAIT(wait);
441
442 set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
443 do {
444 prepare_to_wait(&fs_info->replace_wait, &wait,
445 TASK_UNINTERRUPTIBLE);
446 writers = percpu_counter_sum(&fs_info->bio_counter);
447 if (writers)
448 schedule();
449 finish_wait(&fs_info->replace_wait, &wait);
450 } while (writers);
451}
452
453/*
454 * The target device has been removed; it is safe to allow new bio requests.
455 */
456static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
457{
458 clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
459 if (waitqueue_active(&fs_info->replace_wait))
460 wake_up(&fs_info->replace_wait);
461}
462
434static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, 463static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
435 int scrub_ret) 464 int scrub_ret)
436{ 465{
@@ -458,17 +487,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
458 src_device = dev_replace->srcdev; 487 src_device = dev_replace->srcdev;
459 btrfs_dev_replace_unlock(dev_replace); 488 btrfs_dev_replace_unlock(dev_replace);
460 489
461 /* replace old device with new one in mapping tree */
462 if (!scrub_ret)
463 btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
464 src_device,
465 tgt_device);
466
467 /* 490 /*
468 * flush all outstanding I/O and inode extent mappings before the 491 * flush all outstanding I/O and inode extent mappings before the
469 * copy operation is declared as being finished 492 * copy operation is declared as being finished
470 */ 493 */
471 ret = btrfs_start_delalloc_roots(root->fs_info, 0); 494 ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
472 if (ret) { 495 if (ret) {
473 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 496 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
474 return ret; 497 return ret;
@@ -484,6 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
484 WARN_ON(ret); 507 WARN_ON(ret);
485 508
486 /* keep away write_all_supers() during the finishing procedure */ 509 /* keep away write_all_supers() during the finishing procedure */
510 mutex_lock(&root->fs_info->chunk_mutex);
487 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 511 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
488 btrfs_dev_replace_lock(dev_replace); 512 btrfs_dev_replace_lock(dev_replace);
489 dev_replace->replace_state = 513 dev_replace->replace_state =
@@ -494,7 +518,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
494 dev_replace->time_stopped = get_seconds(); 518 dev_replace->time_stopped = get_seconds();
495 dev_replace->item_needs_writeback = 1; 519 dev_replace->item_needs_writeback = 1;
496 520
497 if (scrub_ret) { 521 /* replace old device with new one in mapping tree */
522 if (!scrub_ret) {
523 btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
524 src_device,
525 tgt_device);
526 } else {
498 printk_in_rcu(KERN_ERR 527 printk_in_rcu(KERN_ERR
499 "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", 528 "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
500 src_device->missing ? "<missing disk>" : 529 src_device->missing ? "<missing disk>" :
@@ -503,6 +532,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
503 rcu_str_deref(tgt_device->name), scrub_ret); 532 rcu_str_deref(tgt_device->name), scrub_ret);
504 btrfs_dev_replace_unlock(dev_replace); 533 btrfs_dev_replace_unlock(dev_replace);
505 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 534 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
535 mutex_unlock(&root->fs_info->chunk_mutex);
506 if (tgt_device) 536 if (tgt_device)
507 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); 537 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
508 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 538 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
@@ -532,8 +562,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
532 fs_info->fs_devices->latest_bdev = tgt_device->bdev; 562 fs_info->fs_devices->latest_bdev = tgt_device->bdev;
533 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); 563 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
534 564
565 btrfs_rm_dev_replace_blocked(fs_info);
566
535 btrfs_rm_dev_replace_srcdev(fs_info, src_device); 567 btrfs_rm_dev_replace_srcdev(fs_info, src_device);
536 568
569 btrfs_rm_dev_replace_unblocked(fs_info);
570
537 /* 571 /*
538 * this is again a consistent state where no dev_replace procedure 572 * this is again a consistent state where no dev_replace procedure
539 * is running, the target device is part of the filesystem, the 573 * is running, the target device is part of the filesystem, the
@@ -543,6 +577,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
543 */ 577 */
544 btrfs_dev_replace_unlock(dev_replace); 578 btrfs_dev_replace_unlock(dev_replace);
545 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 579 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
580 mutex_unlock(&root->fs_info->chunk_mutex);
546 581
547 /* write back the superblocks */ 582 /* write back the superblocks */
548 trans = btrfs_start_transaction(root, 0); 583 trans = btrfs_start_transaction(root, 0);
@@ -862,3 +897,31 @@ void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
862 mutex_unlock(&dev_replace->lock_management_lock); 897 mutex_unlock(&dev_replace->lock_management_lock);
863 } 898 }
864} 899}
900
901void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
902{
903 percpu_counter_inc(&fs_info->bio_counter);
904}
905
906void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
907{
908 percpu_counter_dec(&fs_info->bio_counter);
909
910 if (waitqueue_active(&fs_info->replace_wait))
911 wake_up(&fs_info->replace_wait);
912}
913
914void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
915{
916 DEFINE_WAIT(wait);
917again:
918 percpu_counter_inc(&fs_info->bio_counter);
919 if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) {
920 btrfs_bio_counter_dec(fs_info);
921 wait_event(fs_info->replace_wait,
922 !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
923 &fs_info->fs_state));
924 goto again;
925 }
926
927}
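
The counter/flag protocol above is easier to see in one place. Below is a self-contained userspace model of the same gate; it is only a conceptual analogue, since it serializes on one mutex where the kernel code uses a percpu counter, a waitqueue and memory barriers to keep the submit fast path lock-free.

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
    static long in_flight;      /* models fs_info->bio_counter */
    static bool replacing;      /* models BTRFS_FS_STATE_DEV_REPLACING */

    static void bio_enter(void) /* ~ btrfs_bio_counter_inc_blocked() */
    {
            pthread_mutex_lock(&lock);
            while (replacing)   /* new I/O waits out the replace window */
                    pthread_cond_wait(&cond, &lock);
            in_flight++;
            pthread_mutex_unlock(&lock);
    }

    static void bio_exit(void)  /* ~ btrfs_bio_counter_dec() */
    {
            pthread_mutex_lock(&lock);
            if (--in_flight == 0)   /* wake a drainer in replace_drain() */
                    pthread_cond_broadcast(&cond);
            pthread_mutex_unlock(&lock);
    }

    static void replace_drain(void) /* ~ btrfs_rm_dev_replace_blocked() */
    {
            pthread_mutex_lock(&lock);
            replacing = true;   /* block newcomers ... */
            while (in_flight)   /* ... and wait for in-flight I/O to finish */
                    pthread_cond_wait(&cond, &lock);
            pthread_mutex_unlock(&lock);
    }

    static void replace_done(void)  /* ~ btrfs_rm_dev_replace_unblocked() */
    {
            pthread_mutex_lock(&lock);
            replacing = false;
            pthread_cond_broadcast(&cond);  /* release bio_enter() waiters */
            pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
            bio_enter(); bio_exit();    /* ordinary I/O */
            replace_drain();            /* srcdev can now be removed safely */
            replace_done();
            return 0;
    }

In the kernel version, bio_enter() corresponds to the increment-then-recheck loop in btrfs_bio_counter_inc_blocked(): the counter is bumped first and dropped again only if the replacing bit turns out to be set, so no lock is taken when no replace is running.
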
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 81ea55314b1f..bd0f752b797b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -678,32 +678,31 @@ static void end_workqueue_bio(struct bio *bio, int err)
678 678
679 fs_info = end_io_wq->info; 679 fs_info = end_io_wq->info;
680 end_io_wq->error = err; 680 end_io_wq->error = err;
681 end_io_wq->work.func = end_workqueue_fn; 681 btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
682 end_io_wq->work.flags = 0;
683 682
684 if (bio->bi_rw & REQ_WRITE) { 683 if (bio->bi_rw & REQ_WRITE) {
685 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) 684 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
686 btrfs_queue_worker(&fs_info->endio_meta_write_workers, 685 btrfs_queue_work(fs_info->endio_meta_write_workers,
687 &end_io_wq->work); 686 &end_io_wq->work);
688 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) 687 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
689 btrfs_queue_worker(&fs_info->endio_freespace_worker, 688 btrfs_queue_work(fs_info->endio_freespace_worker,
690 &end_io_wq->work); 689 &end_io_wq->work);
691 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) 690 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
692 btrfs_queue_worker(&fs_info->endio_raid56_workers, 691 btrfs_queue_work(fs_info->endio_raid56_workers,
693 &end_io_wq->work); 692 &end_io_wq->work);
694 else 693 else
695 btrfs_queue_worker(&fs_info->endio_write_workers, 694 btrfs_queue_work(fs_info->endio_write_workers,
696 &end_io_wq->work); 695 &end_io_wq->work);
697 } else { 696 } else {
698 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) 697 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
699 btrfs_queue_worker(&fs_info->endio_raid56_workers, 698 btrfs_queue_work(fs_info->endio_raid56_workers,
700 &end_io_wq->work); 699 &end_io_wq->work);
701 else if (end_io_wq->metadata) 700 else if (end_io_wq->metadata)
702 btrfs_queue_worker(&fs_info->endio_meta_workers, 701 btrfs_queue_work(fs_info->endio_meta_workers,
703 &end_io_wq->work); 702 &end_io_wq->work);
704 else 703 else
705 btrfs_queue_worker(&fs_info->endio_workers, 704 btrfs_queue_work(fs_info->endio_workers,
706 &end_io_wq->work); 705 &end_io_wq->work);
707 } 706 }
708} 707}
709 708
@@ -738,7 +737,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
738unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) 737unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
739{ 738{
740 unsigned long limit = min_t(unsigned long, 739 unsigned long limit = min_t(unsigned long,
741 info->workers.max_workers, 740 info->thread_pool_size,
742 info->fs_devices->open_devices); 741 info->fs_devices->open_devices);
743 return 256 * limit; 742 return 256 * limit;
744} 743}
@@ -811,11 +810,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
811 async->submit_bio_start = submit_bio_start; 810 async->submit_bio_start = submit_bio_start;
812 async->submit_bio_done = submit_bio_done; 811 async->submit_bio_done = submit_bio_done;
813 812
814 async->work.func = run_one_async_start; 813 btrfs_init_work(&async->work, run_one_async_start,
815 async->work.ordered_func = run_one_async_done; 814 run_one_async_done, run_one_async_free);
816 async->work.ordered_free = run_one_async_free;
817 815
818 async->work.flags = 0;
819 async->bio_flags = bio_flags; 816 async->bio_flags = bio_flags;
820 async->bio_offset = bio_offset; 817 async->bio_offset = bio_offset;
821 818
@@ -824,9 +821,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
824 atomic_inc(&fs_info->nr_async_submits); 821 atomic_inc(&fs_info->nr_async_submits);
825 822
826 if (rw & REQ_SYNC) 823 if (rw & REQ_SYNC)
827 btrfs_set_work_high_prio(&async->work); 824 btrfs_set_work_high_priority(&async->work);
828 825
829 btrfs_queue_worker(&fs_info->workers, &async->work); 826 btrfs_queue_work(fs_info->workers, &async->work);
830 827
831 while (atomic_read(&fs_info->async_submit_draining) && 828 while (atomic_read(&fs_info->async_submit_draining) &&
832 atomic_read(&fs_info->nr_async_submits)) { 829 atomic_read(&fs_info->nr_async_submits)) {
@@ -1149,6 +1146,32 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1149 } 1146 }
1150} 1147}
1151 1148
1149static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
1150{
1151 struct btrfs_subvolume_writers *writers;
1152 int ret;
1153
1154 writers = kmalloc(sizeof(*writers), GFP_NOFS);
1155 if (!writers)
1156 return ERR_PTR(-ENOMEM);
1157
1158 ret = percpu_counter_init(&writers->counter, 0);
1159 if (ret < 0) {
1160 kfree(writers);
1161 return ERR_PTR(ret);
1162 }
1163
1164 init_waitqueue_head(&writers->wait);
1165 return writers;
1166}
1167
1168static void
1169btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
1170{
1171 percpu_counter_destroy(&writers->counter);
1172 kfree(writers);
1173}
1174
1152static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, 1175static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1153 u32 stripesize, struct btrfs_root *root, 1176 u32 stripesize, struct btrfs_root *root,
1154 struct btrfs_fs_info *fs_info, 1177 struct btrfs_fs_info *fs_info,
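
btrfs_alloc_subvolume_writers() follows the usual ERR_PTR convention; a caller sketch matching how btrfs_init_fs_root() consumes it further down in this file:

    struct btrfs_subvolume_writers *writers;

    writers = btrfs_alloc_subvolume_writers();
    if (IS_ERR(writers))
            return PTR_ERR(writers);    /* -ENOMEM or the percpu_counter_init() error */
    root->subv_writers = writers;
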
@@ -1194,16 +1217,22 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1194 spin_lock_init(&root->log_extents_lock[1]); 1217 spin_lock_init(&root->log_extents_lock[1]);
1195 mutex_init(&root->objectid_mutex); 1218 mutex_init(&root->objectid_mutex);
1196 mutex_init(&root->log_mutex); 1219 mutex_init(&root->log_mutex);
1220 mutex_init(&root->ordered_extent_mutex);
1221 mutex_init(&root->delalloc_mutex);
1197 init_waitqueue_head(&root->log_writer_wait); 1222 init_waitqueue_head(&root->log_writer_wait);
1198 init_waitqueue_head(&root->log_commit_wait[0]); 1223 init_waitqueue_head(&root->log_commit_wait[0]);
1199 init_waitqueue_head(&root->log_commit_wait[1]); 1224 init_waitqueue_head(&root->log_commit_wait[1]);
1225 INIT_LIST_HEAD(&root->log_ctxs[0]);
1226 INIT_LIST_HEAD(&root->log_ctxs[1]);
1200 atomic_set(&root->log_commit[0], 0); 1227 atomic_set(&root->log_commit[0], 0);
1201 atomic_set(&root->log_commit[1], 0); 1228 atomic_set(&root->log_commit[1], 0);
1202 atomic_set(&root->log_writers, 0); 1229 atomic_set(&root->log_writers, 0);
1203 atomic_set(&root->log_batch, 0); 1230 atomic_set(&root->log_batch, 0);
1204 atomic_set(&root->orphan_inodes, 0); 1231 atomic_set(&root->orphan_inodes, 0);
1205 atomic_set(&root->refs, 1); 1232 atomic_set(&root->refs, 1);
1233 atomic_set(&root->will_be_snapshoted, 0);
1206 root->log_transid = 0; 1234 root->log_transid = 0;
1235 root->log_transid_committed = -1;
1207 root->last_log_commit = 0; 1236 root->last_log_commit = 0;
1208 if (fs_info) 1237 if (fs_info)
1209 extent_io_tree_init(&root->dirty_log_pages, 1238 extent_io_tree_init(&root->dirty_log_pages,
@@ -1417,6 +1446,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1417 WARN_ON(root->log_root); 1446 WARN_ON(root->log_root);
1418 root->log_root = log_root; 1447 root->log_root = log_root;
1419 root->log_transid = 0; 1448 root->log_transid = 0;
1449 root->log_transid_committed = -1;
1420 root->last_log_commit = 0; 1450 root->last_log_commit = 0;
1421 return 0; 1451 return 0;
1422} 1452}
@@ -1498,6 +1528,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
1498int btrfs_init_fs_root(struct btrfs_root *root) 1528int btrfs_init_fs_root(struct btrfs_root *root)
1499{ 1529{
1500 int ret; 1530 int ret;
1531 struct btrfs_subvolume_writers *writers;
1501 1532
1502 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); 1533 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
1503 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), 1534 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
@@ -1507,6 +1538,13 @@ int btrfs_init_fs_root(struct btrfs_root *root)
1507 goto fail; 1538 goto fail;
1508 } 1539 }
1509 1540
1541 writers = btrfs_alloc_subvolume_writers();
1542 if (IS_ERR(writers)) {
1543 ret = PTR_ERR(writers);
1544 goto fail;
1545 }
1546 root->subv_writers = writers;
1547
1510 btrfs_init_free_ino_ctl(root); 1548 btrfs_init_free_ino_ctl(root);
1511 mutex_init(&root->fs_commit_mutex); 1549 mutex_init(&root->fs_commit_mutex);
1512 spin_lock_init(&root->cache_lock); 1550 spin_lock_init(&root->cache_lock);
@@ -1514,8 +1552,11 @@ int btrfs_init_fs_root(struct btrfs_root *root)
1514 1552
1515 ret = get_anon_bdev(&root->anon_dev); 1553 ret = get_anon_bdev(&root->anon_dev);
1516 if (ret) 1554 if (ret)
1517 goto fail; 1555 goto free_writers;
1518 return 0; 1556 return 0;
1557
1558free_writers:
1559 btrfs_free_subvolume_writers(root->subv_writers);
1519fail: 1560fail:
1520 kfree(root->free_ino_ctl); 1561 kfree(root->free_ino_ctl);
1521 kfree(root->free_ino_pinned); 1562 kfree(root->free_ino_pinned);
@@ -1990,23 +2031,22 @@ static noinline int next_root_backup(struct btrfs_fs_info *info,
1990/* helper to cleanup workers */ 2031/* helper to cleanup workers */
1991static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) 2032static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
1992{ 2033{
1993 btrfs_stop_workers(&fs_info->generic_worker); 2034 btrfs_destroy_workqueue(fs_info->fixup_workers);
1994 btrfs_stop_workers(&fs_info->fixup_workers); 2035 btrfs_destroy_workqueue(fs_info->delalloc_workers);
1995 btrfs_stop_workers(&fs_info->delalloc_workers); 2036 btrfs_destroy_workqueue(fs_info->workers);
1996 btrfs_stop_workers(&fs_info->workers); 2037 btrfs_destroy_workqueue(fs_info->endio_workers);
1997 btrfs_stop_workers(&fs_info->endio_workers); 2038 btrfs_destroy_workqueue(fs_info->endio_meta_workers);
1998 btrfs_stop_workers(&fs_info->endio_meta_workers); 2039 btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
1999 btrfs_stop_workers(&fs_info->endio_raid56_workers); 2040 btrfs_destroy_workqueue(fs_info->rmw_workers);
2000 btrfs_stop_workers(&fs_info->rmw_workers); 2041 btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
2001 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2042 btrfs_destroy_workqueue(fs_info->endio_write_workers);
2002 btrfs_stop_workers(&fs_info->endio_write_workers); 2043 btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
2003 btrfs_stop_workers(&fs_info->endio_freespace_worker); 2044 btrfs_destroy_workqueue(fs_info->submit_workers);
2004 btrfs_stop_workers(&fs_info->submit_workers); 2045 btrfs_destroy_workqueue(fs_info->delayed_workers);
2005 btrfs_stop_workers(&fs_info->delayed_workers); 2046 btrfs_destroy_workqueue(fs_info->caching_workers);
2006 btrfs_stop_workers(&fs_info->caching_workers); 2047 btrfs_destroy_workqueue(fs_info->readahead_workers);
2007 btrfs_stop_workers(&fs_info->readahead_workers); 2048 btrfs_destroy_workqueue(fs_info->flush_workers);
2008 btrfs_stop_workers(&fs_info->flush_workers); 2049 btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
2009 btrfs_stop_workers(&fs_info->qgroup_rescan_workers);
2010} 2050}
2011 2051
2012static void free_root_extent_buffers(struct btrfs_root *root) 2052static void free_root_extent_buffers(struct btrfs_root *root)
@@ -2097,6 +2137,8 @@ int open_ctree(struct super_block *sb,
2097 int err = -EINVAL; 2137 int err = -EINVAL;
2098 int num_backups_tried = 0; 2138 int num_backups_tried = 0;
2099 int backup_index = 0; 2139 int backup_index = 0;
2140 int max_active;
2141 int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
2100 bool create_uuid_tree; 2142 bool create_uuid_tree;
2101 bool check_uuid_tree; 2143 bool check_uuid_tree;
2102 2144
@@ -2133,10 +2175,16 @@ int open_ctree(struct super_block *sb,
2133 goto fail_dirty_metadata_bytes; 2175 goto fail_dirty_metadata_bytes;
2134 } 2176 }
2135 2177
2178 ret = percpu_counter_init(&fs_info->bio_counter, 0);
2179 if (ret) {
2180 err = ret;
2181 goto fail_delalloc_bytes;
2182 }
2183
2136 fs_info->btree_inode = new_inode(sb); 2184 fs_info->btree_inode = new_inode(sb);
2137 if (!fs_info->btree_inode) { 2185 if (!fs_info->btree_inode) {
2138 err = -ENOMEM; 2186 err = -ENOMEM;
2139 goto fail_delalloc_bytes; 2187 goto fail_bio_counter;
2140 } 2188 }
2141 2189
2142 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); 2190 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -2159,6 +2207,7 @@ int open_ctree(struct super_block *sb,
2159 spin_lock_init(&fs_info->buffer_lock); 2207 spin_lock_init(&fs_info->buffer_lock);
2160 rwlock_init(&fs_info->tree_mod_log_lock); 2208 rwlock_init(&fs_info->tree_mod_log_lock);
2161 mutex_init(&fs_info->reloc_mutex); 2209 mutex_init(&fs_info->reloc_mutex);
2210 mutex_init(&fs_info->delalloc_root_mutex);
2162 seqlock_init(&fs_info->profiles_lock); 2211 seqlock_init(&fs_info->profiles_lock);
2163 2212
2164 init_completion(&fs_info->kobj_unregister); 2213 init_completion(&fs_info->kobj_unregister);
@@ -2211,6 +2260,7 @@ int open_ctree(struct super_block *sb,
2211 atomic_set(&fs_info->scrub_pause_req, 0); 2260 atomic_set(&fs_info->scrub_pause_req, 0);
2212 atomic_set(&fs_info->scrubs_paused, 0); 2261 atomic_set(&fs_info->scrubs_paused, 0);
2213 atomic_set(&fs_info->scrub_cancel_req, 0); 2262 atomic_set(&fs_info->scrub_cancel_req, 0);
2263 init_waitqueue_head(&fs_info->replace_wait);
2214 init_waitqueue_head(&fs_info->scrub_pause_wait); 2264 init_waitqueue_head(&fs_info->scrub_pause_wait);
2215 fs_info->scrub_workers_refcnt = 0; 2265 fs_info->scrub_workers_refcnt = 0;
2216#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 2266#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
@@ -2458,104 +2508,68 @@ int open_ctree(struct super_block *sb,
2458 goto fail_alloc; 2508 goto fail_alloc;
2459 } 2509 }
2460 2510
2461 btrfs_init_workers(&fs_info->generic_worker, 2511 max_active = fs_info->thread_pool_size;
2462 "genwork", 1, NULL);
2463
2464 btrfs_init_workers(&fs_info->workers, "worker",
2465 fs_info->thread_pool_size,
2466 &fs_info->generic_worker);
2467 2512
2468 btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", 2513 fs_info->workers =
2469 fs_info->thread_pool_size, NULL); 2514 btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
2515 max_active, 16);
2470 2516
2471 btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc", 2517 fs_info->delalloc_workers =
2472 fs_info->thread_pool_size, NULL); 2518 btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
2473 2519
2474 btrfs_init_workers(&fs_info->submit_workers, "submit", 2520 fs_info->flush_workers =
2475 min_t(u64, fs_devices->num_devices, 2521 btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
2476 fs_info->thread_pool_size), NULL);
2477 2522
2478 btrfs_init_workers(&fs_info->caching_workers, "cache", 2523 fs_info->caching_workers =
2479 fs_info->thread_pool_size, NULL); 2524 btrfs_alloc_workqueue("cache", flags, max_active, 0);
2480 2525
2481 /* a higher idle thresh on the submit workers makes it much more 2526 /*
2527 * a higher idle thresh on the submit workers makes it much more
2482 * likely that bios will be sent down in a sane order to the 2528 * likely that bios will be sent down in a sane order to the
2483 * devices 2529 * devices
2484 */ 2530 */
2485 fs_info->submit_workers.idle_thresh = 64; 2531 fs_info->submit_workers =
2486 2532 btrfs_alloc_workqueue("submit", flags,
2487 fs_info->workers.idle_thresh = 16; 2533 min_t(u64, fs_devices->num_devices,
2488 fs_info->workers.ordered = 1; 2534 max_active), 64);
2489 2535
2490 fs_info->delalloc_workers.idle_thresh = 2; 2536 fs_info->fixup_workers =
2491 fs_info->delalloc_workers.ordered = 1; 2537 btrfs_alloc_workqueue("fixup", flags, 1, 0);
2492
2493 btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
2494 &fs_info->generic_worker);
2495 btrfs_init_workers(&fs_info->endio_workers, "endio",
2496 fs_info->thread_pool_size,
2497 &fs_info->generic_worker);
2498 btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
2499 fs_info->thread_pool_size,
2500 &fs_info->generic_worker);
2501 btrfs_init_workers(&fs_info->endio_meta_write_workers,
2502 "endio-meta-write", fs_info->thread_pool_size,
2503 &fs_info->generic_worker);
2504 btrfs_init_workers(&fs_info->endio_raid56_workers,
2505 "endio-raid56", fs_info->thread_pool_size,
2506 &fs_info->generic_worker);
2507 btrfs_init_workers(&fs_info->rmw_workers,
2508 "rmw", fs_info->thread_pool_size,
2509 &fs_info->generic_worker);
2510 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
2511 fs_info->thread_pool_size,
2512 &fs_info->generic_worker);
2513 btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
2514 1, &fs_info->generic_worker);
2515 btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
2516 fs_info->thread_pool_size,
2517 &fs_info->generic_worker);
2518 btrfs_init_workers(&fs_info->readahead_workers, "readahead",
2519 fs_info->thread_pool_size,
2520 &fs_info->generic_worker);
2521 btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1,
2522 &fs_info->generic_worker);
2523 2538
2524 /* 2539 /*
2525 * endios are largely parallel and should have a very 2540 * endios are largely parallel and should have a very
2526 * low idle thresh 2541 * low idle thresh
2527 */ 2542 */
2528 fs_info->endio_workers.idle_thresh = 4; 2543 fs_info->endio_workers =
2529 fs_info->endio_meta_workers.idle_thresh = 4; 2544 btrfs_alloc_workqueue("endio", flags, max_active, 4);
2530 fs_info->endio_raid56_workers.idle_thresh = 4; 2545 fs_info->endio_meta_workers =
2531 fs_info->rmw_workers.idle_thresh = 2; 2546 btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
2532 2547 fs_info->endio_meta_write_workers =
2533 fs_info->endio_write_workers.idle_thresh = 2; 2548 btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
2534 fs_info->endio_meta_write_workers.idle_thresh = 2; 2549 fs_info->endio_raid56_workers =
2535 fs_info->readahead_workers.idle_thresh = 2; 2550 btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
2536 2551 fs_info->rmw_workers =
2537 /* 2552 btrfs_alloc_workqueue("rmw", flags, max_active, 2);
2538 * btrfs_start_workers can really only fail because of ENOMEM so just 2553 fs_info->endio_write_workers =
2539 * return -ENOMEM if any of these fail. 2554 btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
2540 */ 2555 fs_info->endio_freespace_worker =
2541 ret = btrfs_start_workers(&fs_info->workers); 2556 btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
2542 ret |= btrfs_start_workers(&fs_info->generic_worker); 2557 fs_info->delayed_workers =
2543 ret |= btrfs_start_workers(&fs_info->submit_workers); 2558 btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
2544 ret |= btrfs_start_workers(&fs_info->delalloc_workers); 2559 fs_info->readahead_workers =
2545 ret |= btrfs_start_workers(&fs_info->fixup_workers); 2560 btrfs_alloc_workqueue("readahead", flags, max_active, 2);
2546 ret |= btrfs_start_workers(&fs_info->endio_workers); 2561 fs_info->qgroup_rescan_workers =
2547 ret |= btrfs_start_workers(&fs_info->endio_meta_workers); 2562 btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
2548 ret |= btrfs_start_workers(&fs_info->rmw_workers); 2563
2549 ret |= btrfs_start_workers(&fs_info->endio_raid56_workers); 2564 if (!(fs_info->workers && fs_info->delalloc_workers &&
2550 ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); 2565 fs_info->submit_workers && fs_info->flush_workers &&
2551 ret |= btrfs_start_workers(&fs_info->endio_write_workers); 2566 fs_info->endio_workers && fs_info->endio_meta_workers &&
2552 ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); 2567 fs_info->endio_meta_write_workers &&
2553 ret |= btrfs_start_workers(&fs_info->delayed_workers); 2568 fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
2554 ret |= btrfs_start_workers(&fs_info->caching_workers); 2569 fs_info->endio_freespace_worker && fs_info->rmw_workers &&
2555 ret |= btrfs_start_workers(&fs_info->readahead_workers); 2570 fs_info->caching_workers && fs_info->readahead_workers &&
2556 ret |= btrfs_start_workers(&fs_info->flush_workers); 2571 fs_info->fixup_workers && fs_info->delayed_workers &&
2557 ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers); 2572 fs_info->qgroup_rescan_workers)) {
2558 if (ret) {
2559 err = -ENOMEM; 2573 err = -ENOMEM;
2560 goto fail_sb_buffer; 2574 goto fail_sb_buffer;
2561 } 2575 }
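
Contract sketch for the block above, under the stated assumption that btrfs_alloc_workqueue() returns NULL on allocation failure: the queues are allocated individually and validated as one group, with the error path unwinding through btrfs_stop_all_workers()/btrfs_destroy_workqueue():

    fs_info->fixup_workers = btrfs_alloc_workqueue("fixup", flags, 1, 0);
    /* ... allocate the remaining queues the same way ... */
    if (!(fs_info->workers && fs_info->fixup_workers /* && the rest */)) {
            err = -ENOMEM;
            goto fail_sb_buffer;
    }
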
@@ -2963,6 +2977,8 @@ fail_iput:
2963 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2977 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2964 2978
2965 iput(fs_info->btree_inode); 2979 iput(fs_info->btree_inode);
2980fail_bio_counter:
2981 percpu_counter_destroy(&fs_info->bio_counter);
2966fail_delalloc_bytes: 2982fail_delalloc_bytes:
2967 percpu_counter_destroy(&fs_info->delalloc_bytes); 2983 percpu_counter_destroy(&fs_info->delalloc_bytes);
2968fail_dirty_metadata_bytes: 2984fail_dirty_metadata_bytes:
@@ -3244,6 +3260,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
3244 /* send down all the barriers */ 3260 /* send down all the barriers */
3245 head = &info->fs_devices->devices; 3261 head = &info->fs_devices->devices;
3246 list_for_each_entry_rcu(dev, head, dev_list) { 3262 list_for_each_entry_rcu(dev, head, dev_list) {
3263 if (dev->missing)
3264 continue;
3247 if (!dev->bdev) { 3265 if (!dev->bdev) {
3248 errors_send++; 3266 errors_send++;
3249 continue; 3267 continue;
@@ -3258,6 +3276,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
3258 3276
3259 /* wait for all the barriers */ 3277 /* wait for all the barriers */
3260 list_for_each_entry_rcu(dev, head, dev_list) { 3278 list_for_each_entry_rcu(dev, head, dev_list) {
3279 if (dev->missing)
3280 continue;
3261 if (!dev->bdev) { 3281 if (!dev->bdev) {
3262 errors_wait++; 3282 errors_wait++;
3263 continue; 3283 continue;
@@ -3477,6 +3497,8 @@ static void free_fs_root(struct btrfs_root *root)
3477 root->orphan_block_rsv = NULL; 3497 root->orphan_block_rsv = NULL;
3478 if (root->anon_dev) 3498 if (root->anon_dev)
3479 free_anon_bdev(root->anon_dev); 3499 free_anon_bdev(root->anon_dev);
3500 if (root->subv_writers)
3501 btrfs_free_subvolume_writers(root->subv_writers);
3480 free_extent_buffer(root->node); 3502 free_extent_buffer(root->node);
3481 free_extent_buffer(root->commit_root); 3503 free_extent_buffer(root->commit_root);
3482 kfree(root->free_ino_ctl); 3504 kfree(root->free_ino_ctl);
@@ -3610,6 +3632,7 @@ int close_ctree(struct btrfs_root *root)
3610 3632
3611 percpu_counter_destroy(&fs_info->dirty_metadata_bytes); 3633 percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
3612 percpu_counter_destroy(&fs_info->delalloc_bytes); 3634 percpu_counter_destroy(&fs_info->delalloc_bytes);
3635 percpu_counter_destroy(&fs_info->bio_counter);
3613 bdi_destroy(&fs_info->bdi); 3636 bdi_destroy(&fs_info->bdi);
3614 cleanup_srcu_struct(&fs_info->subvol_srcu); 3637 cleanup_srcu_struct(&fs_info->subvol_srcu);
3615 3638
@@ -3791,9 +3814,11 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
3791 list_move_tail(&root->ordered_root, 3814 list_move_tail(&root->ordered_root,
3792 &fs_info->ordered_roots); 3815 &fs_info->ordered_roots);
3793 3816
3817 spin_unlock(&fs_info->ordered_root_lock);
3794 btrfs_destroy_ordered_extents(root); 3818 btrfs_destroy_ordered_extents(root);
3795 3819
3796 cond_resched_lock(&fs_info->ordered_root_lock); 3820 cond_resched();
3821 spin_lock(&fs_info->ordered_root_lock);
3797 } 3822 }
3798 spin_unlock(&fs_info->ordered_root_lock); 3823 spin_unlock(&fs_info->ordered_root_lock);
3799} 3824}
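
The last disk-io.c hunk changes the iteration discipline: cond_resched_lock() is replaced by explicitly dropping ordered_root_lock around btrfs_destroy_ordered_extents(), presumably because that call can block. The resulting pattern, sketched:

    spin_lock(&fs_info->ordered_root_lock);
    while (!list_empty(&fs_info->ordered_roots)) {
            /* ... pick a root and move it to the list tail ... */
            spin_unlock(&fs_info->ordered_root_lock);
            btrfs_destroy_ordered_extents(root);    /* may block: no spinlock held */
            cond_resched();
            spin_lock(&fs_info->ordered_root_lock);
    }
    spin_unlock(&fs_info->ordered_root_lock);
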
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 32312e09f0f5..c6b6a6e3e735 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -549,7 +549,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
549 caching_ctl->block_group = cache; 549 caching_ctl->block_group = cache;
550 caching_ctl->progress = cache->key.objectid; 550 caching_ctl->progress = cache->key.objectid;
551 atomic_set(&caching_ctl->count, 1); 551 atomic_set(&caching_ctl->count, 1);
552 caching_ctl->work.func = caching_thread; 552 btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
553 553
554 spin_lock(&cache->lock); 554 spin_lock(&cache->lock);
555 /* 555 /*
@@ -640,7 +640,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
640 640
641 btrfs_get_block_group(cache); 641 btrfs_get_block_group(cache);
642 642
643 btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work); 643 btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
644 644
645 return ret; 645 return ret;
646} 646}
@@ -3971,7 +3971,7 @@ static int can_overcommit(struct btrfs_root *root,
3971} 3971}
3972 3972
3973static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, 3973static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3974 unsigned long nr_pages) 3974 unsigned long nr_pages, int nr_items)
3975{ 3975{
3976 struct super_block *sb = root->fs_info->sb; 3976 struct super_block *sb = root->fs_info->sb;
3977 3977
@@ -3986,9 +3986,9 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3986 * the filesystem is readonly (all dirty pages are written to 3986 * the filesystem is readonly (all dirty pages are written to
3987 * the disk). 3987 * the disk).
3988 */ 3988 */
3989 btrfs_start_delalloc_roots(root->fs_info, 0); 3989 btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
3990 if (!current->journal_info) 3990 if (!current->journal_info)
3991 btrfs_wait_ordered_roots(root->fs_info, -1); 3991 btrfs_wait_ordered_roots(root->fs_info, nr_items);
3992 } 3992 }
3993} 3993}
3994 3994
@@ -4045,7 +4045,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
4045 while (delalloc_bytes && loops < 3) { 4045 while (delalloc_bytes && loops < 3) {
4046 max_reclaim = min(delalloc_bytes, to_reclaim); 4046 max_reclaim = min(delalloc_bytes, to_reclaim);
4047 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 4047 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
4048 btrfs_writeback_inodes_sb_nr(root, nr_pages); 4048 btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
4049 /* 4049 /*
4050 * We need to wait for the async pages to actually start before 4050 * We need to wait for the async pages to actually start before
4051 * we do anything. 4051 * we do anything.
@@ -4112,13 +4112,9 @@ static int may_commit_transaction(struct btrfs_root *root,
4112 goto commit; 4112 goto commit;
4113 4113
4114 /* See if there is enough pinned space to make this reservation */ 4114 /* See if there is enough pinned space to make this reservation */
4115 spin_lock(&space_info->lock);
4116 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4115 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4117 bytes) >= 0) { 4116 bytes) >= 0)
4118 spin_unlock(&space_info->lock);
4119 goto commit; 4117 goto commit;
4120 }
4121 spin_unlock(&space_info->lock);
4122 4118
4123 /* 4119 /*
4124 * See if there is some space in the delayed insertion reservation for 4120 * See if there is some space in the delayed insertion reservation for
@@ -4127,16 +4123,13 @@ static int may_commit_transaction(struct btrfs_root *root,
4127 if (space_info != delayed_rsv->space_info) 4123 if (space_info != delayed_rsv->space_info)
4128 return -ENOSPC; 4124 return -ENOSPC;
4129 4125
4130 spin_lock(&space_info->lock);
4131 spin_lock(&delayed_rsv->lock); 4126 spin_lock(&delayed_rsv->lock);
4132 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4127 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4133 bytes - delayed_rsv->size) >= 0) { 4128 bytes - delayed_rsv->size) >= 0) {
4134 spin_unlock(&delayed_rsv->lock); 4129 spin_unlock(&delayed_rsv->lock);
4135 spin_unlock(&space_info->lock);
4136 return -ENOSPC; 4130 return -ENOSPC;
4137 } 4131 }
4138 spin_unlock(&delayed_rsv->lock); 4132 spin_unlock(&delayed_rsv->lock);
4139 spin_unlock(&space_info->lock);
4140 4133
4141commit: 4134commit:
4142 trans = btrfs_join_transaction(root); 4135 trans = btrfs_join_transaction(root);
@@ -4181,7 +4174,7 @@ static int flush_space(struct btrfs_root *root,
4181 break; 4174 break;
4182 case FLUSH_DELALLOC: 4175 case FLUSH_DELALLOC:
4183 case FLUSH_DELALLOC_WAIT: 4176 case FLUSH_DELALLOC_WAIT:
4184 shrink_delalloc(root, num_bytes, orig_bytes, 4177 shrink_delalloc(root, num_bytes * 2, orig_bytes,
4185 state == FLUSH_DELALLOC_WAIT); 4178 state == FLUSH_DELALLOC_WAIT);
4186 break; 4179 break;
4187 case ALLOC_CHUNK: 4180 case ALLOC_CHUNK:
@@ -8938,3 +8931,38 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8938 range->len = trimmed; 8931 range->len = trimmed;
8939 return ret; 8932 return ret;
8940} 8933}
8934
8935/*
8936 * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(),
8937 * they are used to prevent the some tasks writing data into the page cache
8938 * by nocow before the subvolume is snapshoted, but flush the data into
8939 * the disk after the snapshot creation.
8940 */
8941void btrfs_end_nocow_write(struct btrfs_root *root)
8942{
8943 percpu_counter_dec(&root->subv_writers->counter);
8944 /*
8945 * Make sure counter is updated before we wake up
8946 * waiters.
8947 */
8948 smp_mb();
8949 if (waitqueue_active(&root->subv_writers->wait))
8950 wake_up(&root->subv_writers->wait);
8951}
8952
8953int btrfs_start_nocow_write(struct btrfs_root *root)
8954{
8955 if (unlikely(atomic_read(&root->will_be_snapshoted)))
8956 return 0;
8957
8958 percpu_counter_inc(&root->subv_writers->counter);
8959 /*
8960 * Make sure counter is updated before we check for snapshot creation.
8961 */
8962 smp_mb();
8963 if (unlikely(atomic_read(&root->will_be_snapshoted))) {
8964 btrfs_end_nocow_write(root);
8965 return 0;
8966 }
8967 return 1;
8968}
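
A hypothetical caller sketch for the new gate (do_cow_write/do_nocow_write are placeholders, not real symbols; the actual call sites are in the write paths elsewhere in this series):

    if (!btrfs_start_nocow_write(root)) {
            /* a snapshot is pending: take the ordinary COW path instead */
            ret = do_cow_write(inode, pos, len);
    } else {
            ret = do_nocow_write(inode, pos, len);
            btrfs_end_nocow_write(root);
    }

The re-check of will_be_snapshoted after the counter increment closes the race where a snapshot begins between the first test and the bump; the smp_mb() calls are meant to pair with ordering on the snapshot-creation side.
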
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 85bbd01f1271..ae69a00387e7 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -229,12 +229,14 @@ void free_extent_state(struct extent_state *state)
229 } 229 }
230} 230}
231 231
232static struct rb_node *tree_insert(struct rb_root *root, u64 offset, 232static struct rb_node *tree_insert(struct rb_root *root,
233 struct rb_node *search_start,
234 u64 offset,
233 struct rb_node *node, 235 struct rb_node *node,
234 struct rb_node ***p_in, 236 struct rb_node ***p_in,
235 struct rb_node **parent_in) 237 struct rb_node **parent_in)
236{ 238{
237 struct rb_node **p = &root->rb_node; 239 struct rb_node **p;
238 struct rb_node *parent = NULL; 240 struct rb_node *parent = NULL;
239 struct tree_entry *entry; 241 struct tree_entry *entry;
240 242
@@ -244,6 +246,7 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
244 goto do_insert; 246 goto do_insert;
245 } 247 }
246 248
249 p = search_start ? &search_start : &root->rb_node;
247 while (*p) { 250 while (*p) {
248 parent = *p; 251 parent = *p;
249 entry = rb_entry(parent, struct tree_entry, rb_node); 252 entry = rb_entry(parent, struct tree_entry, rb_node);
@@ -430,7 +433,7 @@ static int insert_state(struct extent_io_tree *tree,
430 433
431 set_state_bits(tree, state, bits); 434 set_state_bits(tree, state, bits);
432 435
433 node = tree_insert(&tree->state, end, &state->rb_node, p, parent); 436 node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
434 if (node) { 437 if (node) {
435 struct extent_state *found; 438 struct extent_state *found;
436 found = rb_entry(node, struct extent_state, rb_node); 439 found = rb_entry(node, struct extent_state, rb_node);
@@ -477,8 +480,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
477 prealloc->state = orig->state; 480 prealloc->state = orig->state;
478 orig->start = split; 481 orig->start = split;
479 482
480 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node, 483 node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
481 NULL, NULL); 484 &prealloc->rb_node, NULL, NULL);
482 if (node) { 485 if (node) {
483 free_extent_state(prealloc); 486 free_extent_state(prealloc);
484 return -EEXIST; 487 return -EEXIST;
@@ -2757,7 +2760,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
2757 2760
2758 if (em_cached && *em_cached) { 2761 if (em_cached && *em_cached) {
2759 em = *em_cached; 2762 em = *em_cached;
2760 if (em->in_tree && start >= em->start && 2763 if (extent_map_in_tree(em) && start >= em->start &&
2761 start < extent_map_end(em)) { 2764 start < extent_map_end(em)) {
2762 atomic_inc(&em->refs); 2765 atomic_inc(&em->refs);
2763 return em; 2766 return em;
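
The two call shapes of the extended tree_insert(), summarized from the hunks above:

    /* No hint: descend from the root (insert_state()). */
    node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);

    /* Hinted: descend from the node being split (split_state()); the new
     * half-range sorts immediately around 'orig', so the walk stays local
     * instead of restarting from the root. */
    node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
                       &prealloc->rb_node, NULL, NULL);
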
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 996ad56b57db..1874aee69c86 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -51,7 +51,7 @@ struct extent_map *alloc_extent_map(void)
51 em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS); 51 em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
52 if (!em) 52 if (!em)
53 return NULL; 53 return NULL;
54 em->in_tree = 0; 54 RB_CLEAR_NODE(&em->rb_node);
55 em->flags = 0; 55 em->flags = 0;
56 em->compress_type = BTRFS_COMPRESS_NONE; 56 em->compress_type = BTRFS_COMPRESS_NONE;
57 em->generation = 0; 57 em->generation = 0;
@@ -73,7 +73,7 @@ void free_extent_map(struct extent_map *em)
73 return; 73 return;
74 WARN_ON(atomic_read(&em->refs) == 0); 74 WARN_ON(atomic_read(&em->refs) == 0);
75 if (atomic_dec_and_test(&em->refs)) { 75 if (atomic_dec_and_test(&em->refs)) {
76 WARN_ON(em->in_tree); 76 WARN_ON(extent_map_in_tree(em));
77 WARN_ON(!list_empty(&em->list)); 77 WARN_ON(!list_empty(&em->list));
78 kmem_cache_free(extent_map_cache, em); 78 kmem_cache_free(extent_map_cache, em);
79 } 79 }
@@ -99,8 +99,6 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
99 parent = *p; 99 parent = *p;
100 entry = rb_entry(parent, struct extent_map, rb_node); 100 entry = rb_entry(parent, struct extent_map, rb_node);
101 101
102 WARN_ON(!entry->in_tree);
103
104 if (em->start < entry->start) 102 if (em->start < entry->start)
105 p = &(*p)->rb_left; 103 p = &(*p)->rb_left;
106 else if (em->start >= extent_map_end(entry)) 104 else if (em->start >= extent_map_end(entry))
@@ -128,7 +126,6 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
128 if (end > entry->start && em->start < extent_map_end(entry)) 126 if (end > entry->start && em->start < extent_map_end(entry))
129 return -EEXIST; 127 return -EEXIST;
130 128
131 em->in_tree = 1;
132 rb_link_node(&em->rb_node, orig_parent, p); 129 rb_link_node(&em->rb_node, orig_parent, p);
133 rb_insert_color(&em->rb_node, root); 130 rb_insert_color(&em->rb_node, root);
134 return 0; 131 return 0;
@@ -153,8 +150,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
153 prev = n; 150 prev = n;
154 prev_entry = entry; 151 prev_entry = entry;
155 152
156 WARN_ON(!entry->in_tree);
157
158 if (offset < entry->start) 153 if (offset < entry->start)
159 n = n->rb_left; 154 n = n->rb_left;
160 else if (offset >= extent_map_end(entry)) 155 else if (offset >= extent_map_end(entry))
@@ -240,12 +235,12 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
240 em->len += merge->len; 235 em->len += merge->len;
241 em->block_len += merge->block_len; 236 em->block_len += merge->block_len;
242 em->block_start = merge->block_start; 237 em->block_start = merge->block_start;
243 merge->in_tree = 0;
244 em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; 238 em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
245 em->mod_start = merge->mod_start; 239 em->mod_start = merge->mod_start;
246 em->generation = max(em->generation, merge->generation); 240 em->generation = max(em->generation, merge->generation);
247 241
248 rb_erase(&merge->rb_node, &tree->map); 242 rb_erase(&merge->rb_node, &tree->map);
243 RB_CLEAR_NODE(&merge->rb_node);
249 free_extent_map(merge); 244 free_extent_map(merge);
250 } 245 }
251 } 246 }
@@ -257,7 +252,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
257 em->len += merge->len; 252 em->len += merge->len;
258 em->block_len += merge->block_len; 253 em->block_len += merge->block_len;
259 rb_erase(&merge->rb_node, &tree->map); 254 rb_erase(&merge->rb_node, &tree->map);
260 merge->in_tree = 0; 255 RB_CLEAR_NODE(&merge->rb_node);
261 em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; 256 em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
262 em->generation = max(em->generation, merge->generation); 257 em->generation = max(em->generation, merge->generation);
263 free_extent_map(merge); 258 free_extent_map(merge);
@@ -319,7 +314,21 @@ out:
319void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) 314void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
320{ 315{
321 clear_bit(EXTENT_FLAG_LOGGING, &em->flags); 316 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
322 if (em->in_tree) 317 if (extent_map_in_tree(em))
318 try_merge_map(tree, em);
319}
320
321static inline void setup_extent_mapping(struct extent_map_tree *tree,
322 struct extent_map *em,
323 int modified)
324{
325 atomic_inc(&em->refs);
326 em->mod_start = em->start;
327 em->mod_len = em->len;
328
329 if (modified)
330 list_move(&em->list, &tree->modified_extents);
331 else
323 try_merge_map(tree, em); 332 try_merge_map(tree, em);
324} 333}
325 334
@@ -342,15 +351,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
342 if (ret) 351 if (ret)
343 goto out; 352 goto out;
344 353
345 atomic_inc(&em->refs); 354 setup_extent_mapping(tree, em, modified);
346
347 em->mod_start = em->start;
348 em->mod_len = em->len;
349
350 if (modified)
351 list_move(&em->list, &tree->modified_extents);
352 else
353 try_merge_map(tree, em);
354out: 355out:
355 return ret; 356 return ret;
356} 357}
@@ -434,6 +435,21 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
434 rb_erase(&em->rb_node, &tree->map); 435 rb_erase(&em->rb_node, &tree->map);
435 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) 436 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
436 list_del_init(&em->list); 437 list_del_init(&em->list);
437 em->in_tree = 0; 438 RB_CLEAR_NODE(&em->rb_node);
438 return ret; 439 return ret;
439} 440}
441
442void replace_extent_mapping(struct extent_map_tree *tree,
443 struct extent_map *cur,
444 struct extent_map *new,
445 int modified)
446{
447 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
448 ASSERT(extent_map_in_tree(cur));
449 if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
450 list_del_init(&cur->list);
451 rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map);
452 RB_CLEAR_NODE(&cur->rb_node);
453
454 setup_extent_mapping(tree, new, modified);
455}
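
The in_tree flag is replaced by the standard rbtree membership idiom, which drops an int from struct extent_map and keeps the state inside the node itself:

    RB_CLEAR_NODE(&em->rb_node);        /* at allocation and after rb_erase() */

    if (extent_map_in_tree(em))         /* i.e. !RB_EMPTY_NODE(&em->rb_node) */
            rb_erase(&em->rb_node, &tree->map);
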
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 93fba716d7f8..e7fd8a56a140 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -33,7 +33,6 @@ struct extent_map {
33 unsigned long flags; 33 unsigned long flags;
34 struct block_device *bdev; 34 struct block_device *bdev;
35 atomic_t refs; 35 atomic_t refs;
36 unsigned int in_tree;
37 unsigned int compress_type; 36 unsigned int compress_type;
38 struct list_head list; 37 struct list_head list;
39}; 38};
@@ -44,6 +43,11 @@ struct extent_map_tree {
44 rwlock_t lock; 43 rwlock_t lock;
45}; 44};
46 45
46static inline int extent_map_in_tree(const struct extent_map *em)
47{
48 return !RB_EMPTY_NODE(&em->rb_node);
49}
50
47static inline u64 extent_map_end(struct extent_map *em) 51static inline u64 extent_map_end(struct extent_map *em)
48{ 52{
49 if (em->start + em->len < em->start) 53 if (em->start + em->len < em->start)
@@ -64,6 +68,10 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
64int add_extent_mapping(struct extent_map_tree *tree, 68int add_extent_mapping(struct extent_map_tree *tree,
65 struct extent_map *em, int modified); 69 struct extent_map *em, int modified);
66int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); 70int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
71void replace_extent_mapping(struct extent_map_tree *tree,
72 struct extent_map *cur,
73 struct extent_map *new,
74 int modified);
67 75
68struct extent_map *alloc_extent_map(void); 76struct extent_map *alloc_extent_map(void);
69void free_extent_map(struct extent_map *em); 77void free_extent_map(struct extent_map *em);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0165b8672f09..e1ffb1e22898 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -591,7 +591,6 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
591 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 591 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
592 clear_bit(EXTENT_FLAG_LOGGING, &flags); 592 clear_bit(EXTENT_FLAG_LOGGING, &flags);
593 modified = !list_empty(&em->list); 593 modified = !list_empty(&em->list);
594 remove_extent_mapping(em_tree, em);
595 if (no_splits) 594 if (no_splits)
596 goto next; 595 goto next;
597 596
@@ -622,8 +621,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
622 split->bdev = em->bdev; 621 split->bdev = em->bdev;
623 split->flags = flags; 622 split->flags = flags;
624 split->compress_type = em->compress_type; 623 split->compress_type = em->compress_type;
625 ret = add_extent_mapping(em_tree, split, modified); 624 replace_extent_mapping(em_tree, em, split, modified);
626 BUG_ON(ret); /* Logic error */
627 free_extent_map(split); 625 free_extent_map(split);
628 split = split2; 626 split = split2;
629 split2 = NULL; 627 split2 = NULL;
@@ -661,12 +659,20 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
661 split->orig_block_len = 0; 659 split->orig_block_len = 0;
662 } 660 }
663 661
664 ret = add_extent_mapping(em_tree, split, modified); 662 if (extent_map_in_tree(em)) {
665 BUG_ON(ret); /* Logic error */ 663 replace_extent_mapping(em_tree, em, split,
664 modified);
665 } else {
666 ret = add_extent_mapping(em_tree, split,
667 modified);
668 ASSERT(ret == 0); /* Logic error */
669 }
666 free_extent_map(split); 670 free_extent_map(split);
667 split = NULL; 671 split = NULL;
668 } 672 }
669next: 673next:
674 if (extent_map_in_tree(em))
675 remove_extent_mapping(em_tree, em);
670 write_unlock(&em_tree->lock); 676 write_unlock(&em_tree->lock);
671 677
672 /* once for us */ 678 /* once for us */
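
Why the split path switches to replace_extent_mapping(), sketched (condensed from the hunk above):

    /* Before: unlink, then re-insert, with a BUG_ON() on collision. */
    remove_extent_mapping(em_tree, em);
    ret = add_extent_mapping(em_tree, split, modified);
    BUG_ON(ret);    /* logic error */

    /* After: rb_replace_node() swaps the nodes under the same tree lock,
     * so there is no window where the range is unmapped and no failure
     * path to assert on. */
    replace_extent_mapping(em_tree, em, split, modified);
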
@@ -720,7 +726,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
720 if (drop_cache) 726 if (drop_cache)
721 btrfs_drop_extent_cache(inode, start, end - 1, 0); 727 btrfs_drop_extent_cache(inode, start, end - 1, 0);
722 728
723 if (start >= BTRFS_I(inode)->disk_i_size) 729 if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
724 modify_tree = 0; 730 modify_tree = 0;
725 731
726 while (1) { 732 while (1) {
@@ -798,7 +804,10 @@ next_slot:
798 */ 804 */
799 if (start > key.offset && end < extent_end) { 805 if (start > key.offset && end < extent_end) {
800 BUG_ON(del_nr > 0); 806 BUG_ON(del_nr > 0);
801 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 807 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
808 ret = -EINVAL;
809 break;
810 }
802 811
803 memcpy(&new_key, &key, sizeof(new_key)); 812 memcpy(&new_key, &key, sizeof(new_key));
804 new_key.offset = start; 813 new_key.offset = start;
@@ -841,7 +850,10 @@ next_slot:
841 * | -------- extent -------- | 850 * | -------- extent -------- |
842 */ 851 */
843 if (start <= key.offset && end < extent_end) { 852 if (start <= key.offset && end < extent_end) {
844 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 853 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
854 ret = -EINVAL;
855 break;
856 }
845 857
846 memcpy(&new_key, &key, sizeof(new_key)); 858 memcpy(&new_key, &key, sizeof(new_key));
847 new_key.offset = end; 859 new_key.offset = end;
@@ -864,7 +876,10 @@ next_slot:
864 */ 876 */
865 if (start > key.offset && end >= extent_end) { 877 if (start > key.offset && end >= extent_end) {
866 BUG_ON(del_nr > 0); 878 BUG_ON(del_nr > 0);
867 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 879 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
880 ret = -EINVAL;
881 break;
882 }
868 883
869 btrfs_set_file_extent_num_bytes(leaf, fi, 884 btrfs_set_file_extent_num_bytes(leaf, fi,
870 start - key.offset); 885 start - key.offset);
@@ -938,34 +953,42 @@ next_slot:
938 * Set path->slots[0] to first slot, so that after the delete 953 * Set path->slots[0] to first slot, so that after the delete
939 * if items are move off from our leaf to its immediate left or 954 * if items are move off from our leaf to its immediate left or
940 * right neighbor leafs, we end up with a correct and adjusted 955 * right neighbor leafs, we end up with a correct and adjusted
941 * path->slots[0] for our insertion. 956 * path->slots[0] for our insertion (if replace_extent != 0).
942 */ 957 */
943 path->slots[0] = del_slot; 958 path->slots[0] = del_slot;
944 ret = btrfs_del_items(trans, root, path, del_slot, del_nr); 959 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
945 if (ret) 960 if (ret)
946 btrfs_abort_transaction(trans, root, ret); 961 btrfs_abort_transaction(trans, root, ret);
962 }
947 963
948 leaf = path->nodes[0]; 964 leaf = path->nodes[0];
949 /* 965 /*
950 * leaf eb has flag EXTENT_BUFFER_STALE if it was deleted (that 966 * If btrfs_del_items() was called, it might have deleted a leaf, in
951 * is, its contents got pushed to its neighbors), in which case 967 * which case it unlocked our path, so check path->locks[0] matches a
952 * it means path->locks[0] == 0 968 * write lock.
953 */ 969 */
954 if (!ret && replace_extent && leafs_visited == 1 && 970 if (!ret && replace_extent && leafs_visited == 1 &&
955 path->locks[0] && 971 (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
956 btrfs_leaf_free_space(root, leaf) >= 972 path->locks[0] == BTRFS_WRITE_LOCK) &&
957 sizeof(struct btrfs_item) + extent_item_size) { 973 btrfs_leaf_free_space(root, leaf) >=
958 974 sizeof(struct btrfs_item) + extent_item_size) {
959 key.objectid = ino; 975
960 key.type = BTRFS_EXTENT_DATA_KEY; 976 key.objectid = ino;
961 key.offset = start; 977 key.type = BTRFS_EXTENT_DATA_KEY;
962 setup_items_for_insert(root, path, &key, 978 key.offset = start;
963 &extent_item_size, 979 if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
964 extent_item_size, 980 struct btrfs_key slot_key;
965 sizeof(struct btrfs_item) + 981
966 extent_item_size, 1); 982 btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
967 *key_inserted = 1; 983 if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
984 path->slots[0]++;
968 } 985 }
986 setup_items_for_insert(root, path, &key,
987 &extent_item_size,
988 extent_item_size,
989 sizeof(struct btrfs_item) +
990 extent_item_size, 1);
991 *key_inserted = 1;
969 } 992 }
970 993
971 if (!replace_extent || !(*key_inserted)) 994 if (!replace_extent || !(*key_inserted))
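
Besides downgrading the inline-extent BUG_ONs to -EINVAL, the hunk above tightens the fast insert path of __btrfs_drop_extents(): it checks explicitly that the path still holds a write lock, and when nothing was deleted it nudges path->slots[0] past an item whose key sorts before the new extent key, so setup_items_for_insert() shifts items from the right position. A hedged sketch of that key comparison and slot bump, with the (objectid, type, offset) ordering re-implemented for illustration:

/* Sketch of the slot adjustment added above: if the slot we are about
 * to insert at holds a key that sorts *before* ours, step past it.
 * btrfs_comp_cpu_keys() ordering re-implemented for illustration. */
#include <stdio.h>

struct btrfs_key {
    unsigned long long objectid;
    unsigned char type;
    unsigned long long offset;
};

static int btrfs_comp_cpu_keys(const struct btrfs_key *a,
                               const struct btrfs_key *b)
{
    if (a->objectid != b->objectid) return a->objectid < b->objectid ? -1 : 1;
    if (a->type != b->type)         return a->type < b->type ? -1 : 1;
    if (a->offset != b->offset)     return a->offset < b->offset ? -1 : 1;
    return 0;
}

int main(void)
{
    /* two EXTENT_DATA items (type 108) of inode 257 already in the leaf */
    struct btrfs_key leaf[] = { { 257, 108, 0 }, { 257, 108, 8192 } };
    struct btrfs_key new_key = { 257, 108, 4096 };
    int slot = 0, nritems = 2;

    /* the hunk's test: only when nothing was deleted (del_nr == 0) and
     * the slot still points at a valid item */
    if (slot < nritems && btrfs_comp_cpu_keys(&new_key, &leaf[slot]) > 0)
        slot++;
    printf("inserting at slot %d\n", slot);    /* 1: after the offset-0 item */
    return 0;
}
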
@@ -1346,11 +1369,11 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
1346 struct btrfs_ordered_extent *ordered; 1369 struct btrfs_ordered_extent *ordered;
1347 lock_extent_bits(&BTRFS_I(inode)->io_tree, 1370 lock_extent_bits(&BTRFS_I(inode)->io_tree,
1348 start_pos, last_pos, 0, cached_state); 1371 start_pos, last_pos, 0, cached_state);
1349 ordered = btrfs_lookup_first_ordered_extent(inode, last_pos); 1372 ordered = btrfs_lookup_ordered_range(inode, start_pos,
1373 last_pos - start_pos + 1);
1350 if (ordered && 1374 if (ordered &&
1351 ordered->file_offset + ordered->len > start_pos && 1375 ordered->file_offset + ordered->len > start_pos &&
1352 ordered->file_offset <= last_pos) { 1376 ordered->file_offset <= last_pos) {
1353 btrfs_put_ordered_extent(ordered);
1354 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 1377 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1355 start_pos, last_pos, 1378 start_pos, last_pos,
1356 cached_state, GFP_NOFS); 1379 cached_state, GFP_NOFS);
@@ -1358,12 +1381,9 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
1358 unlock_page(pages[i]); 1381 unlock_page(pages[i]);
1359 page_cache_release(pages[i]); 1382 page_cache_release(pages[i]);
1360 } 1383 }
1361 ret = btrfs_wait_ordered_range(inode, start_pos, 1384 btrfs_start_ordered_extent(inode, ordered, 1);
1362 last_pos - start_pos + 1); 1385 btrfs_put_ordered_extent(ordered);
1363 if (ret) 1386 return -EAGAIN;
1364 return ret;
1365 else
1366 return -EAGAIN;
1367 } 1387 }
1368 if (ordered) 1388 if (ordered)
1369 btrfs_put_ordered_extent(ordered); 1389 btrfs_put_ordered_extent(ordered);
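
lock_and_cleanup_extent_if_need() previously waited for the whole ordered range and could surface a hard error from btrfs_wait_ordered_range(); now it looks up the colliding ordered extent over the exact range, kicks just that extent with btrfs_start_ordered_extent(), and reports -EAGAIN so the buffered-write loop re-prepares its pages and retries. A toy model of that retry contract; the helper below is a stand-in, not the kernel function:

/* Shape of the retry loop implied by the -EAGAIN return above. */
#include <errno.h>
#include <stdio.h>

static int attempts;

static int lock_and_cleanup_extent_if_need(void)
{
    /* first call: pretend an ordered extent collided and was started */
    return attempts++ == 0 ? -EAGAIN : 0;
}

int main(void)
{
    int ret;

    for (;;) {
        ret = lock_and_cleanup_extent_if_need();
        if (ret == -EAGAIN)
            continue;   /* pages were released; re-prepare and retry */
        break;          /* range locked (0) or a hard error (<0) */
    }
    printf("locked after %d attempt(s), ret=%d\n", attempts, ret);
    return 0;
}
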
@@ -1396,8 +1416,12 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1396 u64 num_bytes; 1416 u64 num_bytes;
1397 int ret; 1417 int ret;
1398 1418
1419 ret = btrfs_start_nocow_write(root);
1420 if (!ret)
1421 return -ENOSPC;
1422
1399 lockstart = round_down(pos, root->sectorsize); 1423 lockstart = round_down(pos, root->sectorsize);
1400 lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1; 1424 lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;
1401 1425
1402 while (1) { 1426 while (1) {
1403 lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); 1427 lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
@@ -1415,12 +1439,10 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1415 ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); 1439 ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
1416 if (ret <= 0) { 1440 if (ret <= 0) {
1417 ret = 0; 1441 ret = 0;
1442 btrfs_end_nocow_write(root);
1418 } else { 1443 } else {
1419 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, 1444 *write_bytes = min_t(size_t, *write_bytes ,
1420 EXTENT_DIRTY | EXTENT_DELALLOC | 1445 num_bytes - pos + lockstart);
1421 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
1422 NULL, GFP_NOFS);
1423 *write_bytes = min_t(size_t, *write_bytes, num_bytes);
1424 } 1446 }
1425 1447
1426 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); 1448 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
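
check_can_nocow() now brackets itself with btrfs_start_nocow_write()/btrfs_end_nocow_write(): a nocow writer only proceeds while no snapshot of the subvolume is pending, otherwise it returns -ENOSPC and the caller falls back to COW, and the *write_bytes clamp is computed against the end of the checked range. The snapshot side of this handshake is btrfs_wait_nocow_write() in the ioctl.c hunks further down. A minimal model of the gate, with C11 atomics in place of the kernel's percpu counter; illustrative only:

/* Minimal model of the nocow-write gate: writers bail out while a
 * snapshot is pending, snapshotters wait for in-flight writers. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int will_be_snapshoted;
static atomic_int nocow_writers;

static int btrfs_start_nocow_write(void)
{
    if (atomic_load(&will_be_snapshoted))
        return 0;                       /* caller falls back to COW */
    atomic_fetch_add(&nocow_writers, 1);
    /* re-check: a snapshot may have started in between */
    if (atomic_load(&will_be_snapshoted)) {
        atomic_fetch_sub(&nocow_writers, 1);
        return 0;
    }
    return 1;
}

static void btrfs_end_nocow_write(void)
{
    atomic_fetch_sub(&nocow_writers, 1);
}

int main(void)
{
    if (btrfs_start_nocow_write()) {
        puts("nocow write allowed");
        btrfs_end_nocow_write();
    }
    atomic_store(&will_be_snapshoted, 1);
    if (!btrfs_start_nocow_write())
        puts("snapshot pending: fall back to COW (the -ENOSPC path)");
    return 0;
}
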
@@ -1510,6 +1532,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1510 if (!only_release_metadata) 1532 if (!only_release_metadata)
1511 btrfs_free_reserved_data_space(inode, 1533 btrfs_free_reserved_data_space(inode,
1512 reserve_bytes); 1534 reserve_bytes);
1535 else
1536 btrfs_end_nocow_write(root);
1513 break; 1537 break;
1514 } 1538 }
1515 1539
@@ -1598,6 +1622,9 @@ again:
1598 } 1622 }
1599 1623
1600 release_bytes = 0; 1624 release_bytes = 0;
1625 if (only_release_metadata)
1626 btrfs_end_nocow_write(root);
1627
1601 if (only_release_metadata && copied > 0) { 1628 if (only_release_metadata && copied > 0) {
1602 u64 lockstart = round_down(pos, root->sectorsize); 1629 u64 lockstart = round_down(pos, root->sectorsize);
1603 u64 lockend = lockstart + 1630 u64 lockend = lockstart +
@@ -1624,10 +1651,12 @@ again:
1624 kfree(pages); 1651 kfree(pages);
1625 1652
1626 if (release_bytes) { 1653 if (release_bytes) {
1627 if (only_release_metadata) 1654 if (only_release_metadata) {
1655 btrfs_end_nocow_write(root);
1628 btrfs_delalloc_release_metadata(inode, release_bytes); 1656 btrfs_delalloc_release_metadata(inode, release_bytes);
1629 else 1657 } else {
1630 btrfs_delalloc_release_space(inode, release_bytes); 1658 btrfs_delalloc_release_space(inode, release_bytes);
1659 }
1631 } 1660 }
1632 1661
1633 return num_written ? num_written : ret; 1662 return num_written ? num_written : ret;
@@ -1797,7 +1826,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1797 BTRFS_I(inode)->last_sub_trans = root->log_transid; 1826 BTRFS_I(inode)->last_sub_trans = root->log_transid;
1798 if (num_written > 0) { 1827 if (num_written > 0) {
1799 err = generic_write_sync(file, pos, num_written); 1828 err = generic_write_sync(file, pos, num_written);
1800 if (err < 0 && num_written > 0) 1829 if (err < 0)
1801 num_written = err; 1830 num_written = err;
1802 } 1831 }
1803 1832
@@ -1856,8 +1885,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1856 struct dentry *dentry = file->f_path.dentry; 1885 struct dentry *dentry = file->f_path.dentry;
1857 struct inode *inode = dentry->d_inode; 1886 struct inode *inode = dentry->d_inode;
1858 struct btrfs_root *root = BTRFS_I(inode)->root; 1887 struct btrfs_root *root = BTRFS_I(inode)->root;
1859 int ret = 0;
1860 struct btrfs_trans_handle *trans; 1888 struct btrfs_trans_handle *trans;
1889 struct btrfs_log_ctx ctx;
1890 int ret = 0;
1861 bool full_sync = 0; 1891 bool full_sync = 0;
1862 1892
1863 trace_btrfs_sync_file(file, datasync); 1893 trace_btrfs_sync_file(file, datasync);
@@ -1951,7 +1981,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1951 } 1981 }
1952 trans->sync = true; 1982 trans->sync = true;
1953 1983
1954 ret = btrfs_log_dentry_safe(trans, root, dentry); 1984 btrfs_init_log_ctx(&ctx);
1985
1986 ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
1955 if (ret < 0) { 1987 if (ret < 0) {
1956 /* Fallthrough and commit/free transaction. */ 1988 /* Fallthrough and commit/free transaction. */
1957 ret = 1; 1989 ret = 1;
@@ -1971,7 +2003,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1971 2003
1972 if (ret != BTRFS_NO_LOG_SYNC) { 2004 if (ret != BTRFS_NO_LOG_SYNC) {
1973 if (!ret) { 2005 if (!ret) {
1974 ret = btrfs_sync_log(trans, root); 2006 ret = btrfs_sync_log(trans, root, &ctx);
1975 if (!ret) { 2007 if (!ret) {
1976 ret = btrfs_end_transaction(trans, root); 2008 ret = btrfs_end_transaction(trans, root);
1977 goto out; 2009 goto out;
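
btrfs_sync_file() now stacks a btrfs_log_ctx and threads it through btrfs_log_dentry_safe() into btrfs_sync_log(), letting concurrent fsyncs attach to one log commit instead of each forcing its own; the ctx type itself comes from the tree-log.h changes in this series, which are not shown here. A rough sketch of its shape under that assumption (the log_ret and list fields follow the series; the struct is simplified):

/* Rough shape of the new log context: each fsync carries one, and the
 * log writer completes every ctx queued for a transid in one commit. */
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

struct btrfs_log_ctx {
    int log_ret;                /* result handed back to the fsyncer */
    struct list_head list;      /* linked into the root's pending ctx list */
};

static void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
{
    ctx->log_ret = 0;
    ctx->list.next = ctx->list.prev = &ctx->list;
}

int main(void)
{
    struct btrfs_log_ctx ctx;

    btrfs_init_log_ctx(&ctx);
    /* btrfs_sync_log(trans, root, &ctx) would queue ctx, write the log
     * once, then propagate the shared result via ctx.log_ret */
    printf("ctx ready, log_ret=%d\n", ctx.log_ret);
    return 0;
}
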
@@ -2157,6 +2189,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2157 bool same_page = ((offset >> PAGE_CACHE_SHIFT) == 2189 bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
2158 ((offset + len - 1) >> PAGE_CACHE_SHIFT)); 2190 ((offset + len - 1) >> PAGE_CACHE_SHIFT));
2159 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); 2191 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
2192 u64 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
2160 2193
2161 ret = btrfs_wait_ordered_range(inode, offset, len); 2194 ret = btrfs_wait_ordered_range(inode, offset, len);
2162 if (ret) 2195 if (ret)
@@ -2172,14 +2205,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2172 * entire page. 2205 * entire page.
2173 */ 2206 */
2174 if (same_page && len < PAGE_CACHE_SIZE) { 2207 if (same_page && len < PAGE_CACHE_SIZE) {
2175 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) 2208 if (offset < ino_size)
2176 ret = btrfs_truncate_page(inode, offset, len, 0); 2209 ret = btrfs_truncate_page(inode, offset, len, 0);
2177 mutex_unlock(&inode->i_mutex); 2210 mutex_unlock(&inode->i_mutex);
2178 return ret; 2211 return ret;
2179 } 2212 }
2180 2213
2181 /* zero back part of the first page */ 2214 /* zero back part of the first page */
2182 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) { 2215 if (offset < ino_size) {
2183 ret = btrfs_truncate_page(inode, offset, 0, 0); 2216 ret = btrfs_truncate_page(inode, offset, 0, 0);
2184 if (ret) { 2217 if (ret) {
2185 mutex_unlock(&inode->i_mutex); 2218 mutex_unlock(&inode->i_mutex);
@@ -2188,7 +2221,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2188 } 2221 }
2189 2222
2190 /* zero the front end of the last page */ 2223 /* zero the front end of the last page */
2191 if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) { 2224 if (offset + len < ino_size) {
2192 ret = btrfs_truncate_page(inode, offset + len, 0, 1); 2225 ret = btrfs_truncate_page(inode, offset + len, 0, 1);
2193 if (ret) { 2226 if (ret) {
2194 mutex_unlock(&inode->i_mutex); 2227 mutex_unlock(&inode->i_mutex);
@@ -2277,10 +2310,13 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2277 2310
2278 trans->block_rsv = &root->fs_info->trans_block_rsv; 2311 trans->block_rsv = &root->fs_info->trans_block_rsv;
2279 2312
2280 ret = fill_holes(trans, inode, path, cur_offset, drop_end); 2313 if (cur_offset < ino_size) {
2281 if (ret) { 2314 ret = fill_holes(trans, inode, path, cur_offset,
2282 err = ret; 2315 drop_end);
2283 break; 2316 if (ret) {
2317 err = ret;
2318 break;
2319 }
2284 } 2320 }
2285 2321
2286 cur_offset = drop_end; 2322 cur_offset = drop_end;
@@ -2313,10 +2349,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2313 } 2349 }
2314 2350
2315 trans->block_rsv = &root->fs_info->trans_block_rsv; 2351 trans->block_rsv = &root->fs_info->trans_block_rsv;
2316 ret = fill_holes(trans, inode, path, cur_offset, drop_end); 2352 if (cur_offset < ino_size) {
2317 if (ret) { 2353 ret = fill_holes(trans, inode, path, cur_offset, drop_end);
2318 err = ret; 2354 if (ret) {
2319 goto out_trans; 2355 err = ret;
2356 goto out_trans;
2357 }
2320 } 2358 }
2321 2359
2322out_trans: 2360out_trans:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d3d44486290b..06e9a4152b14 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -864,7 +864,8 @@ static noinline int cow_file_range(struct inode *inode,
864 864
865 if (btrfs_is_free_space_inode(inode)) { 865 if (btrfs_is_free_space_inode(inode)) {
866 WARN_ON_ONCE(1); 866 WARN_ON_ONCE(1);
867 return -EINVAL; 867 ret = -EINVAL;
868 goto out_unlock;
868 } 869 }
869 870
870 num_bytes = ALIGN(end - start + 1, blocksize); 871 num_bytes = ALIGN(end - start + 1, blocksize);
@@ -1075,17 +1076,15 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1075 async_cow->end = cur_end; 1076 async_cow->end = cur_end;
1076 INIT_LIST_HEAD(&async_cow->extents); 1077 INIT_LIST_HEAD(&async_cow->extents);
1077 1078
1078 async_cow->work.func = async_cow_start; 1079 btrfs_init_work(&async_cow->work, async_cow_start,
1079 async_cow->work.ordered_func = async_cow_submit; 1080 async_cow_submit, async_cow_free);
1080 async_cow->work.ordered_free = async_cow_free;
1081 async_cow->work.flags = 0;
1082 1081
1083 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> 1082 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
1084 PAGE_CACHE_SHIFT; 1083 PAGE_CACHE_SHIFT;
1085 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); 1084 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
1086 1085
1087 btrfs_queue_worker(&root->fs_info->delalloc_workers, 1086 btrfs_queue_work(root->fs_info->delalloc_workers,
1088 &async_cow->work); 1087 &async_cow->work);
1089 1088
1090 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { 1089 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
1091 wait_event(root->fs_info->async_submit_wait, 1090 wait_event(root->fs_info->async_submit_wait,
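
This is the first of many call sites in the patch converted from the old home-grown btrfs_workers pool, where callers poked work.func, work.ordered_func, work.ordered_free and work.flags by hand, to the kernel-workqueue-backed btrfs_workqueue from the async-thread.c rewrite: btrfs_init_work() takes the three callbacks in one call and btrfs_queue_work() takes the queue pointer directly. The same mechanical change repeats below in inode.c, ordered-data.c, qgroup.c, raid56.c and reada.c. A hedged sketch of what such an initializer looks like; simplified, the real one also readies the underlying work_struct:

/* Sketch of the new work-item initializer this series introduces. */
#include <stdio.h>

struct btrfs_work;
typedef void (*btrfs_func_t)(struct btrfs_work *);

struct btrfs_work {
    btrfs_func_t func;          /* the actual job */
    btrfs_func_t ordered_func;  /* runs in queue order after func */
    btrfs_func_t ordered_free;  /* frees the item, also in order */
    unsigned long flags;
};

static void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
                            btrfs_func_t ordered_func,
                            btrfs_func_t ordered_free)
{
    work->func = func;
    work->ordered_func = ordered_func;
    work->ordered_free = ordered_free;
    work->flags = 0;
}

static void say_hi(struct btrfs_work *w) { (void)w; puts("work ran"); }

int main(void)
{
    struct btrfs_work w;

    /* one call replaces the open-coded assignments at the old call sites */
    btrfs_init_work(&w, say_hi, NULL, NULL);
    w.func(&w);                 /* btrfs_queue_work() would run this async */
    return 0;
}
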
@@ -1843,9 +1842,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1843 1842
1844 SetPageChecked(page); 1843 SetPageChecked(page);
1845 page_cache_get(page); 1844 page_cache_get(page);
1846 fixup->work.func = btrfs_writepage_fixup_worker; 1845 btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
1847 fixup->page = page; 1846 fixup->page = page;
1848 btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); 1847 btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
1849 return -EBUSY; 1848 return -EBUSY;
1850} 1849}
1851 1850
@@ -2239,6 +2238,11 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
2239 return PTR_ERR(root); 2238 return PTR_ERR(root);
2240 } 2239 }
2241 2240
2241 if (btrfs_root_readonly(root)) {
2242 srcu_read_unlock(&fs_info->subvol_srcu, index);
2243 return 0;
2244 }
2245
2242 /* step 2: get inode */ 2246 /* step 2: get inode */
2243 key.objectid = backref->inum; 2247 key.objectid = backref->inum;
2244 key.type = BTRFS_INODE_ITEM_KEY; 2248 key.type = BTRFS_INODE_ITEM_KEY;
@@ -2759,7 +2763,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2759 struct inode *inode = page->mapping->host; 2763 struct inode *inode = page->mapping->host;
2760 struct btrfs_root *root = BTRFS_I(inode)->root; 2764 struct btrfs_root *root = BTRFS_I(inode)->root;
2761 struct btrfs_ordered_extent *ordered_extent = NULL; 2765 struct btrfs_ordered_extent *ordered_extent = NULL;
2762 struct btrfs_workers *workers; 2766 struct btrfs_workqueue *workers;
2763 2767
2764 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); 2768 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
2765 2769
@@ -2768,14 +2772,13 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2768 end - start + 1, uptodate)) 2772 end - start + 1, uptodate))
2769 return 0; 2773 return 0;
2770 2774
2771 ordered_extent->work.func = finish_ordered_fn; 2775 btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
2772 ordered_extent->work.flags = 0;
2773 2776
2774 if (btrfs_is_free_space_inode(inode)) 2777 if (btrfs_is_free_space_inode(inode))
2775 workers = &root->fs_info->endio_freespace_worker; 2778 workers = root->fs_info->endio_freespace_worker;
2776 else 2779 else
2777 workers = &root->fs_info->endio_write_workers; 2780 workers = root->fs_info->endio_write_workers;
2778 btrfs_queue_worker(workers, &ordered_extent->work); 2781 btrfs_queue_work(workers, &ordered_extent->work);
2779 2782
2780 return 0; 2783 return 0;
2781} 2784}
@@ -4593,7 +4596,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
4593 struct rb_node *node; 4596 struct rb_node *node;
4594 4597
4595 ASSERT(inode->i_state & I_FREEING); 4598 ASSERT(inode->i_state & I_FREEING);
4596 truncate_inode_pages(&inode->i_data, 0); 4599 truncate_inode_pages_final(&inode->i_data);
4597 4600
4598 write_lock(&map_tree->lock); 4601 write_lock(&map_tree->lock);
4599 while (!RB_EMPTY_ROOT(&map_tree->map)) { 4602 while (!RB_EMPTY_ROOT(&map_tree->map)) {
@@ -4924,7 +4927,8 @@ void btrfs_invalidate_inodes(struct btrfs_root *root)
4924 struct inode *inode; 4927 struct inode *inode;
4925 u64 objectid = 0; 4928 u64 objectid = 0;
4926 4929
4927 WARN_ON(btrfs_root_refs(&root->root_item) != 0); 4930 if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
4931 WARN_ON(btrfs_root_refs(&root->root_item) != 0);
4928 4932
4929 spin_lock(&root->inode_lock); 4933 spin_lock(&root->inode_lock);
4930again: 4934again:
@@ -5799,6 +5803,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
5799 } 5803 }
5800out_unlock: 5804out_unlock:
5801 btrfs_end_transaction(trans, root); 5805 btrfs_end_transaction(trans, root);
5806 btrfs_balance_delayed_items(root);
5802 btrfs_btree_balance_dirty(root); 5807 btrfs_btree_balance_dirty(root);
5803 if (drop_inode) { 5808 if (drop_inode) {
5804 inode_dec_link_count(inode); 5809 inode_dec_link_count(inode);
@@ -5872,6 +5877,7 @@ out_unlock:
5872 inode_dec_link_count(inode); 5877 inode_dec_link_count(inode);
5873 iput(inode); 5878 iput(inode);
5874 } 5879 }
5880 btrfs_balance_delayed_items(root);
5875 btrfs_btree_balance_dirty(root); 5881 btrfs_btree_balance_dirty(root);
5876 return err; 5882 return err;
5877} 5883}
@@ -5930,6 +5936,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5930 } 5936 }
5931 5937
5932 btrfs_end_transaction(trans, root); 5938 btrfs_end_transaction(trans, root);
5939 btrfs_balance_delayed_items(root);
5933fail: 5940fail:
5934 if (drop_inode) { 5941 if (drop_inode) {
5935 inode_dec_link_count(inode); 5942 inode_dec_link_count(inode);
@@ -5996,6 +6003,7 @@ out_fail:
5996 btrfs_end_transaction(trans, root); 6003 btrfs_end_transaction(trans, root);
5997 if (drop_on_err) 6004 if (drop_on_err)
5998 iput(inode); 6005 iput(inode);
6006 btrfs_balance_delayed_items(root);
5999 btrfs_btree_balance_dirty(root); 6007 btrfs_btree_balance_dirty(root);
6000 return err; 6008 return err;
6001} 6009}
@@ -6550,6 +6558,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6550 int ret; 6558 int ret;
6551 struct extent_buffer *leaf; 6559 struct extent_buffer *leaf;
6552 struct btrfs_root *root = BTRFS_I(inode)->root; 6560 struct btrfs_root *root = BTRFS_I(inode)->root;
6561 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6553 struct btrfs_file_extent_item *fi; 6562 struct btrfs_file_extent_item *fi;
6554 struct btrfs_key key; 6563 struct btrfs_key key;
6555 u64 disk_bytenr; 6564 u64 disk_bytenr;
@@ -6626,6 +6635,20 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6626 6635
6627 if (btrfs_extent_readonly(root, disk_bytenr)) 6636 if (btrfs_extent_readonly(root, disk_bytenr))
6628 goto out; 6637 goto out;
6638
6639 num_bytes = min(offset + *len, extent_end) - offset;
6640 if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6641 u64 range_end;
6642
6643 range_end = round_up(offset + num_bytes, root->sectorsize) - 1;
6644 ret = test_range_bit(io_tree, offset, range_end,
6645 EXTENT_DELALLOC, 0, NULL);
6646 if (ret) {
6647 ret = -EAGAIN;
6648 goto out;
6649 }
6650 }
6651
6629 btrfs_release_path(path); 6652 btrfs_release_path(path);
6630 6653
6631 /* 6654 /*
@@ -6654,7 +6677,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6654 */ 6677 */
6655 disk_bytenr += backref_offset; 6678 disk_bytenr += backref_offset;
6656 disk_bytenr += offset - key.offset; 6679 disk_bytenr += offset - key.offset;
6657 num_bytes = min(offset + *len, extent_end) - offset;
6658 if (csum_exist_in_range(root, disk_bytenr, num_bytes)) 6680 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
6659 goto out; 6681 goto out;
6660 /* 6682 /*
@@ -7024,10 +7046,9 @@ again:
7024 if (!ret) 7046 if (!ret)
7025 goto out_test; 7047 goto out_test;
7026 7048
7027 ordered->work.func = finish_ordered_fn; 7049 btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
7028 ordered->work.flags = 0; 7050 btrfs_queue_work(root->fs_info->endio_write_workers,
7029 btrfs_queue_worker(&root->fs_info->endio_write_workers, 7051 &ordered->work);
7030 &ordered->work);
7031out_test: 7052out_test:
7032 /* 7053 /*
7033 * our bio might span multiple ordered extents. If we haven't 7054 * our bio might span multiple ordered extents. If we haven't
@@ -7404,15 +7425,15 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7404 smp_mb__after_atomic_inc(); 7425 smp_mb__after_atomic_inc();
7405 7426
7406 /* 7427 /*
7407 * The generic stuff only does filemap_write_and_wait_range, which isn't 7428 * The generic stuff only does filemap_write_and_wait_range, which
7408 * enough if we've written compressed pages to this area, so we need to 7429 * isn't enough if we've written compressed pages to this area, so
7409 * call btrfs_wait_ordered_range to make absolutely sure that any 7430 * we need to flush the dirty pages again to make absolutely sure
7410 * outstanding dirty pages are on disk. 7431 * that any outstanding dirty pages are on disk.
7411 */ 7432 */
7412 count = iov_length(iov, nr_segs); 7433 count = iov_length(iov, nr_segs);
7413 ret = btrfs_wait_ordered_range(inode, offset, count); 7434 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
7414 if (ret) 7435 &BTRFS_I(inode)->runtime_flags))
7415 return ret; 7436 filemap_fdatawrite_range(inode->i_mapping, offset, count);
7416 7437
7417 if (rw & WRITE) { 7438 if (rw & WRITE) {
7418 /* 7439 /*
@@ -8404,7 +8425,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
8404 work->inode = inode; 8425 work->inode = inode;
8405 work->wait = wait; 8426 work->wait = wait;
8406 work->delay_iput = delay_iput; 8427 work->delay_iput = delay_iput;
8407 work->work.func = btrfs_run_delalloc_work; 8428 btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
8408 8429
8409 return work; 8430 return work;
8410} 8431}
@@ -8419,7 +8440,8 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
8419 * some fairly slow code that needs optimization. This walks the list 8440 * some fairly slow code that needs optimization. This walks the list
8420 * of all the inodes with pending delalloc and forces them to disk. 8441 * of all the inodes with pending delalloc and forces them to disk.
8421 */ 8442 */
8422static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput) 8443static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
8444 int nr)
8423{ 8445{
8424 struct btrfs_inode *binode; 8446 struct btrfs_inode *binode;
8425 struct inode *inode; 8447 struct inode *inode;
@@ -8431,6 +8453,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8431 INIT_LIST_HEAD(&works); 8453 INIT_LIST_HEAD(&works);
8432 INIT_LIST_HEAD(&splice); 8454 INIT_LIST_HEAD(&splice);
8433 8455
8456 mutex_lock(&root->delalloc_mutex);
8434 spin_lock(&root->delalloc_lock); 8457 spin_lock(&root->delalloc_lock);
8435 list_splice_init(&root->delalloc_inodes, &splice); 8458 list_splice_init(&root->delalloc_inodes, &splice);
8436 while (!list_empty(&splice)) { 8459 while (!list_empty(&splice)) {
@@ -8453,12 +8476,14 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8453 else 8476 else
8454 iput(inode); 8477 iput(inode);
8455 ret = -ENOMEM; 8478 ret = -ENOMEM;
8456 goto out; 8479 break;
8457 } 8480 }
8458 list_add_tail(&work->list, &works); 8481 list_add_tail(&work->list, &works);
8459 btrfs_queue_worker(&root->fs_info->flush_workers, 8482 btrfs_queue_work(root->fs_info->flush_workers,
8460 &work->work); 8483 &work->work);
8461 8484 ret++;
8485 if (nr != -1 && ret >= nr)
8486 break;
8462 cond_resched(); 8487 cond_resched();
8463 spin_lock(&root->delalloc_lock); 8488 spin_lock(&root->delalloc_lock);
8464 } 8489 }
@@ -8468,18 +8493,13 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8468 list_del_init(&work->list); 8493 list_del_init(&work->list);
8469 btrfs_wait_and_free_delalloc_work(work); 8494 btrfs_wait_and_free_delalloc_work(work);
8470 } 8495 }
8471 return 0;
8472out:
8473 list_for_each_entry_safe(work, next, &works, list) {
8474 list_del_init(&work->list);
8475 btrfs_wait_and_free_delalloc_work(work);
8476 }
8477 8496
8478 if (!list_empty_careful(&splice)) { 8497 if (!list_empty_careful(&splice)) {
8479 spin_lock(&root->delalloc_lock); 8498 spin_lock(&root->delalloc_lock);
8480 list_splice_tail(&splice, &root->delalloc_inodes); 8499 list_splice_tail(&splice, &root->delalloc_inodes);
8481 spin_unlock(&root->delalloc_lock); 8500 spin_unlock(&root->delalloc_lock);
8482 } 8501 }
8502 mutex_unlock(&root->delalloc_mutex);
8483 return ret; 8503 return ret;
8484} 8504}
8485 8505
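
__start_delalloc_inodes() now takes an nr budget: -1 keeps the old flush-everything behaviour, otherwise at most nr inodes are queued, and the function returns how many it started so btrfs_start_delalloc_roots() below can spread one budget across roots; the new per-root delalloc_mutex also serializes concurrent flushers. A small model of the budget bookkeeping, with illustrative numbers:

/* Model of the nr budget threaded through the delalloc flush paths:
 * -1 means "no limit", otherwise the return value (items started)
 * is subtracted from the caller's remaining budget. */
#include <stdio.h>

static int flush_one_root(int pending, int nr)
{
    int started = 0;

    while (pending-- > 0) {
        started++;                      /* queue one inode's flush work */
        if (nr != -1 && started >= nr)
            break;
    }
    return started;
}

int main(void)
{
    int roots[] = { 5, 3, 7 };
    int nr = 6;                         /* total budget across roots */

    for (int i = 0; i < 3 && nr; i++) {
        int ret = flush_one_root(roots[i], nr);

        if (nr != -1)
            nr -= ret;                  /* the patch WARN_ONs nr < 0 */
        printf("root %d: started %d, budget left %d\n", i, ret, nr);
    }
    return 0;
}
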
@@ -8490,7 +8510,9 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8490 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) 8510 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
8491 return -EROFS; 8511 return -EROFS;
8492 8512
8493 ret = __start_delalloc_inodes(root, delay_iput); 8513 ret = __start_delalloc_inodes(root, delay_iput, -1);
8514 if (ret > 0)
8515 ret = 0;
8494 /* 8516 /*
8495 * the filemap_flush will queue IO into the worker threads, but 8517 * the filemap_flush will queue IO into the worker threads, but
8496 * we have to make sure the IO is actually started and that 8518 * we have to make sure the IO is actually started and that
@@ -8507,7 +8529,8 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8507 return ret; 8529 return ret;
8508} 8530}
8509 8531
8510int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput) 8532int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
8533 int nr)
8511{ 8534{
8512 struct btrfs_root *root; 8535 struct btrfs_root *root;
8513 struct list_head splice; 8536 struct list_head splice;
@@ -8518,9 +8541,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
8518 8541
8519 INIT_LIST_HEAD(&splice); 8542 INIT_LIST_HEAD(&splice);
8520 8543
8544 mutex_lock(&fs_info->delalloc_root_mutex);
8521 spin_lock(&fs_info->delalloc_root_lock); 8545 spin_lock(&fs_info->delalloc_root_lock);
8522 list_splice_init(&fs_info->delalloc_roots, &splice); 8546 list_splice_init(&fs_info->delalloc_roots, &splice);
8523 while (!list_empty(&splice)) { 8547 while (!list_empty(&splice) && nr) {
8524 root = list_first_entry(&splice, struct btrfs_root, 8548 root = list_first_entry(&splice, struct btrfs_root,
8525 delalloc_root); 8549 delalloc_root);
8526 root = btrfs_grab_fs_root(root); 8550 root = btrfs_grab_fs_root(root);
@@ -8529,15 +8553,20 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
8529 &fs_info->delalloc_roots); 8553 &fs_info->delalloc_roots);
8530 spin_unlock(&fs_info->delalloc_root_lock); 8554 spin_unlock(&fs_info->delalloc_root_lock);
8531 8555
8532 ret = __start_delalloc_inodes(root, delay_iput); 8556 ret = __start_delalloc_inodes(root, delay_iput, nr);
8533 btrfs_put_fs_root(root); 8557 btrfs_put_fs_root(root);
8534 if (ret) 8558 if (ret < 0)
8535 goto out; 8559 goto out;
8536 8560
8561 if (nr != -1) {
8562 nr -= ret;
8563 WARN_ON(nr < 0);
8564 }
8537 spin_lock(&fs_info->delalloc_root_lock); 8565 spin_lock(&fs_info->delalloc_root_lock);
8538 } 8566 }
8539 spin_unlock(&fs_info->delalloc_root_lock); 8567 spin_unlock(&fs_info->delalloc_root_lock);
8540 8568
8569 ret = 0;
8541 atomic_inc(&fs_info->async_submit_draining); 8570 atomic_inc(&fs_info->async_submit_draining);
8542 while (atomic_read(&fs_info->nr_async_submits) || 8571 while (atomic_read(&fs_info->nr_async_submits) ||
8543 atomic_read(&fs_info->async_delalloc_pages)) { 8572 atomic_read(&fs_info->async_delalloc_pages)) {
@@ -8546,13 +8575,13 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
8546 atomic_read(&fs_info->async_delalloc_pages) == 0)); 8575 atomic_read(&fs_info->async_delalloc_pages) == 0));
8547 } 8576 }
8548 atomic_dec(&fs_info->async_submit_draining); 8577 atomic_dec(&fs_info->async_submit_draining);
8549 return 0;
8550out: 8578out:
8551 if (!list_empty_careful(&splice)) { 8579 if (!list_empty_careful(&splice)) {
8552 spin_lock(&fs_info->delalloc_root_lock); 8580 spin_lock(&fs_info->delalloc_root_lock);
8553 list_splice_tail(&splice, &fs_info->delalloc_roots); 8581 list_splice_tail(&splice, &fs_info->delalloc_roots);
8554 spin_unlock(&fs_info->delalloc_root_lock); 8582 spin_unlock(&fs_info->delalloc_root_lock);
8555 } 8583 }
8584 mutex_unlock(&fs_info->delalloc_root_mutex);
8556 return ret; 8585 return ret;
8557} 8586}
8558 8587
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a6d8efa46bfe..0401397b5c92 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -59,6 +59,32 @@
59#include "props.h" 59#include "props.h"
60#include "sysfs.h" 60#include "sysfs.h"
61 61
62#ifdef CONFIG_64BIT
63/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
64 * structures are incorrect, as the timespec structure from userspace
65 * is 4 bytes too small. We define these alternatives here to teach
66 * the kernel about the 32-bit struct packing.
67 */
68struct btrfs_ioctl_timespec_32 {
69 __u64 sec;
70 __u32 nsec;
71} __attribute__ ((__packed__));
72
73struct btrfs_ioctl_received_subvol_args_32 {
74 char uuid[BTRFS_UUID_SIZE]; /* in */
75 __u64 stransid; /* in */
76 __u64 rtransid; /* out */
77 struct btrfs_ioctl_timespec_32 stime; /* in */
78 struct btrfs_ioctl_timespec_32 rtime; /* out */
79 __u64 flags; /* in */
80 __u64 reserved[16]; /* in */
81} __attribute__ ((__packed__));
82
83#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
84 struct btrfs_ioctl_received_subvol_args_32)
85#endif
86
87
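
The packed mirror structures above exist because the legacy UAPI embedded a { __u64 sec; __u32 nsec; } timespec: under the 64-bit ABI the 8-byte alignment of the u64 pads that struct to 16 bytes, while a 32-bit x86 userspace builds it as 12 bytes, so the argument sizes, and with them the _IOWR() ioctl numbers, disagree. Packing pins the 12-byte layout the 32-bit caller actually sends. A user-space demonstration of the size difference, with stand-in types carrying the same layout idea:

/* Why the packed _32 mirror exists: { u64; u32 } pads to 16 bytes
 * under the 64-bit ABI (8-byte u64 alignment) but to 12 under i386's
 * 4-byte u64 alignment. Packing pins the 12-byte layout everywhere. */
#include <stdio.h>
#include <stdint.h>

struct ts_native { uint64_t sec; uint32_t nsec; };  /* 16 on x86_64 */
struct ts_packed {
    uint64_t sec;
    uint32_t nsec;
} __attribute__((__packed__));                      /* 12 everywhere */

int main(void)
{
    printf("native : %zu bytes\n", sizeof(struct ts_native));
    printf("packed : %zu bytes\n", sizeof(struct ts_packed));
    /* two of these per args struct, so the args structs differ in size
     * and the kernel must accept both layouts, which is what the
     * #ifdef CONFIG_64BIT block above arranges */
    return 0;
}
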
62static int btrfs_clone(struct inode *src, struct inode *inode, 88static int btrfs_clone(struct inode *src, struct inode *inode,
63 u64 off, u64 olen, u64 olen_aligned, u64 destoff); 89 u64 off, u64 olen, u64 olen_aligned, u64 destoff);
64 90
@@ -585,6 +611,23 @@ fail:
585 return ret; 611 return ret;
586} 612}
587 613
614static void btrfs_wait_nocow_write(struct btrfs_root *root)
615{
616 s64 writers;
617 DEFINE_WAIT(wait);
618
619 do {
620 prepare_to_wait(&root->subv_writers->wait, &wait,
621 TASK_UNINTERRUPTIBLE);
622
623 writers = percpu_counter_sum(&root->subv_writers->counter);
624 if (writers)
625 schedule();
626
627 finish_wait(&root->subv_writers->wait, &wait);
628 } while (writers);
629}
630
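
btrfs_wait_nocow_write() is the snapshot side of the gate added to check_can_nocow() in file.c above: create_snapshot() first raises will_be_snapshoted, then parks on the subvolume's waitqueue until the per-cpu count of in-flight nocow writers drains to zero. A pthread model of that drain, with a condition variable standing in for the kernel waitqueue; illustrative only:

/* pthread model of btrfs_wait_nocow_write(): the snapshotter sleeps
 * until the writer count drains to zero. */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
static int writers;

static void *nocow_writer(void *arg)
{
    (void)arg;
    usleep(10000);                      /* pretend to write for a bit */
    pthread_mutex_lock(&lock);
    if (--writers == 0)
        pthread_cond_signal(&drained);  /* kernel: wake_up() on the queue */
    pthread_mutex_unlock(&lock);
    return NULL;
}

static void wait_nocow_write(void)
{
    pthread_mutex_lock(&lock);
    while (writers)                     /* kernel: the prepare_to_wait loop */
        pthread_cond_wait(&drained, &lock);
    pthread_mutex_unlock(&lock);
}

int main(void)
{
    pthread_t t;

    writers = 1;                        /* one nocow write in flight */
    pthread_create(&t, NULL, nocow_writer, NULL);
    wait_nocow_write();
    puts("writers drained, safe to snapshot");
    pthread_join(t, NULL);
    return 0;
}
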
588static int create_snapshot(struct btrfs_root *root, struct inode *dir, 631static int create_snapshot(struct btrfs_root *root, struct inode *dir,
589 struct dentry *dentry, char *name, int namelen, 632 struct dentry *dentry, char *name, int namelen,
590 u64 *async_transid, bool readonly, 633 u64 *async_transid, bool readonly,
@@ -598,15 +641,21 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
598 if (!root->ref_cows) 641 if (!root->ref_cows)
599 return -EINVAL; 642 return -EINVAL;
600 643
644 atomic_inc(&root->will_be_snapshoted);
645 smp_mb__after_atomic_inc();
646 btrfs_wait_nocow_write(root);
647
601 ret = btrfs_start_delalloc_inodes(root, 0); 648 ret = btrfs_start_delalloc_inodes(root, 0);
602 if (ret) 649 if (ret)
603 return ret; 650 goto out;
604 651
605 btrfs_wait_ordered_extents(root, -1); 652 btrfs_wait_ordered_extents(root, -1);
606 653
607 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 654 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
608 if (!pending_snapshot) 655 if (!pending_snapshot) {
609 return -ENOMEM; 656 ret = -ENOMEM;
657 goto out;
658 }
610 659
611 btrfs_init_block_rsv(&pending_snapshot->block_rsv, 660 btrfs_init_block_rsv(&pending_snapshot->block_rsv,
612 BTRFS_BLOCK_RSV_TEMP); 661 BTRFS_BLOCK_RSV_TEMP);
@@ -623,7 +672,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
623 &pending_snapshot->qgroup_reserved, 672 &pending_snapshot->qgroup_reserved,
624 false); 673 false);
625 if (ret) 674 if (ret)
626 goto out; 675 goto free;
627 676
628 pending_snapshot->dentry = dentry; 677 pending_snapshot->dentry = dentry;
629 pending_snapshot->root = root; 678 pending_snapshot->root = root;
@@ -674,8 +723,10 @@ fail:
674 btrfs_subvolume_release_metadata(BTRFS_I(dir)->root, 723 btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
675 &pending_snapshot->block_rsv, 724 &pending_snapshot->block_rsv,
676 pending_snapshot->qgroup_reserved); 725 pending_snapshot->qgroup_reserved);
677out: 726free:
678 kfree(pending_snapshot); 727 kfree(pending_snapshot);
728out:
729 atomic_dec(&root->will_be_snapshoted);
679 return ret; 730 return ret;
680} 731}
681 732
@@ -884,12 +935,14 @@ static int find_new_extents(struct btrfs_root *root,
884 min_key.type = BTRFS_EXTENT_DATA_KEY; 935 min_key.type = BTRFS_EXTENT_DATA_KEY;
885 min_key.offset = *off; 936 min_key.offset = *off;
886 937
887 path->keep_locks = 1;
888
889 while (1) { 938 while (1) {
939 path->keep_locks = 1;
890 ret = btrfs_search_forward(root, &min_key, path, newer_than); 940 ret = btrfs_search_forward(root, &min_key, path, newer_than);
891 if (ret != 0) 941 if (ret != 0)
892 goto none; 942 goto none;
943 path->keep_locks = 0;
944 btrfs_unlock_up_safe(path, 1);
945process_slot:
893 if (min_key.objectid != ino) 946 if (min_key.objectid != ino)
894 goto none; 947 goto none;
895 if (min_key.type != BTRFS_EXTENT_DATA_KEY) 948 if (min_key.type != BTRFS_EXTENT_DATA_KEY)
@@ -908,6 +961,12 @@ static int find_new_extents(struct btrfs_root *root,
908 return 0; 961 return 0;
909 } 962 }
910 963
964 path->slots[0]++;
965 if (path->slots[0] < btrfs_header_nritems(leaf)) {
966 btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
967 goto process_slot;
968 }
969
911 if (min_key.offset == (u64)-1) 970 if (min_key.offset == (u64)-1)
912 goto none; 971 goto none;
913 972
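
find_new_extents() used to pay for a full btrfs_search_forward() per extent item; the process_slot label above lets it walk the remaining slots of the leaf it already holds (dropping keep_locks and the upper-level locks once the leaf is pinned) and only re-search when the leaf is exhausted. A toy model of that consume-the-leaf loop, with arrays standing in for leaves:

/* Sketch of the process_slot loop added above: consume the remaining
 * slots of the current leaf before paying for another tree search. */
#include <stdio.h>

#define LEAF_ITEMS 4

/* pretend each search returns a leaf of up to LEAF_ITEMS keys */
static int search_forward(int start, int *leaf)
{
    if (start >= 10)
        return 1;                       /* no more items */
    for (int i = 0; i < LEAF_ITEMS; i++)
        leaf[i] = start + i;
    return 0;
}

int main(void)
{
    int leaf[LEAF_ITEMS], key = 0, searches = 0;

    while (!search_forward(key, leaf)) {
        searches++;
        for (int slot = 0; slot < LEAF_ITEMS; slot++) {
            if (leaf[slot] >= 10)
                goto done;
            printf("process item %d\n", leaf[slot]); /* process_slot: */
        }
        key = leaf[LEAF_ITEMS - 1] + 1; /* continue after this leaf */
    }
done:
    printf("%d searches for 10 items\n", searches);
    return 0;
}
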
@@ -935,10 +994,13 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
935 read_unlock(&em_tree->lock); 994 read_unlock(&em_tree->lock);
936 995
937 if (!em) { 996 if (!em) {
997 struct extent_state *cached = NULL;
998 u64 end = start + len - 1;
999
938 /* get the big lock and read metadata off disk */ 1000 /* get the big lock and read metadata off disk */
939 lock_extent(io_tree, start, start + len - 1); 1001 lock_extent_bits(io_tree, start, end, 0, &cached);
940 em = btrfs_get_extent(inode, NULL, 0, start, len, 0); 1002 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
941 unlock_extent(io_tree, start, start + len - 1); 1003 unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
942 1004
943 if (IS_ERR(em)) 1005 if (IS_ERR(em))
944 return NULL; 1006 return NULL;
@@ -957,7 +1019,8 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
957 return false; 1019 return false;
958 1020
959 next = defrag_lookup_extent(inode, em->start + em->len); 1021 next = defrag_lookup_extent(inode, em->start + em->len);
960 if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) 1022 if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE ||
1023 (em->block_start + em->block_len == next->block_start))
961 ret = false; 1024 ret = false;
962 1025
963 free_extent_map(next); 1026 free_extent_map(next);
@@ -1076,10 +1139,12 @@ again:
1076 page_start = page_offset(page); 1139 page_start = page_offset(page);
1077 page_end = page_start + PAGE_CACHE_SIZE - 1; 1140 page_end = page_start + PAGE_CACHE_SIZE - 1;
1078 while (1) { 1141 while (1) {
1079 lock_extent(tree, page_start, page_end); 1142 lock_extent_bits(tree, page_start, page_end,
1143 0, &cached_state);
1080 ordered = btrfs_lookup_ordered_extent(inode, 1144 ordered = btrfs_lookup_ordered_extent(inode,
1081 page_start); 1145 page_start);
1082 unlock_extent(tree, page_start, page_end); 1146 unlock_extent_cached(tree, page_start, page_end,
1147 &cached_state, GFP_NOFS);
1083 if (!ordered) 1148 if (!ordered)
1084 break; 1149 break;
1085 1150
@@ -1356,8 +1421,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1356 } 1421 }
1357 } 1422 }
1358 1423
1359 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) 1424 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
1360 filemap_flush(inode->i_mapping); 1425 filemap_flush(inode->i_mapping);
1426 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1427 &BTRFS_I(inode)->runtime_flags))
1428 filemap_flush(inode->i_mapping);
1429 }
1361 1430
1362 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { 1431 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
1363 /* the filemap_flush will queue IO into the worker threads, but 1432 /* the filemap_flush will queue IO into the worker threads, but
@@ -1573,7 +1642,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
1573 if (src_inode->i_sb != file_inode(file)->i_sb) { 1642 if (src_inode->i_sb != file_inode(file)->i_sb) {
1574 btrfs_info(BTRFS_I(src_inode)->root->fs_info, 1643 btrfs_info(BTRFS_I(src_inode)->root->fs_info,
1575 "Snapshot src from another FS"); 1644 "Snapshot src from another FS");
1576 ret = -EINVAL; 1645 ret = -EXDEV;
1577 } else if (!inode_owner_or_capable(src_inode)) { 1646 } else if (!inode_owner_or_capable(src_inode)) {
1578 /* 1647 /*
1579 * Subvolume creation is not restricted, but snapshots 1648 * Subvolume creation is not restricted, but snapshots
@@ -1797,7 +1866,9 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
1797 if (di && !IS_ERR(di)) { 1866 if (di && !IS_ERR(di)) {
1798 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); 1867 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
1799 if (key.objectid == root->root_key.objectid) { 1868 if (key.objectid == root->root_key.objectid) {
1800 ret = -ENOTEMPTY; 1869 ret = -EPERM;
1870 btrfs_err(root->fs_info, "deleting default subvolume "
1871 "%llu is not allowed", key.objectid);
1801 goto out; 1872 goto out;
1802 } 1873 }
1803 btrfs_release_path(path); 1874 btrfs_release_path(path);
@@ -2994,8 +3065,9 @@ process_slot:
2994 new_key.offset + datal, 3065 new_key.offset + datal,
2995 1); 3066 1);
2996 if (ret) { 3067 if (ret) {
2997 btrfs_abort_transaction(trans, root, 3068 if (ret != -EINVAL)
2998 ret); 3069 btrfs_abort_transaction(trans,
3070 root, ret);
2999 btrfs_end_transaction(trans, root); 3071 btrfs_end_transaction(trans, root);
3000 goto out; 3072 goto out;
3001 } 3073 }
@@ -3153,8 +3225,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
3153 * decompress into destination's address_space (the file offset 3225 * decompress into destination's address_space (the file offset
3154 * may change, so source mapping won't do), then recompress (or 3226 * may change, so source mapping won't do), then recompress (or
3155 * otherwise reinsert) a subrange. 3227 * otherwise reinsert) a subrange.
3156 * - allow ranges within the same file to be cloned (provided 3228 *
3157 * they don't overlap)? 3229 * - split destination inode's inline extents. The inline extents can
3230 * be either compressed or non-compressed.
3158 */ 3231 */
3159 3232
3160 /* the destination must be opened for writing */ 3233 /* the destination must be opened for writing */
@@ -4353,10 +4426,9 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
4353 return btrfs_qgroup_wait_for_completion(root->fs_info); 4426 return btrfs_qgroup_wait_for_completion(root->fs_info);
4354} 4427}
4355 4428
4356static long btrfs_ioctl_set_received_subvol(struct file *file, 4429static long _btrfs_ioctl_set_received_subvol(struct file *file,
4357 void __user *arg) 4430 struct btrfs_ioctl_received_subvol_args *sa)
4358{ 4431{
4359 struct btrfs_ioctl_received_subvol_args *sa = NULL;
4360 struct inode *inode = file_inode(file); 4432 struct inode *inode = file_inode(file);
4361 struct btrfs_root *root = BTRFS_I(inode)->root; 4433 struct btrfs_root *root = BTRFS_I(inode)->root;
4362 struct btrfs_root_item *root_item = &root->root_item; 4434 struct btrfs_root_item *root_item = &root->root_item;
@@ -4384,13 +4456,6 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
4384 goto out; 4456 goto out;
4385 } 4457 }
4386 4458
4387 sa = memdup_user(arg, sizeof(*sa));
4388 if (IS_ERR(sa)) {
4389 ret = PTR_ERR(sa);
4390 sa = NULL;
4391 goto out;
4392 }
4393
4394 /* 4459 /*
4395 * 1 - root item 4460 * 1 - root item
4396 * 2 - uuid items (received uuid + subvol uuid) 4461 * 2 - uuid items (received uuid + subvol uuid)
@@ -4444,14 +4509,91 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
4444 goto out; 4509 goto out;
4445 } 4510 }
4446 4511
4512out:
4513 up_write(&root->fs_info->subvol_sem);
4514 mnt_drop_write_file(file);
4515 return ret;
4516}
4517
4518#ifdef CONFIG_64BIT
4519static long btrfs_ioctl_set_received_subvol_32(struct file *file,
4520 void __user *arg)
4521{
4522 struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL;
4523 struct btrfs_ioctl_received_subvol_args *args64 = NULL;
4524 int ret = 0;
4525
4526 args32 = memdup_user(arg, sizeof(*args32));
4527 if (IS_ERR(args32)) {
4528 ret = PTR_ERR(args32);
4529 args32 = NULL;
4530 goto out;
4531 }
4532
4533 args64 = kmalloc(sizeof(*args64), GFP_NOFS);
4534 if (IS_ERR(args64)) {
4535 ret = PTR_ERR(args64);
4536 args64 = NULL;
4537 goto out;
4538 }
4539
4540 memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE);
4541 args64->stransid = args32->stransid;
4542 args64->rtransid = args32->rtransid;
4543 args64->stime.sec = args32->stime.sec;
4544 args64->stime.nsec = args32->stime.nsec;
4545 args64->rtime.sec = args32->rtime.sec;
4546 args64->rtime.nsec = args32->rtime.nsec;
4547 args64->flags = args32->flags;
4548
4549 ret = _btrfs_ioctl_set_received_subvol(file, args64);
4550 if (ret)
4551 goto out;
4552
4553 memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE);
4554 args32->stransid = args64->stransid;
4555 args32->rtransid = args64->rtransid;
4556 args32->stime.sec = args64->stime.sec;
4557 args32->stime.nsec = args64->stime.nsec;
4558 args32->rtime.sec = args64->rtime.sec;
4559 args32->rtime.nsec = args64->rtime.nsec;
4560 args32->flags = args64->flags;
4561
4562 ret = copy_to_user(arg, args32, sizeof(*args32));
4563 if (ret)
4564 ret = -EFAULT;
4565
4566out:
4567 kfree(args32);
4568 kfree(args64);
4569 return ret;
4570}
4571#endif
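
One nit in the hunk above: kmalloc() reports allocation failure with NULL rather than an ERR_PTR, so the IS_ERR(args64) test can never fire and a failed allocation would be dereferenced later; the conventional check would be if (!args64) { ret = -ENOMEM; goto out; }.
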
4572
4573static long btrfs_ioctl_set_received_subvol(struct file *file,
4574 void __user *arg)
4575{
4576 struct btrfs_ioctl_received_subvol_args *sa = NULL;
4577 int ret = 0;
4578
4579 sa = memdup_user(arg, sizeof(*sa));
4580 if (IS_ERR(sa)) {
4581 ret = PTR_ERR(sa);
4582 sa = NULL;
4583 goto out;
4584 }
4585
4586 ret = _btrfs_ioctl_set_received_subvol(file, sa);
4587
4588 if (ret)
4589 goto out;
4590
4447 ret = copy_to_user(arg, sa, sizeof(*sa)); 4591 ret = copy_to_user(arg, sa, sizeof(*sa));
4448 if (ret) 4592 if (ret)
4449 ret = -EFAULT; 4593 ret = -EFAULT;
4450 4594
4451out: 4595out:
4452 kfree(sa); 4596 kfree(sa);
4453 up_write(&root->fs_info->subvol_sem);
4454 mnt_drop_write_file(file);
4455 return ret; 4597 return ret;
4456} 4598}
4457 4599
@@ -4746,7 +4888,7 @@ long btrfs_ioctl(struct file *file, unsigned int
4746 case BTRFS_IOC_SYNC: { 4888 case BTRFS_IOC_SYNC: {
4747 int ret; 4889 int ret;
4748 4890
4749 ret = btrfs_start_delalloc_roots(root->fs_info, 0); 4891 ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
4750 if (ret) 4892 if (ret)
4751 return ret; 4893 return ret;
4752 ret = btrfs_sync_fs(file->f_dentry->d_sb, 1); 4894 ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
@@ -4770,6 +4912,10 @@ long btrfs_ioctl(struct file *file, unsigned int
4770 return btrfs_ioctl_balance_progress(root, argp); 4912 return btrfs_ioctl_balance_progress(root, argp);
4771 case BTRFS_IOC_SET_RECEIVED_SUBVOL: 4913 case BTRFS_IOC_SET_RECEIVED_SUBVOL:
4772 return btrfs_ioctl_set_received_subvol(file, argp); 4914 return btrfs_ioctl_set_received_subvol(file, argp);
4915#ifdef CONFIG_64BIT
4916 case BTRFS_IOC_SET_RECEIVED_SUBVOL_32:
4917 return btrfs_ioctl_set_received_subvol_32(file, argp);
4918#endif
4773 case BTRFS_IOC_SEND: 4919 case BTRFS_IOC_SEND:
4774 return btrfs_ioctl_send(file, argp); 4920 return btrfs_ioctl_send(file, argp);
4775 case BTRFS_IOC_GET_DEV_STATS: 4921 case BTRFS_IOC_GET_DEV_STATS:
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b16450b840e7..a94b05f72869 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -349,10 +349,13 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
349 if (!uptodate) 349 if (!uptodate)
350 set_bit(BTRFS_ORDERED_IOERR, &entry->flags); 350 set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
351 351
352 if (entry->bytes_left == 0) 352 if (entry->bytes_left == 0) {
353 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 353 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
354 else 354 if (waitqueue_active(&entry->wait))
355 wake_up(&entry->wait);
356 } else {
355 ret = 1; 357 ret = 1;
358 }
356out: 359out:
357 if (!ret && cached && entry) { 360 if (!ret && cached && entry) {
358 *cached = entry; 361 *cached = entry;
@@ -410,10 +413,13 @@ have_entry:
410 if (!uptodate) 413 if (!uptodate)
411 set_bit(BTRFS_ORDERED_IOERR, &entry->flags); 414 set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
412 415
413 if (entry->bytes_left == 0) 416 if (entry->bytes_left == 0) {
414 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 417 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
415 else 418 if (waitqueue_active(&entry->wait))
419 wake_up(&entry->wait);
420 } else {
416 ret = 1; 421 ret = 1;
422 }
417out: 423out:
418 if (!ret && cached && entry) { 424 if (!ret && cached && entry) {
419 *cached = entry; 425 *cached = entry;
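
Both btrfs_dec_test_*_ordered_pending() helpers now wake entry->wait the moment bytes_left reaches zero, so tasks blocked on the ordered extent can proceed as soon as its last bio completes rather than only after the whole finish path has run; the waitqueue_active() test just skips the wakeup cost when nobody is sleeping there. A pthread model of the countdown-then-wake shape, with a condition variable in place of the waitqueue; illustrative only:

/* Model of the early wakeup added above: waiters block until
 * bytes_left reaches zero; completion signals right at that point. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wait_q = PTHREAD_COND_INITIALIZER;
static unsigned long bytes_left = 8192;

static void *complete_io(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&lock);
    while (bytes_left) {
        bytes_left -= 4096;             /* one bio's worth finished */
        if (bytes_left == 0)
            pthread_cond_broadcast(&wait_q);  /* wake_up(&entry->wait) */
    }
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, complete_io, NULL);
    pthread_mutex_lock(&lock);
    while (bytes_left)
        pthread_cond_wait(&wait_q, &lock);
    pthread_mutex_unlock(&lock);
    puts("ordered extent IO done");
    pthread_join(t, NULL);
    return 0;
}
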
@@ -424,27 +430,48 @@ out:
424} 430}
425 431
426/* Needs to either be called under a log transaction or the log_mutex */ 432/* Needs to either be called under a log transaction or the log_mutex */
427void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode) 433void btrfs_get_logged_extents(struct inode *inode,
434 struct list_head *logged_list)
428{ 435{
429 struct btrfs_ordered_inode_tree *tree; 436 struct btrfs_ordered_inode_tree *tree;
430 struct btrfs_ordered_extent *ordered; 437 struct btrfs_ordered_extent *ordered;
431 struct rb_node *n; 438 struct rb_node *n;
432 int index = log->log_transid % 2;
433 439
434 tree = &BTRFS_I(inode)->ordered_tree; 440 tree = &BTRFS_I(inode)->ordered_tree;
435 spin_lock_irq(&tree->lock); 441 spin_lock_irq(&tree->lock);
436 for (n = rb_first(&tree->tree); n; n = rb_next(n)) { 442 for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
437 ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); 443 ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
438 spin_lock(&log->log_extents_lock[index]); 444 if (!list_empty(&ordered->log_list))
439 if (list_empty(&ordered->log_list)) { 445 continue;
440 list_add_tail(&ordered->log_list, &log->logged_list[index]); 446 list_add_tail(&ordered->log_list, logged_list);
441 atomic_inc(&ordered->refs); 447 atomic_inc(&ordered->refs);
442 }
443 spin_unlock(&log->log_extents_lock[index]);
444 } 448 }
445 spin_unlock_irq(&tree->lock); 449 spin_unlock_irq(&tree->lock);
446} 450}
447 451
452void btrfs_put_logged_extents(struct list_head *logged_list)
453{
454 struct btrfs_ordered_extent *ordered;
455
456 while (!list_empty(logged_list)) {
457 ordered = list_first_entry(logged_list,
458 struct btrfs_ordered_extent,
459 log_list);
460 list_del_init(&ordered->log_list);
461 btrfs_put_ordered_extent(ordered);
462 }
463}
464
465void btrfs_submit_logged_extents(struct list_head *logged_list,
466 struct btrfs_root *log)
467{
468 int index = log->log_transid % 2;
469
470 spin_lock_irq(&log->log_extents_lock[index]);
471 list_splice_tail(logged_list, &log->logged_list[index]);
472 spin_unlock_irq(&log->log_extents_lock[index]);
473}
474
448void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) 475void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
449{ 476{
450 struct btrfs_ordered_extent *ordered; 477 struct btrfs_ordered_extent *ordered;
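
btrfs_get_logged_extents() no longer publishes ordered extents straight onto the log root's shared logged_list[index] under log_extents_lock; each logging task now collects references on its own caller-supplied list and either splices the whole batch into the log with btrfs_submit_logged_extents() or drops the references with btrfs_put_logged_extents() if logging aborts. A simplified model of that collect-then-splice-or-put flow, using a singly linked list and plain counters instead of list_head and the kernel's refcounted ordered extents:

/* Model of the collect-then-splice flow above: gather refs on a local
 * list, then either publish the whole batch to the log or put every
 * ref on failure. */
#include <stdio.h>

struct ordered_extent {
    int refs;
    struct ordered_extent *next;
};

static void get_logged_extents(struct ordered_extent *tree, int n,
                               struct ordered_extent **logged_list)
{
    for (int i = 0; i < n; i++) {       /* kernel: under tree->lock */
        tree[i].refs++;
        tree[i].next = *logged_list;
        *logged_list = &tree[i];
    }
}

static void put_logged_extents(struct ordered_extent **logged_list)
{
    while (*logged_list) {
        struct ordered_extent *o = *logged_list;

        *logged_list = o->next;
        o->refs--;                      /* kernel: btrfs_put_ordered_extent */
    }
}

int main(void)
{
    struct ordered_extent tree[2] = { { 1, NULL }, { 1, NULL } };
    struct ordered_extent *logged = NULL;

    get_logged_extents(tree, 2, &logged);
    /* on success we would splice 'logged' into the log's list instead */
    put_logged_extents(&logged);        /* log aborted: drop the refs */
    printf("refs back to %d and %d\n", tree[0].refs, tree[1].refs);
    return 0;
}
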
@@ -577,7 +604,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
577 INIT_LIST_HEAD(&splice); 604 INIT_LIST_HEAD(&splice);
578 INIT_LIST_HEAD(&works); 605 INIT_LIST_HEAD(&works);
579 606
580 mutex_lock(&root->fs_info->ordered_operations_mutex); 607 mutex_lock(&root->ordered_extent_mutex);
581 spin_lock(&root->ordered_extent_lock); 608 spin_lock(&root->ordered_extent_lock);
582 list_splice_init(&root->ordered_extents, &splice); 609 list_splice_init(&root->ordered_extents, &splice);
583 while (!list_empty(&splice) && nr) { 610 while (!list_empty(&splice) && nr) {
@@ -588,10 +615,11 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
588 atomic_inc(&ordered->refs); 615 atomic_inc(&ordered->refs);
589 spin_unlock(&root->ordered_extent_lock); 616 spin_unlock(&root->ordered_extent_lock);
590 617
591 ordered->flush_work.func = btrfs_run_ordered_extent_work; 618 btrfs_init_work(&ordered->flush_work,
619 btrfs_run_ordered_extent_work, NULL, NULL);
592 list_add_tail(&ordered->work_list, &works); 620 list_add_tail(&ordered->work_list, &works);
593 btrfs_queue_worker(&root->fs_info->flush_workers, 621 btrfs_queue_work(root->fs_info->flush_workers,
594 &ordered->flush_work); 622 &ordered->flush_work);
595 623
596 cond_resched(); 624 cond_resched();
597 spin_lock(&root->ordered_extent_lock); 625 spin_lock(&root->ordered_extent_lock);
@@ -608,7 +636,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
608 btrfs_put_ordered_extent(ordered); 636 btrfs_put_ordered_extent(ordered);
609 cond_resched(); 637 cond_resched();
610 } 638 }
611 mutex_unlock(&root->fs_info->ordered_operations_mutex); 639 mutex_unlock(&root->ordered_extent_mutex);
612 640
613 return count; 641 return count;
614} 642}
@@ -621,6 +649,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
621 649
622 INIT_LIST_HEAD(&splice); 650 INIT_LIST_HEAD(&splice);
623 651
652 mutex_lock(&fs_info->ordered_operations_mutex);
624 spin_lock(&fs_info->ordered_root_lock); 653 spin_lock(&fs_info->ordered_root_lock);
625 list_splice_init(&fs_info->ordered_roots, &splice); 654 list_splice_init(&fs_info->ordered_roots, &splice);
626 while (!list_empty(&splice) && nr) { 655 while (!list_empty(&splice) && nr) {
@@ -643,6 +672,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
643 } 672 }
644 list_splice_tail(&splice, &fs_info->ordered_roots); 673 list_splice_tail(&splice, &fs_info->ordered_roots);
645 spin_unlock(&fs_info->ordered_root_lock); 674 spin_unlock(&fs_info->ordered_root_lock);
675 mutex_unlock(&fs_info->ordered_operations_mutex);
646} 676}
647 677
648/* 678/*
@@ -704,8 +734,8 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
704 goto out; 734 goto out;
705 } 735 }
706 list_add_tail(&work->list, &works); 736 list_add_tail(&work->list, &works);
707 btrfs_queue_worker(&root->fs_info->flush_workers, 737 btrfs_queue_work(root->fs_info->flush_workers,
708 &work->work); 738 &work->work);
709 739
710 cond_resched(); 740 cond_resched();
711 spin_lock(&root->fs_info->ordered_root_lock); 741 spin_lock(&root->fs_info->ordered_root_lock);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 9b0450f7ac20..246897058efb 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -197,7 +197,11 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
197 struct inode *inode); 197 struct inode *inode);
198int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); 198int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
199void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); 199void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
200void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode); 200void btrfs_get_logged_extents(struct inode *inode,
201 struct list_head *logged_list);
202void btrfs_put_logged_extents(struct list_head *logged_list);
203void btrfs_submit_logged_extents(struct list_head *logged_list,
204 struct btrfs_root *log);
201void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); 205void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
202void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); 206void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
203int __init ordered_data_init(void); 207int __init ordered_data_init(void);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 472302a2d745..2cf905877aaf 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1509,8 +1509,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
1509 ret = qgroup_rescan_init(fs_info, 0, 1); 1509 ret = qgroup_rescan_init(fs_info, 0, 1);
1510 if (!ret) { 1510 if (!ret) {
1511 qgroup_rescan_zero_tracking(fs_info); 1511 qgroup_rescan_zero_tracking(fs_info);
1512 btrfs_queue_worker(&fs_info->qgroup_rescan_workers, 1512 btrfs_queue_work(fs_info->qgroup_rescan_workers,
1513 &fs_info->qgroup_rescan_work); 1513 &fs_info->qgroup_rescan_work);
1514 } 1514 }
1515 ret = 0; 1515 ret = 0;
1516 } 1516 }
@@ -2095,7 +2095,8 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
2095 2095
2096 memset(&fs_info->qgroup_rescan_work, 0, 2096 memset(&fs_info->qgroup_rescan_work, 0,
2097 sizeof(fs_info->qgroup_rescan_work)); 2097 sizeof(fs_info->qgroup_rescan_work));
2098 fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker; 2098 btrfs_init_work(&fs_info->qgroup_rescan_work,
2099 btrfs_qgroup_rescan_worker, NULL, NULL);
2099 2100
2100 if (ret) { 2101 if (ret) {
2101err: 2102err:
@@ -2158,8 +2159,8 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
2158 2159
2159 qgroup_rescan_zero_tracking(fs_info); 2160 qgroup_rescan_zero_tracking(fs_info);
2160 2161
2161 btrfs_queue_worker(&fs_info->qgroup_rescan_workers, 2162 btrfs_queue_work(fs_info->qgroup_rescan_workers,
2162 &fs_info->qgroup_rescan_work); 2163 &fs_info->qgroup_rescan_work);
2163 2164
2164 return 0; 2165 return 0;
2165} 2166}
@@ -2190,6 +2191,6 @@ void
2190btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) 2191btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
2191{ 2192{
2192 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) 2193 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
2193 btrfs_queue_worker(&fs_info->qgroup_rescan_workers, 2194 btrfs_queue_work(fs_info->qgroup_rescan_workers,
2194 &fs_info->qgroup_rescan_work); 2195 &fs_info->qgroup_rescan_work);
2195} 2196}
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 9af0b25d991a..4055291a523e 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1416,20 +1416,18 @@ cleanup:
1416 1416
1417static void async_rmw_stripe(struct btrfs_raid_bio *rbio) 1417static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1418{ 1418{
1419 rbio->work.flags = 0; 1419 btrfs_init_work(&rbio->work, rmw_work, NULL, NULL);
1420 rbio->work.func = rmw_work;
1421 1420
1422 btrfs_queue_worker(&rbio->fs_info->rmw_workers, 1421 btrfs_queue_work(rbio->fs_info->rmw_workers,
1423 &rbio->work); 1422 &rbio->work);
1424} 1423}
1425 1424
1426static void async_read_rebuild(struct btrfs_raid_bio *rbio) 1425static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1427{ 1426{
1428 rbio->work.flags = 0; 1427 btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL);
1429 rbio->work.func = read_rebuild_work;
1430 1428
1431 btrfs_queue_worker(&rbio->fs_info->rmw_workers, 1429 btrfs_queue_work(rbio->fs_info->rmw_workers,
1432 &rbio->work); 1430 &rbio->work);
1433} 1431}
1434 1432
1435/* 1433/*
@@ -1667,10 +1665,9 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1667 plug = container_of(cb, struct btrfs_plug_cb, cb); 1665 plug = container_of(cb, struct btrfs_plug_cb, cb);
1668 1666
1669 if (from_schedule) { 1667 if (from_schedule) {
1670 plug->work.flags = 0; 1668 btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
1671 plug->work.func = unplug_work; 1669 btrfs_queue_work(plug->info->rmw_workers,
1672 btrfs_queue_worker(&plug->info->rmw_workers, 1670 &plug->work);
1673 &plug->work);
1674 return; 1671 return;
1675 } 1672 }
1676 run_plug(plug); 1673 run_plug(plug);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 31c797c48c3e..30947f923620 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -793,10 +793,10 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info)
793 /* FIXME we cannot handle this properly right now */ 793 /* FIXME we cannot handle this properly right now */
794 BUG(); 794 BUG();
795 } 795 }
796 rmw->work.func = reada_start_machine_worker; 796 btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
797 rmw->fs_info = fs_info; 797 rmw->fs_info = fs_info;
798 798
799 btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work); 799 btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
800} 800}
801 801
802#ifdef DEBUG 802#ifdef DEBUG
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 07b3b36f40ee..def428a25b2a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4248,7 +4248,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4248 btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu", 4248 btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu",
4249 rc->block_group->key.objectid, rc->block_group->flags); 4249 rc->block_group->key.objectid, rc->block_group->flags);
4250 4250
4251 ret = btrfs_start_delalloc_roots(fs_info, 0); 4251 ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
4252 if (ret < 0) { 4252 if (ret < 0) {
4253 err = ret; 4253 err = ret;
4254 goto out; 4254 goto out;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 1389b69059de..38bb47e7d6b1 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -16,6 +16,7 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/err.h>
19#include <linux/uuid.h> 20#include <linux/uuid.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "transaction.h" 22#include "transaction.h"
@@ -271,7 +272,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
271 key.offset++; 272 key.offset++;
272 273
273 root = btrfs_read_fs_root(tree_root, &root_key); 274 root = btrfs_read_fs_root(tree_root, &root_key);
274 err = PTR_RET(root); 275 err = PTR_ERR_OR_ZERO(root);
275 if (err && err != -ENOENT) { 276 if (err && err != -ENOENT) {
276 break; 277 break;
277 } else if (err == -ENOENT) { 278 } else if (err == -ENOENT) {
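
PTR_ERR_OR_ZERO() is the better-named replacement for PTR_RET(): it returns 0 for a valid pointer and the encoded errno for an ERR_PTR value, collapsing the usual IS_ERR()/PTR_ERR() pair. A sketch mirroring the branch in the hunk above:

#include <linux/err.h>

static int root_lookup_status(struct btrfs_root *root)
{
	int err = PTR_ERR_OR_ZERO(root);	/* 0 unless root is an ERR_PTR */

	if (err && err != -ENOENT)
		return err;			/* hard failure */
	return 0;				/* valid root, or a tolerated -ENOENT */
}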
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index efba5d1282ee..93e6d7172844 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -315,6 +315,16 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
315 atomic_inc(&fs_info->scrubs_running); 315 atomic_inc(&fs_info->scrubs_running);
316 atomic_inc(&fs_info->scrubs_paused); 316 atomic_inc(&fs_info->scrubs_paused);
317 mutex_unlock(&fs_info->scrub_lock); 317 mutex_unlock(&fs_info->scrub_lock);
318
319 /*
320 * The check of the @scrubs_running == @scrubs_paused condition
321 * inside wait_event() is not an atomic operation, which means
322 * @scrubs_running/@scrubs_paused may be inc'ed/dec'ed at any
323 * time. Wake up @scrub_pause_wait as often as we can so that
324 * a blocked transaction commit waits as little as possible.
325 */
326 wake_up(&fs_info->scrub_pause_wait);
327
318 atomic_inc(&sctx->workers_pending); 328 atomic_inc(&sctx->workers_pending);
319} 329}
320 330
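
The extra wake_up() is safe because a sleeper re-evaluates its condition on every wake-up; a sketch of the presumed waiter being unblocked, assuming the usual scrub pause protocol where a transaction commit sleeps until every running scrub has paused:

	/* transaction commit side (sketch) */
	wait_event(fs_info->scrub_pause_wait,
		   atomic_read(&fs_info->scrubs_running) ==
		   atomic_read(&fs_info->scrubs_paused));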
@@ -418,7 +428,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
418 sbio->index = i; 428 sbio->index = i;
419 sbio->sctx = sctx; 429 sbio->sctx = sctx;
420 sbio->page_count = 0; 430 sbio->page_count = 0;
421 sbio->work.func = scrub_bio_end_io_worker; 431 btrfs_init_work(&sbio->work, scrub_bio_end_io_worker,
432 NULL, NULL);
422 433
423 if (i != SCRUB_BIOS_PER_SCTX - 1) 434 if (i != SCRUB_BIOS_PER_SCTX - 1)
424 sctx->bios[i]->next_free = i + 1; 435 sctx->bios[i]->next_free = i + 1;
@@ -987,9 +998,10 @@ nodatasum_case:
987 fixup_nodatasum->root = fs_info->extent_root; 998 fixup_nodatasum->root = fs_info->extent_root;
988 fixup_nodatasum->mirror_num = failed_mirror_index + 1; 999 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
989 scrub_pending_trans_workers_inc(sctx); 1000 scrub_pending_trans_workers_inc(sctx);
990 fixup_nodatasum->work.func = scrub_fixup_nodatasum; 1001 btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum,
991 btrfs_queue_worker(&fs_info->scrub_workers, 1002 NULL, NULL);
992 &fixup_nodatasum->work); 1003 btrfs_queue_work(fs_info->scrub_workers,
1004 &fixup_nodatasum->work);
993 goto out; 1005 goto out;
994 } 1006 }
995 1007
@@ -1603,8 +1615,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err)
1603 sbio->err = err; 1615 sbio->err = err;
1604 sbio->bio = bio; 1616 sbio->bio = bio;
1605 1617
1606 sbio->work.func = scrub_wr_bio_end_io_worker; 1618 btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
1607 btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work); 1619 btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1608} 1620}
1609 1621
1610static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) 1622static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
@@ -2072,7 +2084,7 @@ static void scrub_bio_end_io(struct bio *bio, int err)
2072 sbio->err = err; 2084 sbio->err = err;
2073 sbio->bio = bio; 2085 sbio->bio = bio;
2074 2086
2075 btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); 2087 btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2076} 2088}
2077 2089
2078static void scrub_bio_end_io_worker(struct btrfs_work *work) 2090static void scrub_bio_end_io_worker(struct btrfs_work *work)
@@ -2686,10 +2698,23 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2686 2698
2687 wait_event(sctx->list_wait, 2699 wait_event(sctx->list_wait,
2688 atomic_read(&sctx->bios_in_flight) == 0); 2700 atomic_read(&sctx->bios_in_flight) == 0);
2689 atomic_set(&sctx->wr_ctx.flush_all_writes, 0); 2701 atomic_inc(&fs_info->scrubs_paused);
2702 wake_up(&fs_info->scrub_pause_wait);
2703
2704 /*
2705 * This must be done before we decrease @scrubs_paused, to
2706 * make sure we don't block a transaction commit while we
2707 * are waiting for the pending workers to finish.
2708 */
2690 wait_event(sctx->list_wait, 2709 wait_event(sctx->list_wait,
2691 atomic_read(&sctx->workers_pending) == 0); 2710 atomic_read(&sctx->workers_pending) == 0);
2692 scrub_blocked_if_needed(fs_info); 2711 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2712
2713 mutex_lock(&fs_info->scrub_lock);
2714 __scrub_blocked_if_needed(fs_info);
2715 atomic_dec(&fs_info->scrubs_paused);
2716 mutex_unlock(&fs_info->scrub_lock);
2717 wake_up(&fs_info->scrub_pause_wait);
2693 2718
2694 btrfs_put_block_group(cache); 2719 btrfs_put_block_group(cache);
2695 if (ret) 2720 if (ret)
@@ -2757,33 +2782,35 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
2757 int is_dev_replace) 2782 int is_dev_replace)
2758{ 2783{
2759 int ret = 0; 2784 int ret = 0;
2785 int flags = WQ_FREEZABLE | WQ_UNBOUND;
2786 int max_active = fs_info->thread_pool_size;
2760 2787
2761 if (fs_info->scrub_workers_refcnt == 0) { 2788 if (fs_info->scrub_workers_refcnt == 0) {
2762 if (is_dev_replace) 2789 if (is_dev_replace)
2763 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1, 2790 fs_info->scrub_workers =
2764 &fs_info->generic_worker); 2791 btrfs_alloc_workqueue("btrfs-scrub", flags,
2792 1, 4);
2765 else 2793 else
2766 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 2794 fs_info->scrub_workers =
2767 fs_info->thread_pool_size, 2795 btrfs_alloc_workqueue("btrfs-scrub", flags,
2768 &fs_info->generic_worker); 2796 max_active, 4);
2769 fs_info->scrub_workers.idle_thresh = 4; 2797 if (!fs_info->scrub_workers) {
2770 ret = btrfs_start_workers(&fs_info->scrub_workers); 2798 ret = -ENOMEM;
2771 if (ret)
2772 goto out; 2799 goto out;
2773 btrfs_init_workers(&fs_info->scrub_wr_completion_workers, 2800 }
2774 "scrubwrc", 2801 fs_info->scrub_wr_completion_workers =
2775 fs_info->thread_pool_size, 2802 btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
2776 &fs_info->generic_worker); 2803 max_active, 2);
2777 fs_info->scrub_wr_completion_workers.idle_thresh = 2; 2804 if (!fs_info->scrub_wr_completion_workers) {
2778 ret = btrfs_start_workers( 2805 ret = -ENOMEM;
2779 &fs_info->scrub_wr_completion_workers);
2780 if (ret)
2781 goto out; 2806 goto out;
2782 btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1, 2807 }
2783 &fs_info->generic_worker); 2808 fs_info->scrub_nocow_workers =
2784 ret = btrfs_start_workers(&fs_info->scrub_nocow_workers); 2809 btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
2785 if (ret) 2810 if (!fs_info->scrub_nocow_workers) {
2811 ret = -ENOMEM;
2786 goto out; 2812 goto out;
2813 }
2787 } 2814 }
2788 ++fs_info->scrub_workers_refcnt; 2815 ++fs_info->scrub_workers_refcnt;
2789out: 2816out:
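
Unlike btrfs_init_workers()/btrfs_start_workers(), allocating the new queues can fail outright, which is why every call site above gains an explicit NULL check. A lifecycle sketch, assuming the btrfs_alloc_workqueue(name, flags, max_active, thresh) signature used above; "btrfs-example" is a made-up queue name and the last argument feeds the new thresholding logic:

static int example_wq_setup(struct btrfs_fs_info *fs_info,
			    struct btrfs_workqueue **wq)
{
	*wq = btrfs_alloc_workqueue("btrfs-example",
				    WQ_FREEZABLE | WQ_UNBOUND,
				    fs_info->thread_pool_size, 4);
	return *wq ? 0 : -ENOMEM;
}

static void example_wq_teardown(struct btrfs_workqueue *wq)
{
	btrfs_destroy_workqueue(wq);	/* drains remaining work, then frees */
}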
@@ -2793,9 +2820,9 @@ out:
2793static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) 2820static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
2794{ 2821{
2795 if (--fs_info->scrub_workers_refcnt == 0) { 2822 if (--fs_info->scrub_workers_refcnt == 0) {
2796 btrfs_stop_workers(&fs_info->scrub_workers); 2823 btrfs_destroy_workqueue(fs_info->scrub_workers);
2797 btrfs_stop_workers(&fs_info->scrub_wr_completion_workers); 2824 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
2798 btrfs_stop_workers(&fs_info->scrub_nocow_workers); 2825 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
2799 } 2826 }
2800 WARN_ON(fs_info->scrub_workers_refcnt < 0); 2827 WARN_ON(fs_info->scrub_workers_refcnt < 0);
2801} 2828}
@@ -3106,10 +3133,10 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3106 nocow_ctx->len = len; 3133 nocow_ctx->len = len;
3107 nocow_ctx->mirror_num = mirror_num; 3134 nocow_ctx->mirror_num = mirror_num;
3108 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; 3135 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3109 nocow_ctx->work.func = copy_nocow_pages_worker; 3136 btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL);
3110 INIT_LIST_HEAD(&nocow_ctx->inodes); 3137 INIT_LIST_HEAD(&nocow_ctx->inodes);
3111 btrfs_queue_worker(&fs_info->scrub_nocow_workers, 3138 btrfs_queue_work(fs_info->scrub_nocow_workers,
3112 &nocow_ctx->work); 3139 &nocow_ctx->work);
3113 3140
3114 return 0; 3141 return 0;
3115} 3142}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 9dde9717c1b9..9b6da9d55f9a 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -51,15 +51,18 @@ struct fs_path {
51 struct { 51 struct {
52 char *start; 52 char *start;
53 char *end; 53 char *end;
54 char *prepared;
55 54
56 char *buf; 55 char *buf;
57 int buf_len; 56 unsigned short buf_len:15;
58 unsigned int reversed:1; 57 unsigned short reversed:1;
59 unsigned int virtual_mem:1;
60 char inline_buf[]; 58 char inline_buf[];
61 }; 59 };
62 char pad[PAGE_SIZE]; 60 /*
61 * Average path length does not exceed 200 bytes; we'll have
62 * better packing in the slab and a higher chance to satisfy
63 * an allocation later during send.
64 */
65 char pad[256];
63 }; 66 };
64}; 67};
65#define FS_PATH_INLINE_SIZE \ 68#define FS_PATH_INLINE_SIZE \
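
The reshaped struct keeps the same trick as before, just with a much smaller footprint: buf points either at the inline tail or at a heap buffer, and the union with pad pins sizeof(struct fs_path) at 256 bytes. A generic, userspace-compilable sketch of the pattern; the names are illustrative, not the btrfs ones, and the flexible array member inside a union member is a GCC extension, as in fs_path itself:

#include <stddef.h>

struct small_path {
	union {
		struct {
			char *buf;		/* inline_buf or a heap buffer */
			unsigned short buf_len:15;
			unsigned short reversed:1;
			char inline_buf[];	/* tail storage overlaying pad */
		};
		char pad[256];			/* fixes the object size */
	};
};

/* room left for inline storage inside the fixed-size object */
#define SMALL_PATH_INLINE_SIZE \
	(sizeof(struct small_path) - offsetof(struct small_path, inline_buf))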
@@ -109,6 +112,7 @@ struct send_ctx {
109 int cur_inode_deleted; 112 int cur_inode_deleted;
110 u64 cur_inode_size; 113 u64 cur_inode_size;
111 u64 cur_inode_mode; 114 u64 cur_inode_mode;
115 u64 cur_inode_rdev;
112 u64 cur_inode_last_extent; 116 u64 cur_inode_last_extent;
113 117
114 u64 send_progress; 118 u64 send_progress;
@@ -120,6 +124,8 @@ struct send_ctx {
120 struct list_head name_cache_list; 124 struct list_head name_cache_list;
121 int name_cache_size; 125 int name_cache_size;
122 126
127 struct file_ra_state ra;
128
123 char *read_buf; 129 char *read_buf;
124 130
125 /* 131 /*
@@ -175,6 +181,47 @@ struct send_ctx {
175 * own move/rename can be performed. 181 * own move/rename can be performed.
176 */ 182 */
177 struct rb_root waiting_dir_moves; 183 struct rb_root waiting_dir_moves;
184
185 /*
186 * A directory that is going to be rm'ed might have a child directory
187 * which is in the pending directory moves index above. In this case,
188 * the directory can only be removed after the move/rename of its child
189 * is performed. Example:
190 *
191 * Parent snapshot:
192 *
193 * . (ino 256)
194 * |-- a/ (ino 257)
195 * |-- b/ (ino 258)
196 * |-- c/ (ino 259)
197 * | |-- x/ (ino 260)
198 * |
199 * |-- y/ (ino 261)
200 *
201 * Send snapshot:
202 *
203 * . (ino 256)
204 * |-- a/ (ino 257)
205 * |-- b/ (ino 258)
206 * |-- YY/ (ino 261)
207 * |-- x/ (ino 260)
208 *
209 * Sequence of steps that lead to the send snapshot:
210 * rm -f /a/b/c/foo.txt
211 * mv /a/b/y /a/b/YY
212 * mv /a/b/c/x /a/b/YY
213 * rmdir /a/b/c
214 *
215 * When the child is processed, its move/rename is delayed until its
216 * parent is processed (as explained above), but all other operations
217 * like updating utimes, chown, chgrp, etc., are performed, and the paths
218 * that it uses for those operations must use the orphanized name of
219 * its parent (the directory we're going to rm later), so we need to
220 * memorize that name.
221 *
222 * Indexed by the inode number of the directory to be deleted.
223 */
224 struct rb_root orphan_dirs;
178}; 225};
179 226
180struct pending_dir_move { 227struct pending_dir_move {
@@ -189,6 +236,18 @@ struct pending_dir_move {
189struct waiting_dir_move { 236struct waiting_dir_move {
190 struct rb_node node; 237 struct rb_node node;
191 u64 ino; 238 u64 ino;
239 /*
240 * Some directory might not have been removable because it was
241 * waiting for this directory inode to be moved first. Therefore,
242 * after this directory is moved, we can try to rmdir the inode rmdir_ino.
243 */
244 u64 rmdir_ino;
245};
246
247struct orphan_dir_info {
248 struct rb_node node;
249 u64 ino;
250 u64 gen;
192}; 251};
193 252
194struct name_cache_entry { 253struct name_cache_entry {
@@ -214,6 +273,11 @@ struct name_cache_entry {
214 273
215static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); 274static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
216 275
276static struct waiting_dir_move *
277get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
278
279static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino);
280
217static int need_send_hole(struct send_ctx *sctx) 281static int need_send_hole(struct send_ctx *sctx)
218{ 282{
219 return (sctx->parent_root && !sctx->cur_inode_new && 283 return (sctx->parent_root && !sctx->cur_inode_new &&
@@ -242,7 +306,6 @@ static struct fs_path *fs_path_alloc(void)
242 if (!p) 306 if (!p)
243 return NULL; 307 return NULL;
244 p->reversed = 0; 308 p->reversed = 0;
245 p->virtual_mem = 0;
246 p->buf = p->inline_buf; 309 p->buf = p->inline_buf;
247 p->buf_len = FS_PATH_INLINE_SIZE; 310 p->buf_len = FS_PATH_INLINE_SIZE;
248 fs_path_reset(p); 311 fs_path_reset(p);
@@ -265,12 +328,8 @@ static void fs_path_free(struct fs_path *p)
265{ 328{
266 if (!p) 329 if (!p)
267 return; 330 return;
268 if (p->buf != p->inline_buf) { 331 if (p->buf != p->inline_buf)
269 if (p->virtual_mem) 332 kfree(p->buf);
270 vfree(p->buf);
271 else
272 kfree(p->buf);
273 }
274 kfree(p); 333 kfree(p);
275} 334}
276 335
@@ -292,40 +351,23 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
292 351
293 path_len = p->end - p->start; 352 path_len = p->end - p->start;
294 old_buf_len = p->buf_len; 353 old_buf_len = p->buf_len;
295 len = PAGE_ALIGN(len); 354
296 355 /*
297 if (p->buf == p->inline_buf) { 356 * First time the inline_buf does not suffice
298 tmp_buf = kmalloc(len, GFP_NOFS | __GFP_NOWARN); 357 */
299 if (!tmp_buf) { 358 if (p->buf == p->inline_buf)
300 tmp_buf = vmalloc(len); 359 tmp_buf = kmalloc(len, GFP_NOFS);
301 if (!tmp_buf) 360 else
302 return -ENOMEM; 361 tmp_buf = krealloc(p->buf, len, GFP_NOFS);
303 p->virtual_mem = 1; 362 if (!tmp_buf)
304 } 363 return -ENOMEM;
305 memcpy(tmp_buf, p->buf, p->buf_len); 364 p->buf = tmp_buf;
306 p->buf = tmp_buf; 365 /*
307 p->buf_len = len; 366 * The real size of the buffer is bigger; this will let the fast
308 } else { 367 * path happen most of the time.
309 if (p->virtual_mem) { 368 */
310 tmp_buf = vmalloc(len); 369 p->buf_len = ksize(p->buf);
311 if (!tmp_buf) 370
312 return -ENOMEM;
313 memcpy(tmp_buf, p->buf, p->buf_len);
314 vfree(p->buf);
315 } else {
316 tmp_buf = krealloc(p->buf, len, GFP_NOFS);
317 if (!tmp_buf) {
318 tmp_buf = vmalloc(len);
319 if (!tmp_buf)
320 return -ENOMEM;
321 memcpy(tmp_buf, p->buf, p->buf_len);
322 kfree(p->buf);
323 p->virtual_mem = 1;
324 }
325 }
326 p->buf = tmp_buf;
327 p->buf_len = len;
328 }
329 if (p->reversed) { 371 if (p->reversed) {
330 tmp_buf = p->buf + old_buf_len - path_len - 1; 372 tmp_buf = p->buf + old_buf_len - path_len - 1;
331 p->end = p->buf + p->buf_len - 1; 373 p->end = p->buf + p->buf_len - 1;
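
Recording ksize() instead of the requested length is what makes the fast path cheap: slab objects are rounded up, so later small growth requests often fit in the slack without another allocation. An illustrative sketch:

static char *grow_buf(char *buf, size_t *cap, size_t want)
{
	char *tmp;

	if (want <= *cap)
		return buf;		/* fast path: slab slack absorbs it */
	tmp = krealloc(buf, want, GFP_NOFS);
	if (!tmp)
		return NULL;
	*cap = ksize(tmp);		/* real object size, often > want */
	return tmp;
}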
@@ -338,7 +380,8 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
338 return 0; 380 return 0;
339} 381}
340 382
341static int fs_path_prepare_for_add(struct fs_path *p, int name_len) 383static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
384 char **prepared)
342{ 385{
343 int ret; 386 int ret;
344 int new_len; 387 int new_len;
@@ -354,11 +397,11 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len)
354 if (p->start != p->end) 397 if (p->start != p->end)
355 *--p->start = '/'; 398 *--p->start = '/';
356 p->start -= name_len; 399 p->start -= name_len;
357 p->prepared = p->start; 400 *prepared = p->start;
358 } else { 401 } else {
359 if (p->start != p->end) 402 if (p->start != p->end)
360 *p->end++ = '/'; 403 *p->end++ = '/';
361 p->prepared = p->end; 404 *prepared = p->end;
362 p->end += name_len; 405 p->end += name_len;
363 *p->end = 0; 406 *p->end = 0;
364 } 407 }
@@ -370,12 +413,12 @@ out:
370static int fs_path_add(struct fs_path *p, const char *name, int name_len) 413static int fs_path_add(struct fs_path *p, const char *name, int name_len)
371{ 414{
372 int ret; 415 int ret;
416 char *prepared;
373 417
374 ret = fs_path_prepare_for_add(p, name_len); 418 ret = fs_path_prepare_for_add(p, name_len, &prepared);
375 if (ret < 0) 419 if (ret < 0)
376 goto out; 420 goto out;
377 memcpy(p->prepared, name, name_len); 421 memcpy(prepared, name, name_len);
378 p->prepared = NULL;
379 422
380out: 423out:
381 return ret; 424 return ret;
@@ -384,12 +427,12 @@ out:
384static int fs_path_add_path(struct fs_path *p, struct fs_path *p2) 427static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
385{ 428{
386 int ret; 429 int ret;
430 char *prepared;
387 431
388 ret = fs_path_prepare_for_add(p, p2->end - p2->start); 432 ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
389 if (ret < 0) 433 if (ret < 0)
390 goto out; 434 goto out;
391 memcpy(p->prepared, p2->start, p2->end - p2->start); 435 memcpy(prepared, p2->start, p2->end - p2->start);
392 p->prepared = NULL;
393 436
394out: 437out:
395 return ret; 438 return ret;
@@ -400,13 +443,13 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p,
400 unsigned long off, int len) 443 unsigned long off, int len)
401{ 444{
402 int ret; 445 int ret;
446 char *prepared;
403 447
404 ret = fs_path_prepare_for_add(p, len); 448 ret = fs_path_prepare_for_add(p, len, &prepared);
405 if (ret < 0) 449 if (ret < 0)
406 goto out; 450 goto out;
407 451
408 read_extent_buffer(eb, p->prepared, off, len); 452 read_extent_buffer(eb, prepared, off, len);
409 p->prepared = NULL;
410 453
411out: 454out:
412 return ret; 455 return ret;
@@ -915,9 +958,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
915 struct btrfs_dir_item *di; 958 struct btrfs_dir_item *di;
916 struct btrfs_key di_key; 959 struct btrfs_key di_key;
917 char *buf = NULL; 960 char *buf = NULL;
918 char *buf2 = NULL; 961 const int buf_len = PATH_MAX;
919 int buf_len;
920 int buf_virtual = 0;
921 u32 name_len; 962 u32 name_len;
922 u32 data_len; 963 u32 data_len;
923 u32 cur; 964 u32 cur;
@@ -927,7 +968,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
927 int num; 968 int num;
928 u8 type; 969 u8 type;
929 970
930 buf_len = PAGE_SIZE;
931 buf = kmalloc(buf_len, GFP_NOFS); 971 buf = kmalloc(buf_len, GFP_NOFS);
932 if (!buf) { 972 if (!buf) {
933 ret = -ENOMEM; 973 ret = -ENOMEM;
@@ -949,30 +989,12 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
949 type = btrfs_dir_type(eb, di); 989 type = btrfs_dir_type(eb, di);
950 btrfs_dir_item_key_to_cpu(eb, di, &di_key); 990 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
951 991
992 /*
993 * Path too long
994 */
952 if (name_len + data_len > buf_len) { 995 if (name_len + data_len > buf_len) {
953 buf_len = PAGE_ALIGN(name_len + data_len); 996 ret = -ENAMETOOLONG;
954 if (buf_virtual) { 997 goto out;
955 buf2 = vmalloc(buf_len);
956 if (!buf2) {
957 ret = -ENOMEM;
958 goto out;
959 }
960 vfree(buf);
961 } else {
962 buf2 = krealloc(buf, buf_len, GFP_NOFS);
963 if (!buf2) {
964 buf2 = vmalloc(buf_len);
965 if (!buf2) {
966 ret = -ENOMEM;
967 goto out;
968 }
969 kfree(buf);
970 buf_virtual = 1;
971 }
972 }
973
974 buf = buf2;
975 buf2 = NULL;
976 } 998 }
977 999
978 read_extent_buffer(eb, buf, (unsigned long)(di + 1), 1000 read_extent_buffer(eb, buf, (unsigned long)(di + 1),
@@ -995,10 +1017,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
995 } 1017 }
996 1018
997out: 1019out:
998 if (buf_virtual) 1020 kfree(buf);
999 vfree(buf);
1000 else
1001 kfree(buf);
1002 return ret; 1021 return ret;
1003} 1022}
1004 1023
@@ -1292,8 +1311,6 @@ static int find_extent_clone(struct send_ctx *sctx,
1292 extent_item_pos = logical - found_key.objectid; 1311 extent_item_pos = logical - found_key.objectid;
1293 else 1312 else
1294 extent_item_pos = 0; 1313 extent_item_pos = 0;
1295
1296 extent_item_pos = logical - found_key.objectid;
1297 ret = iterate_extent_inodes(sctx->send_root->fs_info, 1314 ret = iterate_extent_inodes(sctx->send_root->fs_info,
1298 found_key.objectid, extent_item_pos, 1, 1315 found_key.objectid, extent_item_pos, 1,
1299 __iterate_backrefs, backref_ctx); 1316 __iterate_backrefs, backref_ctx);
@@ -1418,11 +1435,7 @@ static int gen_unique_name(struct send_ctx *sctx,
1418 while (1) { 1435 while (1) {
1419 len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", 1436 len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
1420 ino, gen, idx); 1437 ino, gen, idx);
1421 if (len >= sizeof(tmp)) { 1438 ASSERT(len < sizeof(tmp));
1422 /* should really not happen */
1423 ret = -EOVERFLOW;
1424 goto out;
1425 }
1426 1439
1427 di = btrfs_lookup_dir_item(NULL, sctx->send_root, 1440 di = btrfs_lookup_dir_item(NULL, sctx->send_root,
1428 path, BTRFS_FIRST_FREE_OBJECTID, 1441 path, BTRFS_FIRST_FREE_OBJECTID,
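
Why the -EOVERFLOW branch can become an ASSERT (assuming tmp remains a 64-byte buffer in the surrounding function, which is not visible in this hunk): a u64 renders as at most 20 decimal digits, so the longest orphan name "o<ino>-<gen>-<idx>" is 1 + 20 + 1 + 20 + 1 + 20 = 63 characters, and with the NUL terminator it exactly fits 64 bytes; len >= sizeof(tmp) therefore cannot happen.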
@@ -1898,13 +1911,20 @@ static void name_cache_delete(struct send_ctx *sctx,
1898 1911
1899 nce_head = radix_tree_lookup(&sctx->name_cache, 1912 nce_head = radix_tree_lookup(&sctx->name_cache,
1900 (unsigned long)nce->ino); 1913 (unsigned long)nce->ino);
1901 BUG_ON(!nce_head); 1914 if (!nce_head) {
1915 btrfs_err(sctx->send_root->fs_info,
1916 "name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
1917 nce->ino, sctx->name_cache_size);
1918 }
1902 1919
1903 list_del(&nce->radix_list); 1920 list_del(&nce->radix_list);
1904 list_del(&nce->list); 1921 list_del(&nce->list);
1905 sctx->name_cache_size--; 1922 sctx->name_cache_size--;
1906 1923
1907 if (list_empty(nce_head)) { 1924 /*
1925 * We may not get to the final release of nce_head if the lookup fails
1926 */
1927 if (nce_head && list_empty(nce_head)) {
1908 radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); 1928 radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
1909 kfree(nce_head); 1929 kfree(nce_head);
1910 } 1930 }
@@ -1977,7 +1997,6 @@ static void name_cache_free(struct send_ctx *sctx)
1977 */ 1997 */
1978static int __get_cur_name_and_parent(struct send_ctx *sctx, 1998static int __get_cur_name_and_parent(struct send_ctx *sctx,
1979 u64 ino, u64 gen, 1999 u64 ino, u64 gen,
1980 int skip_name_cache,
1981 u64 *parent_ino, 2000 u64 *parent_ino,
1982 u64 *parent_gen, 2001 u64 *parent_gen,
1983 struct fs_path *dest) 2002 struct fs_path *dest)
@@ -1987,8 +2006,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1987 struct btrfs_path *path = NULL; 2006 struct btrfs_path *path = NULL;
1988 struct name_cache_entry *nce = NULL; 2007 struct name_cache_entry *nce = NULL;
1989 2008
1990 if (skip_name_cache)
1991 goto get_ref;
1992 /* 2009 /*
1993 * First check if we already did a call to this function with the same 2010 * First check if we already did a call to this function with the same
1994 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes 2011 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
@@ -2033,12 +2050,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
2033 goto out_cache; 2050 goto out_cache;
2034 } 2051 }
2035 2052
2036get_ref:
2037 /* 2053 /*
2038 * Depending on whether the inode was already processed or not, use 2054 * Depending on whether the inode was already processed or not, use
2039 * send_root or parent_root for ref lookup. 2055 * send_root or parent_root for ref lookup.
2040 */ 2056 */
2041 if (ino < sctx->send_progress && !skip_name_cache) 2057 if (ino < sctx->send_progress)
2042 ret = get_first_ref(sctx->send_root, ino, 2058 ret = get_first_ref(sctx->send_root, ino,
2043 parent_ino, parent_gen, dest); 2059 parent_ino, parent_gen, dest);
2044 else 2060 else
@@ -2062,8 +2078,6 @@ get_ref:
2062 goto out; 2078 goto out;
2063 ret = 1; 2079 ret = 1;
2064 } 2080 }
2065 if (skip_name_cache)
2066 goto out;
2067 2081
2068out_cache: 2082out_cache:
2069 /* 2083 /*
@@ -2131,9 +2145,6 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2131 u64 parent_inode = 0; 2145 u64 parent_inode = 0;
2132 u64 parent_gen = 0; 2146 u64 parent_gen = 0;
2133 int stop = 0; 2147 int stop = 0;
2134 u64 start_ino = ino;
2135 u64 start_gen = gen;
2136 int skip_name_cache = 0;
2137 2148
2138 name = fs_path_alloc(); 2149 name = fs_path_alloc();
2139 if (!name) { 2150 if (!name) {
@@ -2141,31 +2152,33 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2141 goto out; 2152 goto out;
2142 } 2153 }
2143 2154
2144 if (is_waiting_for_move(sctx, ino))
2145 skip_name_cache = 1;
2146
2147again:
2148 dest->reversed = 1; 2155 dest->reversed = 1;
2149 fs_path_reset(dest); 2156 fs_path_reset(dest);
2150 2157
2151 while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { 2158 while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
2152 fs_path_reset(name); 2159 fs_path_reset(name);
2153 2160
2154 ret = __get_cur_name_and_parent(sctx, ino, gen, skip_name_cache, 2161 if (is_waiting_for_rm(sctx, ino)) {
2155 &parent_inode, &parent_gen, name); 2162 ret = gen_unique_name(sctx, ino, gen, name);
2163 if (ret < 0)
2164 goto out;
2165 ret = fs_path_add_path(dest, name);
2166 break;
2167 }
2168
2169 if (is_waiting_for_move(sctx, ino)) {
2170 ret = get_first_ref(sctx->parent_root, ino,
2171 &parent_inode, &parent_gen, name);
2172 } else {
2173 ret = __get_cur_name_and_parent(sctx, ino, gen,
2174 &parent_inode,
2175 &parent_gen, name);
2176 if (ret)
2177 stop = 1;
2178 }
2179
2156 if (ret < 0) 2180 if (ret < 0)
2157 goto out; 2181 goto out;
2158 if (ret)
2159 stop = 1;
2160
2161 if (!skip_name_cache &&
2162 is_waiting_for_move(sctx, parent_inode)) {
2163 ino = start_ino;
2164 gen = start_gen;
2165 stop = 0;
2166 skip_name_cache = 1;
2167 goto again;
2168 }
2169 2182
2170 ret = fs_path_add_path(dest, name); 2183 ret = fs_path_add_path(dest, name);
2171 if (ret < 0) 2184 if (ret < 0)
@@ -2429,10 +2442,16 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
2429 if (!p) 2442 if (!p)
2430 return -ENOMEM; 2443 return -ENOMEM;
2431 2444
2432 ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL, 2445 if (ino != sctx->cur_ino) {
2433 NULL, &rdev); 2446 ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
2434 if (ret < 0) 2447 NULL, NULL, &rdev);
2435 goto out; 2448 if (ret < 0)
2449 goto out;
2450 } else {
2451 gen = sctx->cur_inode_gen;
2452 mode = sctx->cur_inode_mode;
2453 rdev = sctx->cur_inode_rdev;
2454 }
2436 2455
2437 if (S_ISREG(mode)) { 2456 if (S_ISREG(mode)) {
2438 cmd = BTRFS_SEND_C_MKFILE; 2457 cmd = BTRFS_SEND_C_MKFILE;
@@ -2512,17 +2531,26 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
2512 key.objectid = dir; 2531 key.objectid = dir;
2513 key.type = BTRFS_DIR_INDEX_KEY; 2532 key.type = BTRFS_DIR_INDEX_KEY;
2514 key.offset = 0; 2533 key.offset = 0;
2534 ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
2535 if (ret < 0)
2536 goto out;
2537
2515 while (1) { 2538 while (1) {
2516 ret = btrfs_search_slot_for_read(sctx->send_root, &key, path, 2539 eb = path->nodes[0];
2517 1, 0); 2540 slot = path->slots[0];
2518 if (ret < 0) 2541 if (slot >= btrfs_header_nritems(eb)) {
2519 goto out; 2542 ret = btrfs_next_leaf(sctx->send_root, path);
2520 if (!ret) { 2543 if (ret < 0) {
2521 eb = path->nodes[0]; 2544 goto out;
2522 slot = path->slots[0]; 2545 } else if (ret > 0) {
2523 btrfs_item_key_to_cpu(eb, &found_key, slot); 2546 ret = 0;
2547 break;
2548 }
2549 continue;
2524 } 2550 }
2525 if (ret || found_key.objectid != key.objectid || 2551
2552 btrfs_item_key_to_cpu(eb, &found_key, slot);
2553 if (found_key.objectid != key.objectid ||
2526 found_key.type != key.type) { 2554 found_key.type != key.type) {
2527 ret = 0; 2555 ret = 0;
2528 goto out; 2556 goto out;
@@ -2537,8 +2565,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
2537 goto out; 2565 goto out;
2538 } 2566 }
2539 2567
2540 key.offset = found_key.offset + 1; 2568 path->slots[0]++;
2541 btrfs_release_path(path);
2542 } 2569 }
2543 2570
2544out: 2571out:
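
The same rewrite recurs below in can_rmdir(), process_all_refs() and process_all_new_xattrs(): one btrfs_search_slot() up front, then advancing path->slots[0] and hopping leaves with btrfs_next_leaf(), instead of re-searching the whole tree for every item. A condensed sketch of the pattern, with the per-item processing elided:

static int walk_key_range(struct btrfs_root *root, struct btrfs_path *path,
			  struct btrfs_key *key)
{
	struct btrfs_key found_key;
	int ret;

	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;
	while (1) {
		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				return ret;	/* error */
			if (ret > 0)
				break;		/* ran out of leaves */
			continue;		/* re-check the new leaf */
		}
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		if (found_key.objectid != key->objectid ||
		    found_key.type != key->type)
			break;			/* walked past our range */
		/* ... process the item at path->slots[0] ... */
		path->slots[0]++;
	}
	return 0;
}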
@@ -2590,7 +2617,7 @@ struct recorded_ref {
2590 * everything mixed. So we first record all refs and later process them. 2617 * everything mixed. So we first record all refs and later process them.
2591 * This function is a helper to record one ref. 2618 * This function is a helper to record one ref.
2592 */ 2619 */
2593static int record_ref(struct list_head *head, u64 dir, 2620static int __record_ref(struct list_head *head, u64 dir,
2594 u64 dir_gen, struct fs_path *path) 2621 u64 dir_gen, struct fs_path *path)
2595{ 2622{
2596 struct recorded_ref *ref; 2623 struct recorded_ref *ref;
@@ -2676,12 +2703,78 @@ out:
2676 return ret; 2703 return ret;
2677} 2704}
2678 2705
2706static struct orphan_dir_info *
2707add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
2708{
2709 struct rb_node **p = &sctx->orphan_dirs.rb_node;
2710 struct rb_node *parent = NULL;
2711 struct orphan_dir_info *entry, *odi;
2712
2713 odi = kmalloc(sizeof(*odi), GFP_NOFS);
2714 if (!odi)
2715 return ERR_PTR(-ENOMEM);
2716 odi->ino = dir_ino;
2717 odi->gen = 0;
2718
2719 while (*p) {
2720 parent = *p;
2721 entry = rb_entry(parent, struct orphan_dir_info, node);
2722 if (dir_ino < entry->ino) {
2723 p = &(*p)->rb_left;
2724 } else if (dir_ino > entry->ino) {
2725 p = &(*p)->rb_right;
2726 } else {
2727 kfree(odi);
2728 return entry;
2729 }
2730 }
2731
2732 rb_link_node(&odi->node, parent, p);
2733 rb_insert_color(&odi->node, &sctx->orphan_dirs);
2734 return odi;
2735}
2736
2737static struct orphan_dir_info *
2738get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
2739{
2740 struct rb_node *n = sctx->orphan_dirs.rb_node;
2741 struct orphan_dir_info *entry;
2742
2743 while (n) {
2744 entry = rb_entry(n, struct orphan_dir_info, node);
2745 if (dir_ino < entry->ino)
2746 n = n->rb_left;
2747 else if (dir_ino > entry->ino)
2748 n = n->rb_right;
2749 else
2750 return entry;
2751 }
2752 return NULL;
2753}
2754
2755static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino)
2756{
2757 struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino);
2758
2759 return odi != NULL;
2760}
2761
2762static void free_orphan_dir_info(struct send_ctx *sctx,
2763 struct orphan_dir_info *odi)
2764{
2765 if (!odi)
2766 return;
2767 rb_erase(&odi->node, &sctx->orphan_dirs);
2768 kfree(odi);
2769}
2770
2679/* 2771/*
2680 * Returns 1 if a directory can be removed at this point in time. 2772 * Returns 1 if a directory can be removed at this point in time.
2681 * We check this by iterating all dir items and checking if the inode behind 2773 * We check this by iterating all dir items and checking if the inode behind
2682 * the dir item was already processed. 2774 * the dir item was already processed.
2683 */ 2775 */
2684static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress) 2776static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
2777 u64 send_progress)
2685{ 2778{
2686 int ret = 0; 2779 int ret = 0;
2687 struct btrfs_root *root = sctx->parent_root; 2780 struct btrfs_root *root = sctx->parent_root;
@@ -2704,31 +2797,52 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
2704 key.objectid = dir; 2797 key.objectid = dir;
2705 key.type = BTRFS_DIR_INDEX_KEY; 2798 key.type = BTRFS_DIR_INDEX_KEY;
2706 key.offset = 0; 2799 key.offset = 0;
2800 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2801 if (ret < 0)
2802 goto out;
2707 2803
2708 while (1) { 2804 while (1) {
2709 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 2805 struct waiting_dir_move *dm;
2710 if (ret < 0) 2806
2711 goto out; 2807 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2712 if (!ret) { 2808 ret = btrfs_next_leaf(root, path);
2713 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2809 if (ret < 0)
2714 path->slots[0]); 2810 goto out;
2811 else if (ret > 0)
2812 break;
2813 continue;
2715 } 2814 }
2716 if (ret || found_key.objectid != key.objectid || 2815 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2717 found_key.type != key.type) { 2816 path->slots[0]);
2817 if (found_key.objectid != key.objectid ||
2818 found_key.type != key.type)
2718 break; 2819 break;
2719 }
2720 2820
2721 di = btrfs_item_ptr(path->nodes[0], path->slots[0], 2821 di = btrfs_item_ptr(path->nodes[0], path->slots[0],
2722 struct btrfs_dir_item); 2822 struct btrfs_dir_item);
2723 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); 2823 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
2724 2824
2825 dm = get_waiting_dir_move(sctx, loc.objectid);
2826 if (dm) {
2827 struct orphan_dir_info *odi;
2828
2829 odi = add_orphan_dir_info(sctx, dir);
2830 if (IS_ERR(odi)) {
2831 ret = PTR_ERR(odi);
2832 goto out;
2833 }
2834 odi->gen = dir_gen;
2835 dm->rmdir_ino = dir;
2836 ret = 0;
2837 goto out;
2838 }
2839
2725 if (loc.objectid > send_progress) { 2840 if (loc.objectid > send_progress) {
2726 ret = 0; 2841 ret = 0;
2727 goto out; 2842 goto out;
2728 } 2843 }
2729 2844
2730 btrfs_release_path(path); 2845 path->slots[0]++;
2731 key.offset = found_key.offset + 1;
2732 } 2846 }
2733 2847
2734 ret = 1; 2848 ret = 1;
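
Condensed, the deferral dance added above and completed in apply_dir_move() further down looks like this; a sketch stitched together from the two call sites, not a literal quote:

	/* can_rmdir(): a child of 'dir' is still waiting to be moved */
	odi = add_orphan_dir_info(sctx, dir);	/* remember the directory */
	if (IS_ERR(odi))
		return PTR_ERR(odi);
	odi->gen = dir_gen;
	dm->rmdir_ino = dir;		/* link it to the blocking move */

	/* apply_dir_move(): the blocking move has just been sent */
	ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1);
	if (ret > 0) {			/* nothing blocks the rmdir anymore */
		ret = get_cur_path(sctx, rmdir_ino, odi->gen, name);
		if (!ret)
			ret = send_rmdir(sctx, name);
		if (!ret)
			free_orphan_dir_info(sctx, odi);
	}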
@@ -2740,19 +2854,9 @@ out:
2740 2854
2741static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) 2855static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
2742{ 2856{
2743 struct rb_node *n = sctx->waiting_dir_moves.rb_node; 2857 struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);
2744 struct waiting_dir_move *entry;
2745 2858
2746 while (n) { 2859 return entry != NULL;
2747 entry = rb_entry(n, struct waiting_dir_move, node);
2748 if (ino < entry->ino)
2749 n = n->rb_left;
2750 else if (ino > entry->ino)
2751 n = n->rb_right;
2752 else
2753 return 1;
2754 }
2755 return 0;
2756} 2860}
2757 2861
2758static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) 2862static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
@@ -2765,6 +2869,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
2765 if (!dm) 2869 if (!dm)
2766 return -ENOMEM; 2870 return -ENOMEM;
2767 dm->ino = ino; 2871 dm->ino = ino;
2872 dm->rmdir_ino = 0;
2768 2873
2769 while (*p) { 2874 while (*p) {
2770 parent = *p; 2875 parent = *p;
@@ -2784,31 +2889,41 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
2784 return 0; 2889 return 0;
2785} 2890}
2786 2891
2787static int del_waiting_dir_move(struct send_ctx *sctx, u64 ino) 2892static struct waiting_dir_move *
2893get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
2788{ 2894{
2789 struct rb_node *n = sctx->waiting_dir_moves.rb_node; 2895 struct rb_node *n = sctx->waiting_dir_moves.rb_node;
2790 struct waiting_dir_move *entry; 2896 struct waiting_dir_move *entry;
2791 2897
2792 while (n) { 2898 while (n) {
2793 entry = rb_entry(n, struct waiting_dir_move, node); 2899 entry = rb_entry(n, struct waiting_dir_move, node);
2794 if (ino < entry->ino) { 2900 if (ino < entry->ino)
2795 n = n->rb_left; 2901 n = n->rb_left;
2796 } else if (ino > entry->ino) { 2902 else if (ino > entry->ino)
2797 n = n->rb_right; 2903 n = n->rb_right;
2798 } else { 2904 else
2799 rb_erase(&entry->node, &sctx->waiting_dir_moves); 2905 return entry;
2800 kfree(entry);
2801 return 0;
2802 }
2803 } 2906 }
2804 return -ENOENT; 2907 return NULL;
2908}
2909
2910static void free_waiting_dir_move(struct send_ctx *sctx,
2911 struct waiting_dir_move *dm)
2912{
2913 if (!dm)
2914 return;
2915 rb_erase(&dm->node, &sctx->waiting_dir_moves);
2916 kfree(dm);
2805} 2917}
2806 2918
2807static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino) 2919static int add_pending_dir_move(struct send_ctx *sctx,
2920 u64 ino,
2921 u64 ino_gen,
2922 u64 parent_ino)
2808{ 2923{
2809 struct rb_node **p = &sctx->pending_dir_moves.rb_node; 2924 struct rb_node **p = &sctx->pending_dir_moves.rb_node;
2810 struct rb_node *parent = NULL; 2925 struct rb_node *parent = NULL;
2811 struct pending_dir_move *entry, *pm; 2926 struct pending_dir_move *entry = NULL, *pm;
2812 struct recorded_ref *cur; 2927 struct recorded_ref *cur;
2813 int exists = 0; 2928 int exists = 0;
2814 int ret; 2929 int ret;
@@ -2817,8 +2932,8 @@ static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino)
2817 if (!pm) 2932 if (!pm)
2818 return -ENOMEM; 2933 return -ENOMEM;
2819 pm->parent_ino = parent_ino; 2934 pm->parent_ino = parent_ino;
2820 pm->ino = sctx->cur_ino; 2935 pm->ino = ino;
2821 pm->gen = sctx->cur_inode_gen; 2936 pm->gen = ino_gen;
2822 INIT_LIST_HEAD(&pm->list); 2937 INIT_LIST_HEAD(&pm->list);
2823 INIT_LIST_HEAD(&pm->update_refs); 2938 INIT_LIST_HEAD(&pm->update_refs);
2824 RB_CLEAR_NODE(&pm->node); 2939 RB_CLEAR_NODE(&pm->node);
@@ -2888,19 +3003,52 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2888{ 3003{
2889 struct fs_path *from_path = NULL; 3004 struct fs_path *from_path = NULL;
2890 struct fs_path *to_path = NULL; 3005 struct fs_path *to_path = NULL;
3006 struct fs_path *name = NULL;
2891 u64 orig_progress = sctx->send_progress; 3007 u64 orig_progress = sctx->send_progress;
2892 struct recorded_ref *cur; 3008 struct recorded_ref *cur;
3009 u64 parent_ino, parent_gen;
3010 struct waiting_dir_move *dm = NULL;
3011 u64 rmdir_ino = 0;
2893 int ret; 3012 int ret;
2894 3013
3014 name = fs_path_alloc();
2895 from_path = fs_path_alloc(); 3015 from_path = fs_path_alloc();
2896 if (!from_path) 3016 if (!name || !from_path) {
2897 return -ENOMEM; 3017 ret = -ENOMEM;
3018 goto out;
3019 }
2898 3020
2899 sctx->send_progress = pm->ino; 3021 dm = get_waiting_dir_move(sctx, pm->ino);
2900 ret = get_cur_path(sctx, pm->ino, pm->gen, from_path); 3022 ASSERT(dm);
3023 rmdir_ino = dm->rmdir_ino;
3024 free_waiting_dir_move(sctx, dm);
3025
3026 ret = get_first_ref(sctx->parent_root, pm->ino,
3027 &parent_ino, &parent_gen, name);
2901 if (ret < 0) 3028 if (ret < 0)
2902 goto out; 3029 goto out;
2903 3030
3031 if (parent_ino == sctx->cur_ino) {
3032 /* child only renamed, not moved */
3033 ASSERT(parent_gen == sctx->cur_inode_gen);
3034 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
3035 from_path);
3036 if (ret < 0)
3037 goto out;
3038 ret = fs_path_add_path(from_path, name);
3039 if (ret < 0)
3040 goto out;
3041 } else {
3042 /* child moved and maybe renamed too */
3043 sctx->send_progress = pm->ino;
3044 ret = get_cur_path(sctx, pm->ino, pm->gen, from_path);
3045 if (ret < 0)
3046 goto out;
3047 }
3048
3049 fs_path_free(name);
3050 name = NULL;
3051
2904 to_path = fs_path_alloc(); 3052 to_path = fs_path_alloc();
2905 if (!to_path) { 3053 if (!to_path) {
2906 ret = -ENOMEM; 3054 ret = -ENOMEM;
@@ -2908,9 +3056,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2908 } 3056 }
2909 3057
2910 sctx->send_progress = sctx->cur_ino + 1; 3058 sctx->send_progress = sctx->cur_ino + 1;
2911 ret = del_waiting_dir_move(sctx, pm->ino);
2912 ASSERT(ret == 0);
2913
2914 ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); 3059 ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
2915 if (ret < 0) 3060 if (ret < 0)
2916 goto out; 3061 goto out;
@@ -2919,6 +3064,35 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2919 if (ret < 0) 3064 if (ret < 0)
2920 goto out; 3065 goto out;
2921 3066
3067 if (rmdir_ino) {
3068 struct orphan_dir_info *odi;
3069
3070 odi = get_orphan_dir_info(sctx, rmdir_ino);
3071 if (!odi) {
3072 /* already deleted */
3073 goto finish;
3074 }
3075 ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1);
3076 if (ret < 0)
3077 goto out;
3078 if (!ret)
3079 goto finish;
3080
3081 name = fs_path_alloc();
3082 if (!name) {
3083 ret = -ENOMEM;
3084 goto out;
3085 }
3086 ret = get_cur_path(sctx, rmdir_ino, odi->gen, name);
3087 if (ret < 0)
3088 goto out;
3089 ret = send_rmdir(sctx, name);
3090 if (ret < 0)
3091 goto out;
3092 free_orphan_dir_info(sctx, odi);
3093 }
3094
3095finish:
2922 ret = send_utimes(sctx, pm->ino, pm->gen); 3096 ret = send_utimes(sctx, pm->ino, pm->gen);
2923 if (ret < 0) 3097 if (ret < 0)
2924 goto out; 3098 goto out;
@@ -2928,12 +3102,15 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2928 * and old parent(s). 3102 * and old parent(s).
2929 */ 3103 */
2930 list_for_each_entry(cur, &pm->update_refs, list) { 3104 list_for_each_entry(cur, &pm->update_refs, list) {
3105 if (cur->dir == rmdir_ino)
3106 continue;
2931 ret = send_utimes(sctx, cur->dir, cur->dir_gen); 3107 ret = send_utimes(sctx, cur->dir, cur->dir_gen);
2932 if (ret < 0) 3108 if (ret < 0)
2933 goto out; 3109 goto out;
2934 } 3110 }
2935 3111
2936out: 3112out:
3113 fs_path_free(name);
2937 fs_path_free(from_path); 3114 fs_path_free(from_path);
2938 fs_path_free(to_path); 3115 fs_path_free(to_path);
2939 sctx->send_progress = orig_progress; 3116 sctx->send_progress = orig_progress;
@@ -3005,17 +3182,19 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3005 int ret; 3182 int ret;
3006 u64 ino = parent_ref->dir; 3183 u64 ino = parent_ref->dir;
3007 u64 parent_ino_before, parent_ino_after; 3184 u64 parent_ino_before, parent_ino_after;
3008 u64 new_gen, old_gen; 3185 u64 old_gen;
3009 struct fs_path *path_before = NULL; 3186 struct fs_path *path_before = NULL;
3010 struct fs_path *path_after = NULL; 3187 struct fs_path *path_after = NULL;
3011 int len1, len2; 3188 int len1, len2;
3012 3189 int register_upper_dirs;
3013 if (parent_ref->dir <= sctx->cur_ino) 3190 u64 gen;
3014 return 0;
3015 3191
3016 if (is_waiting_for_move(sctx, ino)) 3192 if (is_waiting_for_move(sctx, ino))
3017 return 1; 3193 return 1;
3018 3194
3195 if (parent_ref->dir <= sctx->cur_ino)
3196 return 0;
3197
3019 ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen, 3198 ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen,
3020 NULL, NULL, NULL, NULL); 3199 NULL, NULL, NULL, NULL);
3021 if (ret == -ENOENT) 3200 if (ret == -ENOENT)
@@ -3023,12 +3202,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3023 else if (ret < 0) 3202 else if (ret < 0)
3024 return ret; 3203 return ret;
3025 3204
3026 ret = get_inode_info(sctx->send_root, ino, NULL, &new_gen, 3205 if (parent_ref->dir_gen != old_gen)
3027 NULL, NULL, NULL, NULL);
3028 if (ret < 0)
3029 return ret;
3030
3031 if (new_gen != old_gen)
3032 return 0; 3206 return 0;
3033 3207
3034 path_before = fs_path_alloc(); 3208 path_before = fs_path_alloc();
@@ -3051,7 +3225,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3051 } 3225 }
3052 3226
3053 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, 3227 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
3054 NULL, path_after); 3228 &gen, path_after);
3055 if (ret == -ENOENT) { 3229 if (ret == -ENOENT) {
3056 ret = 0; 3230 ret = 0;
3057 goto out; 3231 goto out;
@@ -3061,13 +3235,67 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3061 3235
3062 len1 = fs_path_len(path_before); 3236 len1 = fs_path_len(path_before);
3063 len2 = fs_path_len(path_after); 3237 len2 = fs_path_len(path_after);
3064 if ((parent_ino_before != parent_ino_after) && (len1 != len2 || 3238 if (parent_ino_before != parent_ino_after || len1 != len2 ||
3065 memcmp(path_before->start, path_after->start, len1))) { 3239 memcmp(path_before->start, path_after->start, len1)) {
3066 ret = 1; 3240 ret = 1;
3067 goto out; 3241 goto out;
3068 } 3242 }
3069 ret = 0; 3243 ret = 0;
3070 3244
3245 /*
3246 * Ok, our new most direct ancestor has a higher inode number but
3247 * wasn't moved/renamed. So maybe some of the new ancestors higher in
3248 * the hierarchy have a higher inode number too *and* were renamed
3249 * or moved - in this case we need to wait for the ancestor's rename
3250 * or move operation before we can do the move/rename for the current
3251 * inode.
3252 */
3253 register_upper_dirs = 0;
3254 ino = parent_ino_after;
3255again:
3256 while ((ret == 0 || register_upper_dirs) && ino > sctx->cur_ino) {
3257 u64 parent_gen;
3258
3259 fs_path_reset(path_before);
3260 fs_path_reset(path_after);
3261
3262 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
3263 &parent_gen, path_after);
3264 if (ret < 0)
3265 goto out;
3266 ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
3267 NULL, path_before);
3268 if (ret == -ENOENT) {
3269 ret = 0;
3270 break;
3271 } else if (ret < 0) {
3272 goto out;
3273 }
3274
3275 len1 = fs_path_len(path_before);
3276 len2 = fs_path_len(path_after);
3277 if (parent_ino_before != parent_ino_after || len1 != len2 ||
3278 memcmp(path_before->start, path_after->start, len1)) {
3279 ret = 1;
3280 if (register_upper_dirs) {
3281 break;
3282 } else {
3283 register_upper_dirs = 1;
3284 ino = parent_ref->dir;
3285 gen = parent_ref->dir_gen;
3286 goto again;
3287 }
3288 } else if (register_upper_dirs) {
3289 ret = add_pending_dir_move(sctx, ino, gen,
3290 parent_ino_after);
3291 if (ret < 0 && ret != -EEXIST)
3292 goto out;
3293 }
3294
3295 ino = parent_ino_after;
3296 gen = parent_gen;
3297 }
3298
3071out: 3299out:
3072 fs_path_free(path_before); 3300 fs_path_free(path_before);
3073 fs_path_free(path_after); 3301 fs_path_free(path_after);
@@ -3089,6 +3317,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
3089 u64 ow_gen; 3317 u64 ow_gen;
3090 int did_overwrite = 0; 3318 int did_overwrite = 0;
3091 int is_orphan = 0; 3319 int is_orphan = 0;
3320 u64 last_dir_ino_rm = 0;
3092 3321
3093verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); 3322verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3094 3323
@@ -3227,9 +3456,14 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3227 * dirs, we always have one new and one deleted 3456 * dirs, we always have one new and one deleted
3228 * ref. The deleted ref is ignored later. 3457 * ref. The deleted ref is ignored later.
3229 */ 3458 */
3230 if (wait_for_parent_move(sctx, cur)) { 3459 ret = wait_for_parent_move(sctx, cur);
3460 if (ret < 0)
3461 goto out;
3462 if (ret) {
3231 ret = add_pending_dir_move(sctx, 3463 ret = add_pending_dir_move(sctx,
3232 cur->dir); 3464 sctx->cur_ino,
3465 sctx->cur_inode_gen,
3466 cur->dir);
3233 *pending_move = 1; 3467 *pending_move = 1;
3234 } else { 3468 } else {
3235 ret = send_rename(sctx, valid_path, 3469 ret = send_rename(sctx, valid_path,
@@ -3259,7 +3493,8 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3259 * later, we do this check again and rmdir it then if possible. 3493 * later, we do this check again and rmdir it then if possible.
3260 * See the use of check_dirs for more details. 3494 * See the use of check_dirs for more details.
3261 */ 3495 */
3262 ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_ino); 3496 ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen,
3497 sctx->cur_ino);
3263 if (ret < 0) 3498 if (ret < 0)
3264 goto out; 3499 goto out;
3265 if (ret) { 3500 if (ret) {
@@ -3350,8 +3585,10 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3350 ret = send_utimes(sctx, cur->dir, cur->dir_gen); 3585 ret = send_utimes(sctx, cur->dir, cur->dir_gen);
3351 if (ret < 0) 3586 if (ret < 0)
3352 goto out; 3587 goto out;
3353 } else if (ret == inode_state_did_delete) { 3588 } else if (ret == inode_state_did_delete &&
3354 ret = can_rmdir(sctx, cur->dir, sctx->cur_ino); 3589 cur->dir != last_dir_ino_rm) {
3590 ret = can_rmdir(sctx, cur->dir, cur->dir_gen,
3591 sctx->cur_ino);
3355 if (ret < 0) 3592 if (ret < 0)
3356 goto out; 3593 goto out;
3357 if (ret) { 3594 if (ret) {
@@ -3362,6 +3599,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3362 ret = send_rmdir(sctx, valid_path); 3599 ret = send_rmdir(sctx, valid_path);
3363 if (ret < 0) 3600 if (ret < 0)
3364 goto out; 3601 goto out;
3602 last_dir_ino_rm = cur->dir;
3365 } 3603 }
3366 } 3604 }
3367 } 3605 }
@@ -3375,9 +3613,8 @@ out:
3375 return ret; 3613 return ret;
3376} 3614}
3377 3615
3378static int __record_new_ref(int num, u64 dir, int index, 3616static int record_ref(struct btrfs_root *root, int num, u64 dir, int index,
3379 struct fs_path *name, 3617 struct fs_path *name, void *ctx, struct list_head *refs)
3380 void *ctx)
3381{ 3618{
3382 int ret = 0; 3619 int ret = 0;
3383 struct send_ctx *sctx = ctx; 3620 struct send_ctx *sctx = ctx;
@@ -3388,7 +3625,7 @@ static int __record_new_ref(int num, u64 dir, int index,
3388 if (!p) 3625 if (!p)
3389 return -ENOMEM; 3626 return -ENOMEM;
3390 3627
3391 ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL, 3628 ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL,
3392 NULL, NULL); 3629 NULL, NULL);
3393 if (ret < 0) 3630 if (ret < 0)
3394 goto out; 3631 goto out;
@@ -3400,7 +3637,7 @@ static int __record_new_ref(int num, u64 dir, int index,
3400 if (ret < 0) 3637 if (ret < 0)
3401 goto out; 3638 goto out;
3402 3639
3403 ret = record_ref(&sctx->new_refs, dir, gen, p); 3640 ret = __record_ref(refs, dir, gen, p);
3404 3641
3405out: 3642out:
3406 if (ret) 3643 if (ret)
@@ -3408,37 +3645,23 @@ out:
3408 return ret; 3645 return ret;
3409} 3646}
3410 3647
3648static int __record_new_ref(int num, u64 dir, int index,
3649 struct fs_path *name,
3650 void *ctx)
3651{
3652 struct send_ctx *sctx = ctx;
3653 return record_ref(sctx->send_root, num, dir, index, name,
3654 ctx, &sctx->new_refs);
3655}
3656
3657
3411static int __record_deleted_ref(int num, u64 dir, int index, 3658static int __record_deleted_ref(int num, u64 dir, int index,
3412 struct fs_path *name, 3659 struct fs_path *name,
3413 void *ctx) 3660 void *ctx)
3414{ 3661{
3415 int ret = 0;
3416 struct send_ctx *sctx = ctx; 3662 struct send_ctx *sctx = ctx;
3417 struct fs_path *p; 3663 return record_ref(sctx->parent_root, num, dir, index, name,
3418 u64 gen; 3664 ctx, &sctx->deleted_refs);
3419
3420 p = fs_path_alloc();
3421 if (!p)
3422 return -ENOMEM;
3423
3424 ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL,
3425 NULL, NULL);
3426 if (ret < 0)
3427 goto out;
3428
3429 ret = get_cur_path(sctx, dir, gen, p);
3430 if (ret < 0)
3431 goto out;
3432 ret = fs_path_add_path(p, name);
3433 if (ret < 0)
3434 goto out;
3435
3436 ret = record_ref(&sctx->deleted_refs, dir, gen, p);
3437
3438out:
3439 if (ret)
3440 fs_path_free(p);
3441 return ret;
3442} 3665}
3443 3666
3444static int record_new_ref(struct send_ctx *sctx) 3667static int record_new_ref(struct send_ctx *sctx)
@@ -3619,21 +3842,31 @@ static int process_all_refs(struct send_ctx *sctx,
3619 root = sctx->parent_root; 3842 root = sctx->parent_root;
3620 cb = __record_deleted_ref; 3843 cb = __record_deleted_ref;
3621 } else { 3844 } else {
3622 BUG(); 3845 btrfs_err(sctx->send_root->fs_info,
3846 "Wrong command %d in process_all_refs", cmd);
3847 ret = -EINVAL;
3848 goto out;
3623 } 3849 }
3624 3850
3625 key.objectid = sctx->cmp_key->objectid; 3851 key.objectid = sctx->cmp_key->objectid;
3626 key.type = BTRFS_INODE_REF_KEY; 3852 key.type = BTRFS_INODE_REF_KEY;
3627 key.offset = 0; 3853 key.offset = 0;
3628 while (1) { 3854 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3629 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 3855 if (ret < 0)
3630 if (ret < 0) 3856 goto out;
3631 goto out;
3632 if (ret)
3633 break;
3634 3857
3858 while (1) {
3635 eb = path->nodes[0]; 3859 eb = path->nodes[0];
3636 slot = path->slots[0]; 3860 slot = path->slots[0];
3861 if (slot >= btrfs_header_nritems(eb)) {
3862 ret = btrfs_next_leaf(root, path);
3863 if (ret < 0)
3864 goto out;
3865 else if (ret > 0)
3866 break;
3867 continue;
3868 }
3869
3637 btrfs_item_key_to_cpu(eb, &found_key, slot); 3870 btrfs_item_key_to_cpu(eb, &found_key, slot);
3638 3871
3639 if (found_key.objectid != key.objectid || 3872 if (found_key.objectid != key.objectid ||
@@ -3642,11 +3875,10 @@ static int process_all_refs(struct send_ctx *sctx,
3642 break; 3875 break;
3643 3876
3644 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); 3877 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
3645 btrfs_release_path(path);
3646 if (ret < 0) 3878 if (ret < 0)
3647 goto out; 3879 goto out;
3648 3880
3649 key.offset = found_key.offset + 1; 3881 path->slots[0]++;
3650 } 3882 }
3651 btrfs_release_path(path); 3883 btrfs_release_path(path);
3652 3884
@@ -3927,19 +4159,25 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
3927 key.objectid = sctx->cmp_key->objectid; 4159 key.objectid = sctx->cmp_key->objectid;
3928 key.type = BTRFS_XATTR_ITEM_KEY; 4160 key.type = BTRFS_XATTR_ITEM_KEY;
3929 key.offset = 0; 4161 key.offset = 0;
3930 while (1) { 4162 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3931 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 4163 if (ret < 0)
3932 if (ret < 0) 4164 goto out;
3933 goto out;
3934 if (ret) {
3935 ret = 0;
3936 goto out;
3937 }
3938 4165
4166 while (1) {
3939 eb = path->nodes[0]; 4167 eb = path->nodes[0];
3940 slot = path->slots[0]; 4168 slot = path->slots[0];
3941 btrfs_item_key_to_cpu(eb, &found_key, slot); 4169 if (slot >= btrfs_header_nritems(eb)) {
4170 ret = btrfs_next_leaf(root, path);
4171 if (ret < 0) {
4172 goto out;
4173 } else if (ret > 0) {
4174 ret = 0;
4175 break;
4176 }
4177 continue;
4178 }
3942 4179
4180 btrfs_item_key_to_cpu(eb, &found_key, slot);
3943 if (found_key.objectid != key.objectid || 4181 if (found_key.objectid != key.objectid ||
3944 found_key.type != key.type) { 4182 found_key.type != key.type) {
3945 ret = 0; 4183 ret = 0;
@@ -3951,8 +4189,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
3951 if (ret < 0) 4189 if (ret < 0)
3952 goto out; 4190 goto out;
3953 4191
3954 btrfs_release_path(path); 4192 path->slots[0]++;
3955 key.offset = found_key.offset + 1;
3956 } 4193 }
3957 4194
3958out: 4195out:
@@ -3991,6 +4228,13 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
3991 goto out; 4228 goto out;
3992 4229
3993 last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT; 4230 last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT;
4231
4232 /* initial readahead */
4233 memset(&sctx->ra, 0, sizeof(struct file_ra_state));
4234 file_ra_state_init(&sctx->ra, inode->i_mapping);
4235 btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index,
4236 last_index - index + 1);
4237
3994 while (index <= last_index) { 4238 while (index <= last_index) {
3995 unsigned cur_len = min_t(unsigned, len, 4239 unsigned cur_len = min_t(unsigned, len,
3996 PAGE_CACHE_SIZE - pg_offset); 4240 PAGE_CACHE_SIZE - pg_offset);
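The fill_read_buf() hunk above primes readahead once for the whole range instead of relying on per-page heuristics inside the copy loop. A minimal sketch of the idiom, assuming the sctx->ra field added to struct send_ctx elsewhere in this series:

	pgoff_t index = offset >> PAGE_CACHE_SHIFT;
	pgoff_t last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT;

	/* Reset any readahead state left over from the previous range. */
	memset(&sctx->ra, 0, sizeof(struct file_ra_state));
	file_ra_state_init(&sctx->ra, inode->i_mapping);

	/* Request the whole page range before the copy loop touches it. */
	btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index,
		       last_index - index + 1);

With 4KiB pages, a 128KiB read at offset 0 covers indexes 0 through 31, so one call queues all 32 pages up front.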
@@ -4763,18 +5007,19 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
4763 ret = apply_children_dir_moves(sctx); 5007 ret = apply_children_dir_moves(sctx);
4764 if (ret) 5008 if (ret)
4765 goto out; 5009 goto out;
5010 /*
5011 * Need to send that every time, no matter if it actually
5012 * changed between the two trees as we have done changes to
5013 * the inode before. If our inode is a directory and it's
5014 * waiting to be moved/renamed, we will send its utimes when
5015 * it's moved/renamed, therefore we don't need to do it here.
5016 */
5017 sctx->send_progress = sctx->cur_ino + 1;
5018 ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
5019 if (ret < 0)
5020 goto out;
4766 } 5021 }
4767 5022
4768 /*
4769 * Need to send that every time, no matter if it actually
4770 * changed between the two trees as we have done changes to
4771 * the inode before.
4772 */
4773 sctx->send_progress = sctx->cur_ino + 1;
4774 ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
4775 if (ret < 0)
4776 goto out;
4777
4778out: 5023out:
4779 return ret; 5024 return ret;
4780} 5025}
@@ -4840,6 +5085,8 @@ static int changed_inode(struct send_ctx *sctx,
4840 sctx->left_path->nodes[0], left_ii); 5085 sctx->left_path->nodes[0], left_ii);
4841 sctx->cur_inode_mode = btrfs_inode_mode( 5086 sctx->cur_inode_mode = btrfs_inode_mode(
4842 sctx->left_path->nodes[0], left_ii); 5087 sctx->left_path->nodes[0], left_ii);
5088 sctx->cur_inode_rdev = btrfs_inode_rdev(
5089 sctx->left_path->nodes[0], left_ii);
4843 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) 5090 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
4844 ret = send_create_inode_if_needed(sctx); 5091 ret = send_create_inode_if_needed(sctx);
4845 } else if (result == BTRFS_COMPARE_TREE_DELETED) { 5092 } else if (result == BTRFS_COMPARE_TREE_DELETED) {
@@ -4884,6 +5131,8 @@ static int changed_inode(struct send_ctx *sctx,
4884 sctx->left_path->nodes[0], left_ii); 5131 sctx->left_path->nodes[0], left_ii);
4885 sctx->cur_inode_mode = btrfs_inode_mode( 5132 sctx->cur_inode_mode = btrfs_inode_mode(
4886 sctx->left_path->nodes[0], left_ii); 5133 sctx->left_path->nodes[0], left_ii);
5134 sctx->cur_inode_rdev = btrfs_inode_rdev(
5135 sctx->left_path->nodes[0], left_ii);
4887 ret = send_create_inode_if_needed(sctx); 5136 ret = send_create_inode_if_needed(sctx);
4888 if (ret < 0) 5137 if (ret < 0)
4889 goto out; 5138 goto out;
@@ -5118,6 +5367,7 @@ out:
5118static int full_send_tree(struct send_ctx *sctx) 5367static int full_send_tree(struct send_ctx *sctx)
5119{ 5368{
5120 int ret; 5369 int ret;
5370 struct btrfs_trans_handle *trans = NULL;
5121 struct btrfs_root *send_root = sctx->send_root; 5371 struct btrfs_root *send_root = sctx->send_root;
5122 struct btrfs_key key; 5372 struct btrfs_key key;
5123 struct btrfs_key found_key; 5373 struct btrfs_key found_key;
@@ -5139,6 +5389,19 @@ static int full_send_tree(struct send_ctx *sctx)
5139 key.type = BTRFS_INODE_ITEM_KEY; 5389 key.type = BTRFS_INODE_ITEM_KEY;
5140 key.offset = 0; 5390 key.offset = 0;
5141 5391
5392join_trans:
5393 /*
5394 * We need to make sure the transaction does not get committed
5395 * while we do anything on commit roots. Join a transaction to prevent
5396 * this.
5397 */
5398 trans = btrfs_join_transaction(send_root);
5399 if (IS_ERR(trans)) {
5400 ret = PTR_ERR(trans);
5401 trans = NULL;
5402 goto out;
5403 }
5404
5142 /* 5405 /*
5143 * Make sure the tree has not changed after re-joining. We detect this 5406 * Make sure the tree has not changed after re-joining. We detect this
5144 * by comparing start_ctransid and ctransid. They should always match. 5407 * by comparing start_ctransid and ctransid. They should always match.
@@ -5162,6 +5425,19 @@ static int full_send_tree(struct send_ctx *sctx)
5162 goto out_finish; 5425 goto out_finish;
5163 5426
5164 while (1) { 5427 while (1) {
5428 /*
 5429 * When someone wants to commit while we iterate, end the
5430 * joined transaction and rejoin.
5431 */
5432 if (btrfs_should_end_transaction(trans, send_root)) {
5433 ret = btrfs_end_transaction(trans, send_root);
5434 trans = NULL;
5435 if (ret < 0)
5436 goto out;
5437 btrfs_release_path(path);
5438 goto join_trans;
5439 }
5440
5165 eb = path->nodes[0]; 5441 eb = path->nodes[0];
5166 slot = path->slots[0]; 5442 slot = path->slots[0];
5167 btrfs_item_key_to_cpu(eb, &found_key, slot); 5443 btrfs_item_key_to_cpu(eb, &found_key, slot);
@@ -5189,6 +5465,12 @@ out_finish:
5189 5465
5190out: 5466out:
5191 btrfs_free_path(path); 5467 btrfs_free_path(path);
5468 if (trans) {
5469 if (!ret)
5470 ret = btrfs_end_transaction(trans, send_root);
5471 else
5472 btrfs_end_transaction(trans, send_root);
5473 }
5192 return ret; 5474 return ret;
5193} 5475}
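Taken together, the full_send_tree() hunks above implement a join/rejoin dance: hold a joined transaction so the commit roots stay stable, but yield whenever a commit is waiting. A condensed sketch, with more_items() and send_item() as hypothetical placeholders for the send loop body:

again:
	trans = btrfs_join_transaction(send_root);	/* blocks commits */
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	while (more_items(path)) {
		if (btrfs_should_end_transaction(trans, send_root)) {
			/* A commit is waiting: get out of its way. */
			ret = btrfs_end_transaction(trans, send_root);
			trans = NULL;
			if (ret < 0)
				return ret;
			btrfs_release_path(path);	/* drop tree locks */
			goto again;
		}
		ret = send_item(sctx, path);
		if (ret < 0)
			break;
	}
	if (trans) {
		int err = btrfs_end_transaction(trans, send_root);
		if (!ret)
			ret = err;
	}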
5194 5476
@@ -5340,6 +5622,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5340 5622
5341 sctx->pending_dir_moves = RB_ROOT; 5623 sctx->pending_dir_moves = RB_ROOT;
5342 sctx->waiting_dir_moves = RB_ROOT; 5624 sctx->waiting_dir_moves = RB_ROOT;
5625 sctx->orphan_dirs = RB_ROOT;
5343 5626
5344 sctx->clone_roots = vzalloc(sizeof(struct clone_root) * 5627 sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
5345 (arg->clone_sources_count + 1)); 5628 (arg->clone_sources_count + 1));
@@ -5477,6 +5760,16 @@ out:
5477 kfree(dm); 5760 kfree(dm);
5478 } 5761 }
5479 5762
5763 WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
5764 while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
5765 struct rb_node *n;
5766 struct orphan_dir_info *odi;
5767
5768 n = rb_first(&sctx->orphan_dirs);
5769 odi = rb_entry(n, struct orphan_dir_info, node);
5770 free_orphan_dir_info(sctx, odi);
5771 }
5772
5480 if (sort_clone_roots) { 5773 if (sort_clone_roots) {
5481 for (i = 0; i < sctx->clone_roots_cnt; i++) 5774 for (i = 0; i < sctx->clone_roots_cnt; i++)
5482 btrfs_root_dec_send_in_progress( 5775 btrfs_root_dec_send_in_progress(
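The teardown hunk above drains the new orphan_dirs rbtree. The loop terminates only because free_orphan_dir_info() also erases the node it is handed; a sketch with that invariant spelled out:

	while (!RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
		struct rb_node *n = rb_first(&sctx->orphan_dirs);
		struct orphan_dir_info *odi;

		odi = rb_entry(n, struct orphan_dir_info, node);
		/* Must rb_erase() n, or rb_first() returns it forever. */
		free_orphan_dir_info(sctx, odi);
	}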
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d04db817be5c..9dbf42395153 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1305,13 +1305,6 @@ error_fs_info:
1305 return ERR_PTR(error); 1305 return ERR_PTR(error);
1306} 1306}
1307 1307
1308static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit)
1309{
1310 spin_lock_irq(&workers->lock);
1311 workers->max_workers = new_limit;
1312 spin_unlock_irq(&workers->lock);
1313}
1314
1315static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, 1308static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1316 int new_pool_size, int old_pool_size) 1309 int new_pool_size, int old_pool_size)
1317{ 1310{
@@ -1323,21 +1316,20 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1323 btrfs_info(fs_info, "resize thread pool %d -> %d", 1316 btrfs_info(fs_info, "resize thread pool %d -> %d",
1324 old_pool_size, new_pool_size); 1317 old_pool_size, new_pool_size);
1325 1318
1326 btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size); 1319 btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
1327 btrfs_set_max_workers(&fs_info->workers, new_pool_size); 1320 btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
1328 btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size); 1321 btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size);
1329 btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size); 1322 btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
1330 btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size); 1323 btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
1331 btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size); 1324 btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
1332 btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size); 1325 btrfs_workqueue_set_max(fs_info->endio_meta_write_workers,
1333 btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size); 1326 new_pool_size);
1334 btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size); 1327 btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
1335 btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size); 1328 btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
1336 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); 1329 btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
1337 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); 1330 btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size);
1338 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); 1331 btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
1339 btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers, 1332 new_pool_size);
1340 new_pool_size);
1341} 1333}
1342 1334
1343static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info) 1335static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info)
@@ -1388,6 +1380,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1388 unsigned int old_metadata_ratio = fs_info->metadata_ratio; 1380 unsigned int old_metadata_ratio = fs_info->metadata_ratio;
1389 int ret; 1381 int ret;
1390 1382
1383 sync_filesystem(sb);
1391 btrfs_remount_prepare(fs_info); 1384 btrfs_remount_prepare(fs_info);
1392 1385
1393 ret = btrfs_parse_options(root, data); 1386 ret = btrfs_parse_options(root, data);
@@ -1479,6 +1472,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1479 sb->s_flags &= ~MS_RDONLY; 1472 sb->s_flags &= ~MS_RDONLY;
1480 } 1473 }
1481out: 1474out:
1475 wake_up_process(fs_info->transaction_kthread);
1482 btrfs_remount_cleanup(fs_info, old_opts); 1476 btrfs_remount_cleanup(fs_info, old_opts);
1483 return 0; 1477 return 0;
1484 1478
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 865f4cf9a769..c5eb2143dc66 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -24,6 +24,7 @@
24#include <linux/kobject.h> 24#include <linux/kobject.h>
25#include <linux/bug.h> 25#include <linux/bug.h>
26#include <linux/genhd.h> 26#include <linux/genhd.h>
27#include <linux/debugfs.h>
27 28
28#include "ctree.h" 29#include "ctree.h"
29#include "disk-io.h" 30#include "disk-io.h"
@@ -599,6 +600,12 @@ static int add_device_membership(struct btrfs_fs_info *fs_info)
599/* /sys/fs/btrfs/ entry */ 600/* /sys/fs/btrfs/ entry */
600static struct kset *btrfs_kset; 601static struct kset *btrfs_kset;
601 602
603/* /sys/kernel/debug/btrfs */
604static struct dentry *btrfs_debugfs_root_dentry;
605
606/* Debugging tunables and exported data */
607u64 btrfs_debugfs_test;
608
602int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) 609int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
603{ 610{
604 int error; 611 int error;
@@ -642,27 +649,41 @@ failure:
642 return error; 649 return error;
643} 650}
644 651
652static int btrfs_init_debugfs(void)
653{
654#ifdef CONFIG_DEBUG_FS
655 btrfs_debugfs_root_dentry = debugfs_create_dir("btrfs", NULL);
656 if (!btrfs_debugfs_root_dentry)
657 return -ENOMEM;
658
659 debugfs_create_u64("test", S_IRUGO | S_IWUGO, btrfs_debugfs_root_dentry,
660 &btrfs_debugfs_test);
661#endif
662 return 0;
663}
664
645int btrfs_init_sysfs(void) 665int btrfs_init_sysfs(void)
646{ 666{
647 int ret; 667 int ret;
668
648 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); 669 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
649 if (!btrfs_kset) 670 if (!btrfs_kset)
650 return -ENOMEM; 671 return -ENOMEM;
651 672
652 init_feature_attrs(); 673 ret = btrfs_init_debugfs();
674 if (ret)
675 return ret;
653 676
677 init_feature_attrs();
654 ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); 678 ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
655 if (ret) {
656 kset_unregister(btrfs_kset);
657 return ret;
658 }
659 679
660 return 0; 680 return ret;
661} 681}
662 682
663void btrfs_exit_sysfs(void) 683void btrfs_exit_sysfs(void)
664{ 684{
665 sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); 685 sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
666 kset_unregister(btrfs_kset); 686 kset_unregister(btrfs_kset);
687 debugfs_remove_recursive(btrfs_debugfs_root_dentry);
667} 688}
668 689
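With the sysfs.c hunks above, loading the module creates /sys/kernel/debug/btrfs/test (only when CONFIG_DEBUG_FS is set; otherwise btrfs_init_debugfs() is a no-op returning 0). The debugfs core reads and writes the u64 directly, so kernel code can branch on it with no extra plumbing; a hypothetical use site:

	/* extern u64 btrfs_debugfs_test; comes from sysfs.h below */
	if (btrfs_debugfs_test)
		pr_info("btrfs: debugfs test tunable = %llu\n",
			(unsigned long long)btrfs_debugfs_test);

From userspace: echo 1 > /sys/kernel/debug/btrfs/test.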
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index f3cea3710d44..9ab576318a84 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -1,6 +1,11 @@
1#ifndef _BTRFS_SYSFS_H_ 1#ifndef _BTRFS_SYSFS_H_
2#define _BTRFS_SYSFS_H_ 2#define _BTRFS_SYSFS_H_
3 3
4/*
5 * Data exported through sysfs
6 */
7extern u64 btrfs_debugfs_test;
8
4enum btrfs_feature_set { 9enum btrfs_feature_set {
5 FEAT_COMPAT, 10 FEAT_COMPAT,
6 FEAT_COMPAT_RO, 11 FEAT_COMPAT_RO,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 34cd83184c4a..a04707f740d6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -683,7 +683,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
683 int lock = (trans->type != TRANS_JOIN_NOLOCK); 683 int lock = (trans->type != TRANS_JOIN_NOLOCK);
684 int err = 0; 684 int err = 0;
685 685
686 if (--trans->use_count) { 686 if (trans->use_count > 1) {
687 trans->use_count--;
687 trans->block_rsv = trans->orig_rsv; 688 trans->block_rsv = trans->orig_rsv;
688 return 0; 689 return 0;
689 } 690 }
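The __btrfs_end_transaction() hunk above changes when the handle's use_count is dropped; a worked comparison (values illustrative):

	/*
	 * use_count == 2 (nested join):
	 *   old: --use_count -> 1, truthy, early return
	 *   new: 2 > 1, use_count-- -> 1, early return
	 *
	 * use_count == 1 (last reference):
	 *   old: --use_count -> 0, falls through, so the throttle path
	 *        had to re-increment before btrfs_commit_transaction()
	 *   new: 1 > 1 is false, falls through with the count untouched,
	 *        so btrfs_commit_transaction() inherits a live handle
	 */

That is why the throttle hunk below can delete the compensating trans->use_count++.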
@@ -731,17 +732,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
731 } 732 }
732 733
733 if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) { 734 if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
734 if (throttle) { 735 if (throttle)
735 /*
736 * We may race with somebody else here so end up having
737 * to call end_transaction on ourselves again, so inc
738 * our use_count.
739 */
740 trans->use_count++;
741 return btrfs_commit_transaction(trans, root); 736 return btrfs_commit_transaction(trans, root);
742 } else { 737 else
743 wake_up_process(info->transaction_kthread); 738 wake_up_process(info->transaction_kthread);
744 }
745 } 739 }
746 740
747 if (trans->type & __TRANS_FREEZABLE) 741 if (trans->type & __TRANS_FREEZABLE)
@@ -1578,10 +1572,9 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1578 1572
1579 trace_btrfs_transaction_commit(root); 1573 trace_btrfs_transaction_commit(root);
1580 1574
1581 btrfs_scrub_continue(root);
1582
1583 if (current->journal_info == trans) 1575 if (current->journal_info == trans)
1584 current->journal_info = NULL; 1576 current->journal_info = NULL;
1577 btrfs_scrub_cancel(root->fs_info);
1585 1578
1586 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1579 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1587} 1580}
@@ -1621,7 +1614,7 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1621static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) 1614static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
1622{ 1615{
1623 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) 1616 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
1624 return btrfs_start_delalloc_roots(fs_info, 1); 1617 return btrfs_start_delalloc_roots(fs_info, 1, -1);
1625 return 0; 1618 return 0;
1626} 1619}
1627 1620
@@ -1754,7 +1747,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1754 /* ->aborted might be set after the previous check, so check it */ 1747 /* ->aborted might be set after the previous check, so check it */
1755 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { 1748 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
1756 ret = cur_trans->aborted; 1749 ret = cur_trans->aborted;
1757 goto cleanup_transaction; 1750 goto scrub_continue;
1758 } 1751 }
1759 /* 1752 /*
1760 * the reloc mutex makes sure that we stop 1753 * the reloc mutex makes sure that we stop
@@ -1771,7 +1764,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1771 ret = create_pending_snapshots(trans, root->fs_info); 1764 ret = create_pending_snapshots(trans, root->fs_info);
1772 if (ret) { 1765 if (ret) {
1773 mutex_unlock(&root->fs_info->reloc_mutex); 1766 mutex_unlock(&root->fs_info->reloc_mutex);
1774 goto cleanup_transaction; 1767 goto scrub_continue;
1775 } 1768 }
1776 1769
1777 /* 1770 /*
@@ -1787,13 +1780,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1787 ret = btrfs_run_delayed_items(trans, root); 1780 ret = btrfs_run_delayed_items(trans, root);
1788 if (ret) { 1781 if (ret) {
1789 mutex_unlock(&root->fs_info->reloc_mutex); 1782 mutex_unlock(&root->fs_info->reloc_mutex);
1790 goto cleanup_transaction; 1783 goto scrub_continue;
1791 } 1784 }
1792 1785
1793 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); 1786 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1794 if (ret) { 1787 if (ret) {
1795 mutex_unlock(&root->fs_info->reloc_mutex); 1788 mutex_unlock(&root->fs_info->reloc_mutex);
1796 goto cleanup_transaction; 1789 goto scrub_continue;
1797 } 1790 }
1798 1791
1799 /* 1792 /*
@@ -1823,7 +1816,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1823 if (ret) { 1816 if (ret) {
1824 mutex_unlock(&root->fs_info->tree_log_mutex); 1817 mutex_unlock(&root->fs_info->tree_log_mutex);
1825 mutex_unlock(&root->fs_info->reloc_mutex); 1818 mutex_unlock(&root->fs_info->reloc_mutex);
1826 goto cleanup_transaction; 1819 goto scrub_continue;
1827 } 1820 }
1828 1821
1829 /* 1822 /*
@@ -1844,7 +1837,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1844 if (ret) { 1837 if (ret) {
1845 mutex_unlock(&root->fs_info->tree_log_mutex); 1838 mutex_unlock(&root->fs_info->tree_log_mutex);
1846 mutex_unlock(&root->fs_info->reloc_mutex); 1839 mutex_unlock(&root->fs_info->reloc_mutex);
1847 goto cleanup_transaction; 1840 goto scrub_continue;
1848 } 1841 }
1849 1842
1850 /* 1843 /*
@@ -1855,7 +1848,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1855 ret = cur_trans->aborted; 1848 ret = cur_trans->aborted;
1856 mutex_unlock(&root->fs_info->tree_log_mutex); 1849 mutex_unlock(&root->fs_info->tree_log_mutex);
1857 mutex_unlock(&root->fs_info->reloc_mutex); 1850 mutex_unlock(&root->fs_info->reloc_mutex);
1858 goto cleanup_transaction; 1851 goto scrub_continue;
1859 } 1852 }
1860 1853
1861 btrfs_prepare_extent_commit(trans, root); 1854 btrfs_prepare_extent_commit(trans, root);
@@ -1891,13 +1884,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1891 btrfs_error(root->fs_info, ret, 1884 btrfs_error(root->fs_info, ret,
1892 "Error while writing out transaction"); 1885 "Error while writing out transaction");
1893 mutex_unlock(&root->fs_info->tree_log_mutex); 1886 mutex_unlock(&root->fs_info->tree_log_mutex);
1894 goto cleanup_transaction; 1887 goto scrub_continue;
1895 } 1888 }
1896 1889
1897 ret = write_ctree_super(trans, root, 0); 1890 ret = write_ctree_super(trans, root, 0);
1898 if (ret) { 1891 if (ret) {
1899 mutex_unlock(&root->fs_info->tree_log_mutex); 1892 mutex_unlock(&root->fs_info->tree_log_mutex);
1900 goto cleanup_transaction; 1893 goto scrub_continue;
1901 } 1894 }
1902 1895
1903 /* 1896 /*
@@ -1940,6 +1933,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1940 1933
1941 return ret; 1934 return ret;
1942 1935
1936scrub_continue:
1937 btrfs_scrub_continue(root);
1943cleanup_transaction: 1938cleanup_transaction:
1944 btrfs_trans_release_metadata(trans, root); 1939 btrfs_trans_release_metadata(trans, root);
1945 trans->block_rsv = NULL; 1940 trans->block_rsv = NULL;
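After the transaction.c hunks above, btrfs_commit_transaction()'s error paths share a label chain, in outline:

	/*
	 *   if (ret)
	 *           goto scrub_continue;    failures after scrub was paused
	 *   ...
	 *   return ret;                     success
	 *
	 * scrub_continue:
	 *   btrfs_scrub_continue(root);     resume paused scrub exactly once
	 * cleanup_transaction:
	 *   btrfs_trans_release_metadata(trans, root);
	 *   ...
	 */

The resume used to live in the cleanup_transaction() helper, where it also ran for failures that never paused scrub; that helper now cancels scrub outright on aborted commits instead.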
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 39d83da03e03..e2f45fc02610 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -136,13 +136,20 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
136 * syncing the tree wait for us to finish 136 * syncing the tree wait for us to finish
137 */ 137 */
138static int start_log_trans(struct btrfs_trans_handle *trans, 138static int start_log_trans(struct btrfs_trans_handle *trans,
139 struct btrfs_root *root) 139 struct btrfs_root *root,
140 struct btrfs_log_ctx *ctx)
140{ 141{
142 int index;
141 int ret; 143 int ret;
142 int err = 0;
143 144
144 mutex_lock(&root->log_mutex); 145 mutex_lock(&root->log_mutex);
145 if (root->log_root) { 146 if (root->log_root) {
147 if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
148 trans->transid) {
149 ret = -EAGAIN;
150 goto out;
151 }
152
146 if (!root->log_start_pid) { 153 if (!root->log_start_pid) {
147 root->log_start_pid = current->pid; 154 root->log_start_pid = current->pid;
148 root->log_multiple_pids = false; 155 root->log_multiple_pids = false;
@@ -152,27 +159,40 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
152 159
153 atomic_inc(&root->log_batch); 160 atomic_inc(&root->log_batch);
154 atomic_inc(&root->log_writers); 161 atomic_inc(&root->log_writers);
162 if (ctx) {
163 index = root->log_transid % 2;
164 list_add_tail(&ctx->list, &root->log_ctxs[index]);
165 ctx->log_transid = root->log_transid;
166 }
155 mutex_unlock(&root->log_mutex); 167 mutex_unlock(&root->log_mutex);
156 return 0; 168 return 0;
157 } 169 }
158 root->log_multiple_pids = false; 170
159 root->log_start_pid = current->pid; 171 ret = 0;
160 mutex_lock(&root->fs_info->tree_log_mutex); 172 mutex_lock(&root->fs_info->tree_log_mutex);
161 if (!root->fs_info->log_root_tree) { 173 if (!root->fs_info->log_root_tree)
162 ret = btrfs_init_log_root_tree(trans, root->fs_info); 174 ret = btrfs_init_log_root_tree(trans, root->fs_info);
163 if (ret) 175 mutex_unlock(&root->fs_info->tree_log_mutex);
164 err = ret; 176 if (ret)
165 } 177 goto out;
166 if (err == 0 && !root->log_root) { 178
179 if (!root->log_root) {
167 ret = btrfs_add_log_tree(trans, root); 180 ret = btrfs_add_log_tree(trans, root);
168 if (ret) 181 if (ret)
169 err = ret; 182 goto out;
170 } 183 }
171 mutex_unlock(&root->fs_info->tree_log_mutex); 184 root->log_multiple_pids = false;
185 root->log_start_pid = current->pid;
172 atomic_inc(&root->log_batch); 186 atomic_inc(&root->log_batch);
173 atomic_inc(&root->log_writers); 187 atomic_inc(&root->log_writers);
188 if (ctx) {
189 index = root->log_transid % 2;
190 list_add_tail(&ctx->list, &root->log_ctxs[index]);
191 ctx->log_transid = root->log_transid;
192 }
193out:
174 mutex_unlock(&root->log_mutex); 194 mutex_unlock(&root->log_mutex);
175 return err; 195 return ret;
176} 196}
177 197
178/* 198/*
@@ -2359,8 +2379,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
2359 return ret; 2379 return ret;
2360} 2380}
2361 2381
2362static int wait_log_commit(struct btrfs_trans_handle *trans, 2382static void wait_log_commit(struct btrfs_trans_handle *trans,
2363 struct btrfs_root *root, unsigned long transid) 2383 struct btrfs_root *root, int transid)
2364{ 2384{
2365 DEFINE_WAIT(wait); 2385 DEFINE_WAIT(wait);
2366 int index = transid % 2; 2386 int index = transid % 2;
@@ -2375,36 +2395,63 @@ static int wait_log_commit(struct btrfs_trans_handle *trans,
2375 &wait, TASK_UNINTERRUPTIBLE); 2395 &wait, TASK_UNINTERRUPTIBLE);
2376 mutex_unlock(&root->log_mutex); 2396 mutex_unlock(&root->log_mutex);
2377 2397
2378 if (root->fs_info->last_trans_log_full_commit != 2398 if (root->log_transid_committed < transid &&
2379 trans->transid && root->log_transid < transid + 2 &&
2380 atomic_read(&root->log_commit[index])) 2399 atomic_read(&root->log_commit[index]))
2381 schedule(); 2400 schedule();
2382 2401
2383 finish_wait(&root->log_commit_wait[index], &wait); 2402 finish_wait(&root->log_commit_wait[index], &wait);
2384 mutex_lock(&root->log_mutex); 2403 mutex_lock(&root->log_mutex);
2385 } while (root->fs_info->last_trans_log_full_commit != 2404 } while (root->log_transid_committed < transid &&
2386 trans->transid && root->log_transid < transid + 2 &&
2387 atomic_read(&root->log_commit[index])); 2405 atomic_read(&root->log_commit[index]));
2388 return 0;
2389} 2406}
2390 2407
2391static void wait_for_writer(struct btrfs_trans_handle *trans, 2408static void wait_for_writer(struct btrfs_trans_handle *trans,
2392 struct btrfs_root *root) 2409 struct btrfs_root *root)
2393{ 2410{
2394 DEFINE_WAIT(wait); 2411 DEFINE_WAIT(wait);
2395 while (root->fs_info->last_trans_log_full_commit != 2412
2396 trans->transid && atomic_read(&root->log_writers)) { 2413 while (atomic_read(&root->log_writers)) {
2397 prepare_to_wait(&root->log_writer_wait, 2414 prepare_to_wait(&root->log_writer_wait,
2398 &wait, TASK_UNINTERRUPTIBLE); 2415 &wait, TASK_UNINTERRUPTIBLE);
2399 mutex_unlock(&root->log_mutex); 2416 mutex_unlock(&root->log_mutex);
2400 if (root->fs_info->last_trans_log_full_commit != 2417 if (atomic_read(&root->log_writers))
2401 trans->transid && atomic_read(&root->log_writers))
2402 schedule(); 2418 schedule();
2403 mutex_lock(&root->log_mutex); 2419 mutex_lock(&root->log_mutex);
2404 finish_wait(&root->log_writer_wait, &wait); 2420 finish_wait(&root->log_writer_wait, &wait);
2405 } 2421 }
2406} 2422}
2407 2423
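wait_log_commit() and wait_for_writer() above both use the open-coded waitqueue idiom: register on the queue, recheck, sleep, and recheck again under log_mutex before looping. A generic sketch, with cond() standing in for the guarded condition:

	DEFINE_WAIT(wait);

	/* log_mutex is held on entry. */
	while (cond(root)) {
		/* Register before unlocking so a wakeup issued between
		 * the unlock and the recheck cannot be lost. */
		prepare_to_wait(&wq, &wait, TASK_UNINTERRUPTIBLE);
		mutex_unlock(&root->log_mutex);
		if (cond(root))
			schedule();
		finish_wait(&wq, &wait);
		mutex_lock(&root->log_mutex);
	}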
2424static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2425 struct btrfs_log_ctx *ctx)
2426{
2427 if (!ctx)
2428 return;
2429
2430 mutex_lock(&root->log_mutex);
2431 list_del_init(&ctx->list);
2432 mutex_unlock(&root->log_mutex);
2433}
2434
2435/*
2436 * Invoked in log mutex context, or when the caller is sure no other
2437 * task can access the list.
2438 */
2439static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2440 int index, int error)
2441{
2442 struct btrfs_log_ctx *ctx;
2443
2444 if (!error) {
2445 INIT_LIST_HEAD(&root->log_ctxs[index]);
2446 return;
2447 }
2448
2449 list_for_each_entry(ctx, &root->log_ctxs[index], list)
2450 ctx->log_ret = error;
2451
2452 INIT_LIST_HEAD(&root->log_ctxs[index]);
2453}
2454
2408/* 2455/*
2409 * btrfs_sync_log sends a given tree log down to the disk and 2456 * btrfs_sync_log sends a given tree log down to the disk and
2410 * updates the super blocks to record it. When this call is done, 2457 * updates the super blocks to record it. When this call is done,
@@ -2418,7 +2465,7 @@ static void wait_for_writer(struct btrfs_trans_handle *trans,
2418 * that has happened. 2465 * that has happened.
2419 */ 2466 */
2420int btrfs_sync_log(struct btrfs_trans_handle *trans, 2467int btrfs_sync_log(struct btrfs_trans_handle *trans,
2421 struct btrfs_root *root) 2468 struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2422{ 2469{
2423 int index1; 2470 int index1;
2424 int index2; 2471 int index2;
@@ -2426,22 +2473,30 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2426 int ret; 2473 int ret;
2427 struct btrfs_root *log = root->log_root; 2474 struct btrfs_root *log = root->log_root;
2428 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 2475 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
2429 unsigned long log_transid = 0; 2476 int log_transid = 0;
2477 struct btrfs_log_ctx root_log_ctx;
2430 struct blk_plug plug; 2478 struct blk_plug plug;
2431 2479
2432 mutex_lock(&root->log_mutex); 2480 mutex_lock(&root->log_mutex);
2433 log_transid = root->log_transid; 2481 log_transid = ctx->log_transid;
2434 index1 = root->log_transid % 2; 2482 if (root->log_transid_committed >= log_transid) {
2483 mutex_unlock(&root->log_mutex);
2484 return ctx->log_ret;
2485 }
2486
2487 index1 = log_transid % 2;
2435 if (atomic_read(&root->log_commit[index1])) { 2488 if (atomic_read(&root->log_commit[index1])) {
2436 wait_log_commit(trans, root, root->log_transid); 2489 wait_log_commit(trans, root, log_transid);
2437 mutex_unlock(&root->log_mutex); 2490 mutex_unlock(&root->log_mutex);
2438 return 0; 2491 return ctx->log_ret;
2439 } 2492 }
2493 ASSERT(log_transid == root->log_transid);
2440 atomic_set(&root->log_commit[index1], 1); 2494 atomic_set(&root->log_commit[index1], 1);
2441 2495
2442 /* wait for previous tree log sync to complete */ 2496 /* wait for previous tree log sync to complete */
2443 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2497 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2444 wait_log_commit(trans, root, root->log_transid - 1); 2498 wait_log_commit(trans, root, log_transid - 1);
2499
2445 while (1) { 2500 while (1) {
2446 int batch = atomic_read(&root->log_batch); 2501 int batch = atomic_read(&root->log_batch);
2447 /* when we're on an ssd, just kick the log commit out */ 2502 /* when we're on an ssd, just kick the log commit out */
@@ -2456,7 +2511,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2456 } 2511 }
2457 2512
2458 /* bail out if we need to do a full commit */ 2513 /* bail out if we need to do a full commit */
2459 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2514 if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
2515 trans->transid) {
2460 ret = -EAGAIN; 2516 ret = -EAGAIN;
2461 btrfs_free_logged_extents(log, log_transid); 2517 btrfs_free_logged_extents(log, log_transid);
2462 mutex_unlock(&root->log_mutex); 2518 mutex_unlock(&root->log_mutex);
@@ -2477,6 +2533,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2477 blk_finish_plug(&plug); 2533 blk_finish_plug(&plug);
2478 btrfs_abort_transaction(trans, root, ret); 2534 btrfs_abort_transaction(trans, root, ret);
2479 btrfs_free_logged_extents(log, log_transid); 2535 btrfs_free_logged_extents(log, log_transid);
2536 ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
2537 trans->transid;
2480 mutex_unlock(&root->log_mutex); 2538 mutex_unlock(&root->log_mutex);
2481 goto out; 2539 goto out;
2482 } 2540 }
@@ -2486,7 +2544,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2486 root->log_transid++; 2544 root->log_transid++;
2487 log->log_transid = root->log_transid; 2545 log->log_transid = root->log_transid;
2488 root->log_start_pid = 0; 2546 root->log_start_pid = 0;
2489 smp_mb();
2490 /* 2547 /*
2491 * IO has been started, blocks of the log tree have WRITTEN flag set 2548 * IO has been started, blocks of the log tree have WRITTEN flag set
2492 * in their headers. new modifications of the log will be written to 2549 * in their headers. new modifications of the log will be written to
@@ -2494,9 +2551,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2494 */ 2551 */
2495 mutex_unlock(&root->log_mutex); 2552 mutex_unlock(&root->log_mutex);
2496 2553
2554 btrfs_init_log_ctx(&root_log_ctx);
2555
2497 mutex_lock(&log_root_tree->log_mutex); 2556 mutex_lock(&log_root_tree->log_mutex);
2498 atomic_inc(&log_root_tree->log_batch); 2557 atomic_inc(&log_root_tree->log_batch);
2499 atomic_inc(&log_root_tree->log_writers); 2558 atomic_inc(&log_root_tree->log_writers);
2559
2560 index2 = log_root_tree->log_transid % 2;
2561 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
2562 root_log_ctx.log_transid = log_root_tree->log_transid;
2563
2500 mutex_unlock(&log_root_tree->log_mutex); 2564 mutex_unlock(&log_root_tree->log_mutex);
2501 2565
2502 ret = update_log_root(trans, log); 2566 ret = update_log_root(trans, log);
@@ -2509,13 +2573,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2509 } 2573 }
2510 2574
2511 if (ret) { 2575 if (ret) {
2576 if (!list_empty(&root_log_ctx.list))
2577 list_del_init(&root_log_ctx.list);
2578
2512 blk_finish_plug(&plug); 2579 blk_finish_plug(&plug);
2580 ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
2581 trans->transid;
2513 if (ret != -ENOSPC) { 2582 if (ret != -ENOSPC) {
2514 btrfs_abort_transaction(trans, root, ret); 2583 btrfs_abort_transaction(trans, root, ret);
2515 mutex_unlock(&log_root_tree->log_mutex); 2584 mutex_unlock(&log_root_tree->log_mutex);
2516 goto out; 2585 goto out;
2517 } 2586 }
2518 root->fs_info->last_trans_log_full_commit = trans->transid;
2519 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2587 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2520 btrfs_free_logged_extents(log, log_transid); 2588 btrfs_free_logged_extents(log, log_transid);
2521 mutex_unlock(&log_root_tree->log_mutex); 2589 mutex_unlock(&log_root_tree->log_mutex);
@@ -2523,22 +2591,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2523 goto out; 2591 goto out;
2524 } 2592 }
2525 2593
2526 index2 = log_root_tree->log_transid % 2; 2594 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
2595 mutex_unlock(&log_root_tree->log_mutex);
2596 ret = root_log_ctx.log_ret;
2597 goto out;
2598 }
2599
2600 index2 = root_log_ctx.log_transid % 2;
2527 if (atomic_read(&log_root_tree->log_commit[index2])) { 2601 if (atomic_read(&log_root_tree->log_commit[index2])) {
2528 blk_finish_plug(&plug); 2602 blk_finish_plug(&plug);
2529 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2603 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2530 wait_log_commit(trans, log_root_tree, 2604 wait_log_commit(trans, log_root_tree,
2531 log_root_tree->log_transid); 2605 root_log_ctx.log_transid);
2532 btrfs_free_logged_extents(log, log_transid); 2606 btrfs_free_logged_extents(log, log_transid);
2533 mutex_unlock(&log_root_tree->log_mutex); 2607 mutex_unlock(&log_root_tree->log_mutex);
2534 ret = 0; 2608 ret = root_log_ctx.log_ret;
2535 goto out; 2609 goto out;
2536 } 2610 }
2611 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
2537 atomic_set(&log_root_tree->log_commit[index2], 1); 2612 atomic_set(&log_root_tree->log_commit[index2], 1);
2538 2613
2539 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 2614 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
2540 wait_log_commit(trans, log_root_tree, 2615 wait_log_commit(trans, log_root_tree,
2541 log_root_tree->log_transid - 1); 2616 root_log_ctx.log_transid - 1);
2542 } 2617 }
2543 2618
2544 wait_for_writer(trans, log_root_tree); 2619 wait_for_writer(trans, log_root_tree);
@@ -2547,7 +2622,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2547 * now that we've moved on to the tree of log tree roots, 2622 * now that we've moved on to the tree of log tree roots,
2548 * check the full commit flag again 2623 * check the full commit flag again
2549 */ 2624 */
2550 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2625 if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
2626 trans->transid) {
2551 blk_finish_plug(&plug); 2627 blk_finish_plug(&plug);
2552 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2628 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2553 btrfs_free_logged_extents(log, log_transid); 2629 btrfs_free_logged_extents(log, log_transid);
@@ -2561,6 +2637,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2561 EXTENT_DIRTY | EXTENT_NEW); 2637 EXTENT_DIRTY | EXTENT_NEW);
2562 blk_finish_plug(&plug); 2638 blk_finish_plug(&plug);
2563 if (ret) { 2639 if (ret) {
2640 ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
2641 trans->transid;
2564 btrfs_abort_transaction(trans, root, ret); 2642 btrfs_abort_transaction(trans, root, ret);
2565 btrfs_free_logged_extents(log, log_transid); 2643 btrfs_free_logged_extents(log, log_transid);
2566 mutex_unlock(&log_root_tree->log_mutex); 2644 mutex_unlock(&log_root_tree->log_mutex);
@@ -2578,8 +2656,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2578 btrfs_header_level(log_root_tree->node)); 2656 btrfs_header_level(log_root_tree->node));
2579 2657
2580 log_root_tree->log_transid++; 2658 log_root_tree->log_transid++;
2581 smp_mb();
2582
2583 mutex_unlock(&log_root_tree->log_mutex); 2659 mutex_unlock(&log_root_tree->log_mutex);
2584 2660
2585 /* 2661 /*
@@ -2591,6 +2667,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2591 */ 2667 */
2592 ret = write_ctree_super(trans, root->fs_info->tree_root, 1); 2668 ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
2593 if (ret) { 2669 if (ret) {
2670 ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
2671 trans->transid;
2594 btrfs_abort_transaction(trans, root, ret); 2672 btrfs_abort_transaction(trans, root, ret);
2595 goto out_wake_log_root; 2673 goto out_wake_log_root;
2596 } 2674 }
@@ -2601,13 +2679,28 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2601 mutex_unlock(&root->log_mutex); 2679 mutex_unlock(&root->log_mutex);
2602 2680
2603out_wake_log_root: 2681out_wake_log_root:
2682 /*
2683 * We don't need to take log_mutex here because we know all
2684 * the other tasks are blocked.
2685 */
2686 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
2687
2688 mutex_lock(&log_root_tree->log_mutex);
2689 log_root_tree->log_transid_committed++;
2604 atomic_set(&log_root_tree->log_commit[index2], 0); 2690 atomic_set(&log_root_tree->log_commit[index2], 0);
2605 smp_mb(); 2691 mutex_unlock(&log_root_tree->log_mutex);
2692
2606 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2693 if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
2607 wake_up(&log_root_tree->log_commit_wait[index2]); 2694 wake_up(&log_root_tree->log_commit_wait[index2]);
2608out: 2695out:
2696 /* See above. */
2697 btrfs_remove_all_log_ctxs(root, index1, ret);
2698
2699 mutex_lock(&root->log_mutex);
2700 root->log_transid_committed++;
2609 atomic_set(&root->log_commit[index1], 0); 2701 atomic_set(&root->log_commit[index1], 0);
2610 smp_mb(); 2702 mutex_unlock(&root->log_mutex);
2703
2611 if (waitqueue_active(&root->log_commit_wait[index1])) 2704 if (waitqueue_active(&root->log_commit_wait[index1]))
2612 wake_up(&root->log_commit_wait[index1]); 2705 wake_up(&root->log_commit_wait[index1]);
2613 return ret; 2706 return ret;
@@ -3479,7 +3572,8 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3479 3572
3480static int log_one_extent(struct btrfs_trans_handle *trans, 3573static int log_one_extent(struct btrfs_trans_handle *trans,
3481 struct inode *inode, struct btrfs_root *root, 3574 struct inode *inode, struct btrfs_root *root,
3482 struct extent_map *em, struct btrfs_path *path) 3575 struct extent_map *em, struct btrfs_path *path,
3576 struct list_head *logged_list)
3483{ 3577{
3484 struct btrfs_root *log = root->log_root; 3578 struct btrfs_root *log = root->log_root;
3485 struct btrfs_file_extent_item *fi; 3579 struct btrfs_file_extent_item *fi;
@@ -3495,7 +3589,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3495 u64 extent_offset = em->start - em->orig_start; 3589 u64 extent_offset = em->start - em->orig_start;
3496 u64 block_len; 3590 u64 block_len;
3497 int ret; 3591 int ret;
3498 int index = log->log_transid % 2;
3499 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3592 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3500 int extent_inserted = 0; 3593 int extent_inserted = 0;
3501 3594
@@ -3579,17 +3672,12 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3579 * First check and see if our csums are on our outstanding ordered 3672 * First check and see if our csums are on our outstanding ordered
3580 * extents. 3673 * extents.
3581 */ 3674 */
3582again: 3675 list_for_each_entry(ordered, logged_list, log_list) {
3583 spin_lock_irq(&log->log_extents_lock[index]);
3584 list_for_each_entry(ordered, &log->logged_list[index], log_list) {
3585 struct btrfs_ordered_sum *sum; 3676 struct btrfs_ordered_sum *sum;
3586 3677
3587 if (!mod_len) 3678 if (!mod_len)
3588 break; 3679 break;
3589 3680
3590 if (ordered->inode != inode)
3591 continue;
3592
3593 if (ordered->file_offset + ordered->len <= mod_start || 3681 if (ordered->file_offset + ordered->len <= mod_start ||
3594 mod_start + mod_len <= ordered->file_offset) 3682 mod_start + mod_len <= ordered->file_offset)
3595 continue; 3683 continue;
@@ -3632,12 +3720,6 @@ again:
3632 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, 3720 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
3633 &ordered->flags)) 3721 &ordered->flags))
3634 continue; 3722 continue;
3635 atomic_inc(&ordered->refs);
3636 spin_unlock_irq(&log->log_extents_lock[index]);
3637 /*
3638 * we've dropped the lock, we must either break or
3639 * start over after this.
3640 */
3641 3723
3642 if (ordered->csum_bytes_left) { 3724 if (ordered->csum_bytes_left) {
3643 btrfs_start_ordered_extent(inode, ordered, 0); 3725 btrfs_start_ordered_extent(inode, ordered, 0);
@@ -3647,16 +3729,11 @@ again:
3647 3729
3648 list_for_each_entry(sum, &ordered->list, list) { 3730 list_for_each_entry(sum, &ordered->list, list) {
3649 ret = btrfs_csum_file_blocks(trans, log, sum); 3731 ret = btrfs_csum_file_blocks(trans, log, sum);
3650 if (ret) { 3732 if (ret)
3651 btrfs_put_ordered_extent(ordered);
3652 goto unlocked; 3733 goto unlocked;
3653 }
3654 } 3734 }
3655 btrfs_put_ordered_extent(ordered);
3656 goto again;
3657 3735
3658 } 3736 }
3659 spin_unlock_irq(&log->log_extents_lock[index]);
3660unlocked: 3737unlocked:
3661 3738
3662 if (!mod_len || ret) 3739 if (!mod_len || ret)
@@ -3694,7 +3771,8 @@ unlocked:
3694static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 3771static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3695 struct btrfs_root *root, 3772 struct btrfs_root *root,
3696 struct inode *inode, 3773 struct inode *inode,
3697 struct btrfs_path *path) 3774 struct btrfs_path *path,
3775 struct list_head *logged_list)
3698{ 3776{
3699 struct extent_map *em, *n; 3777 struct extent_map *em, *n;
3700 struct list_head extents; 3778 struct list_head extents;
@@ -3752,7 +3830,7 @@ process:
3752 3830
3753 write_unlock(&tree->lock); 3831 write_unlock(&tree->lock);
3754 3832
3755 ret = log_one_extent(trans, inode, root, em, path); 3833 ret = log_one_extent(trans, inode, root, em, path, logged_list);
3756 write_lock(&tree->lock); 3834 write_lock(&tree->lock);
3757 clear_em_logging(tree, em); 3835 clear_em_logging(tree, em);
3758 free_extent_map(em); 3836 free_extent_map(em);
@@ -3788,6 +3866,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3788 struct btrfs_key max_key; 3866 struct btrfs_key max_key;
3789 struct btrfs_root *log = root->log_root; 3867 struct btrfs_root *log = root->log_root;
3790 struct extent_buffer *src = NULL; 3868 struct extent_buffer *src = NULL;
3869 LIST_HEAD(logged_list);
3791 u64 last_extent = 0; 3870 u64 last_extent = 0;
3792 int err = 0; 3871 int err = 0;
3793 int ret; 3872 int ret;
@@ -3836,7 +3915,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3836 3915
3837 mutex_lock(&BTRFS_I(inode)->log_mutex); 3916 mutex_lock(&BTRFS_I(inode)->log_mutex);
3838 3917
3839 btrfs_get_logged_extents(log, inode); 3918 btrfs_get_logged_extents(inode, &logged_list);
3840 3919
3841 /* 3920 /*
3842 * a brute force approach to making sure we get the most uptodate 3921 * a brute force approach to making sure we get the most uptodate
@@ -3962,7 +4041,8 @@ log_extents:
3962 btrfs_release_path(path); 4041 btrfs_release_path(path);
3963 btrfs_release_path(dst_path); 4042 btrfs_release_path(dst_path);
3964 if (fast_search) { 4043 if (fast_search) {
3965 ret = btrfs_log_changed_extents(trans, root, inode, dst_path); 4044 ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
4045 &logged_list);
3966 if (ret) { 4046 if (ret) {
3967 err = ret; 4047 err = ret;
3968 goto out_unlock; 4048 goto out_unlock;
@@ -3987,8 +4067,10 @@ log_extents:
3987 BTRFS_I(inode)->logged_trans = trans->transid; 4067 BTRFS_I(inode)->logged_trans = trans->transid;
3988 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; 4068 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
3989out_unlock: 4069out_unlock:
3990 if (err) 4070 if (unlikely(err))
3991 btrfs_free_logged_extents(log, log->log_transid); 4071 btrfs_put_logged_extents(&logged_list);
4072 else
4073 btrfs_submit_logged_extents(&logged_list, log);
3992 mutex_unlock(&BTRFS_I(inode)->log_mutex); 4074 mutex_unlock(&BTRFS_I(inode)->log_mutex);
3993 4075
3994 btrfs_free_path(path); 4076 btrfs_free_path(path);
@@ -4079,7 +4161,8 @@ out:
4079 */ 4161 */
4080static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 4162static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4081 struct btrfs_root *root, struct inode *inode, 4163 struct btrfs_root *root, struct inode *inode,
4082 struct dentry *parent, int exists_only) 4164 struct dentry *parent, int exists_only,
4165 struct btrfs_log_ctx *ctx)
4083{ 4166{
4084 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 4167 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
4085 struct super_block *sb; 4168 struct super_block *sb;
@@ -4116,9 +4199,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4116 goto end_no_trans; 4199 goto end_no_trans;
4117 } 4200 }
4118 4201
4119 ret = start_log_trans(trans, root); 4202 ret = start_log_trans(trans, root, ctx);
4120 if (ret) 4203 if (ret)
4121 goto end_trans; 4204 goto end_no_trans;
4122 4205
4123 ret = btrfs_log_inode(trans, root, inode, inode_only); 4206 ret = btrfs_log_inode(trans, root, inode, inode_only);
4124 if (ret) 4207 if (ret)
@@ -4166,6 +4249,9 @@ end_trans:
4166 root->fs_info->last_trans_log_full_commit = trans->transid; 4249 root->fs_info->last_trans_log_full_commit = trans->transid;
4167 ret = 1; 4250 ret = 1;
4168 } 4251 }
4252
4253 if (ret)
4254 btrfs_remove_log_ctx(root, ctx);
4169 btrfs_end_log_trans(root); 4255 btrfs_end_log_trans(root);
4170end_no_trans: 4256end_no_trans:
4171 return ret; 4257 return ret;
@@ -4178,12 +4264,14 @@ end_no_trans:
4178 * data on disk. 4264 * data on disk.
4179 */ 4265 */
4180int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 4266int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
4181 struct btrfs_root *root, struct dentry *dentry) 4267 struct btrfs_root *root, struct dentry *dentry,
4268 struct btrfs_log_ctx *ctx)
4182{ 4269{
4183 struct dentry *parent = dget_parent(dentry); 4270 struct dentry *parent = dget_parent(dentry);
4184 int ret; 4271 int ret;
4185 4272
4186 ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0); 4273 ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
4274 0, ctx);
4187 dput(parent); 4275 dput(parent);
4188 4276
4189 return ret; 4277 return ret;
@@ -4420,6 +4508,6 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
4420 root->fs_info->last_trans_committed)) 4508 root->fs_info->last_trans_committed))
4421 return 0; 4509 return 0;
4422 4510
4423 return btrfs_log_inode_parent(trans, root, inode, parent, 1); 4511 return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL);
4424} 4512}
4425 4513
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 1d4ae0d15a70..91b145fce333 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -22,14 +22,28 @@
22/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ 22/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
23#define BTRFS_NO_LOG_SYNC 256 23#define BTRFS_NO_LOG_SYNC 256
24 24
25struct btrfs_log_ctx {
26 int log_ret;
27 int log_transid;
28 struct list_head list;
29};
30
31static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
32{
33 ctx->log_ret = 0;
34 ctx->log_transid = 0;
35 INIT_LIST_HEAD(&ctx->list);
36}
37
25int btrfs_sync_log(struct btrfs_trans_handle *trans, 38int btrfs_sync_log(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root); 39 struct btrfs_root *root, struct btrfs_log_ctx *ctx);
27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 40int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
28int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 41int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
29 struct btrfs_fs_info *fs_info); 42 struct btrfs_fs_info *fs_info);
30int btrfs_recover_log_trees(struct btrfs_root *tree_root); 43int btrfs_recover_log_trees(struct btrfs_root *tree_root);
31int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 44int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
32 struct btrfs_root *root, struct dentry *dentry); 45 struct btrfs_root *root, struct dentry *dentry,
46 struct btrfs_log_ctx *ctx);
33int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 47int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
34 struct btrfs_root *root, 48 struct btrfs_root *root,
35 const char *name, int name_len, 49 const char *name, int name_len,
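The tree-log.h hunk above is the contract for the new wait machinery: each fsync registers a btrfs_log_ctx on the root's per-transid list, and whichever task performs the log commit writes the outcome into every registered ctx. The caller side (btrfs_sync_file() in file.c, adapted elsewhere in this series) looks roughly like:

	struct btrfs_log_ctx ctx;
	int ret;

	btrfs_init_log_ctx(&ctx);

	/* Registers ctx on root->log_ctxs[log_transid % 2]. */
	ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
	if (ret == 0) {
		/*
		 * Either commits that log transid itself, or waits for
		 * the task that does and returns its result from
		 * ctx.log_ret.
		 */
		ret = btrfs_sync_log(trans, root, &ctx);
	}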
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bab0b84d8f80..d241130a32fd 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -415,7 +415,8 @@ loop_lock:
415 device->running_pending = 1; 415 device->running_pending = 1;
416 416
417 spin_unlock(&device->io_lock); 417 spin_unlock(&device->io_lock);
418 btrfs_requeue_work(&device->work); 418 btrfs_queue_work(fs_info->submit_workers,
419 &device->work);
419 goto done; 420 goto done;
420 } 421 }
421 /* unplug every 64 requests just for good measure */ 422 /* unplug every 64 requests just for good measure */
@@ -5263,6 +5264,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
5263static void btrfs_end_bio(struct bio *bio, int err) 5264static void btrfs_end_bio(struct bio *bio, int err)
5264{ 5265{
5265 struct btrfs_bio *bbio = bio->bi_private; 5266 struct btrfs_bio *bbio = bio->bi_private;
5267 struct btrfs_device *dev = bbio->stripes[0].dev;
5266 int is_orig_bio = 0; 5268 int is_orig_bio = 0;
5267 5269
5268 if (err) { 5270 if (err) {
@@ -5270,7 +5272,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
5270 if (err == -EIO || err == -EREMOTEIO) { 5272 if (err == -EIO || err == -EREMOTEIO) {
5271 unsigned int stripe_index = 5273 unsigned int stripe_index =
5272 btrfs_io_bio(bio)->stripe_index; 5274 btrfs_io_bio(bio)->stripe_index;
5273 struct btrfs_device *dev;
5274 5275
5275 BUG_ON(stripe_index >= bbio->num_stripes); 5276 BUG_ON(stripe_index >= bbio->num_stripes);
5276 dev = bbio->stripes[stripe_index].dev; 5277 dev = bbio->stripes[stripe_index].dev;
@@ -5292,6 +5293,8 @@ static void btrfs_end_bio(struct bio *bio, int err)
5292 if (bio == bbio->orig_bio) 5293 if (bio == bbio->orig_bio)
5293 is_orig_bio = 1; 5294 is_orig_bio = 1;
5294 5295
5296 btrfs_bio_counter_dec(bbio->fs_info);
5297
5295 if (atomic_dec_and_test(&bbio->stripes_pending)) { 5298 if (atomic_dec_and_test(&bbio->stripes_pending)) {
5296 if (!is_orig_bio) { 5299 if (!is_orig_bio) {
5297 bio_put(bio); 5300 bio_put(bio);
@@ -5328,13 +5331,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
5328 } 5331 }
5329} 5332}
5330 5333
5331struct async_sched {
5332 struct bio *bio;
5333 int rw;
5334 struct btrfs_fs_info *info;
5335 struct btrfs_work work;
5336};
5337
5338/* 5334/*
5339 * see run_scheduled_bios for a description of why bios are collected for 5335 * see run_scheduled_bios for a description of why bios are collected for
5340 * async submit. 5336 * async submit.
@@ -5391,8 +5387,8 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
5391 spin_unlock(&device->io_lock); 5387 spin_unlock(&device->io_lock);
5392 5388
5393 if (should_queue) 5389 if (should_queue)
5394 btrfs_queue_worker(&root->fs_info->submit_workers, 5390 btrfs_queue_work(root->fs_info->submit_workers,
5395 &device->work); 5391 &device->work);
5396} 5392}
5397 5393
5398static int bio_size_ok(struct block_device *bdev, struct bio *bio, 5394static int bio_size_ok(struct block_device *bdev, struct bio *bio,
@@ -5447,6 +5443,9 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
5447 } 5443 }
5448#endif 5444#endif
5449 bio->bi_bdev = dev->bdev; 5445 bio->bi_bdev = dev->bdev;
5446
5447 btrfs_bio_counter_inc_noblocked(root->fs_info);
5448
5450 if (async) 5449 if (async)
5451 btrfs_schedule_bio(root, dev, rw, bio); 5450 btrfs_schedule_bio(root, dev, rw, bio);
5452 else 5451 else
@@ -5515,28 +5514,38 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5515 length = bio->bi_iter.bi_size; 5514 length = bio->bi_iter.bi_size;
5516 map_length = length; 5515 map_length = length;
5517 5516
5517 btrfs_bio_counter_inc_blocked(root->fs_info);
5518 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 5518 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
5519 mirror_num, &raid_map); 5519 mirror_num, &raid_map);
5520 if (ret) /* -ENOMEM */ 5520 if (ret) {
5521 btrfs_bio_counter_dec(root->fs_info);
5521 return ret; 5522 return ret;
5523 }
5522 5524
5523 total_devs = bbio->num_stripes; 5525 total_devs = bbio->num_stripes;
5524 bbio->orig_bio = first_bio; 5526 bbio->orig_bio = first_bio;
5525 bbio->private = first_bio->bi_private; 5527 bbio->private = first_bio->bi_private;
5526 bbio->end_io = first_bio->bi_end_io; 5528 bbio->end_io = first_bio->bi_end_io;
5529 bbio->fs_info = root->fs_info;
5527 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 5530 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
5528 5531
5529 if (raid_map) { 5532 if (raid_map) {
5530 /* In this case, map_length has been set to the length of 5533 /* In this case, map_length has been set to the length of
5531 a single stripe; not the whole write */ 5534 a single stripe; not the whole write */
5532 if (rw & WRITE) { 5535 if (rw & WRITE) {
5533 return raid56_parity_write(root, bio, bbio, 5536 ret = raid56_parity_write(root, bio, bbio,
5534 raid_map, map_length); 5537 raid_map, map_length);
5535 } else { 5538 } else {
5536 return raid56_parity_recover(root, bio, bbio, 5539 ret = raid56_parity_recover(root, bio, bbio,
5537 raid_map, map_length, 5540 raid_map, map_length,
5538 mirror_num); 5541 mirror_num);
5539 } 5542 }
5543 /*
 5544 * FIXME, replace doesn't support raid56 yet, please fix
5545 * it in the future.
5546 */
5547 btrfs_bio_counter_dec(root->fs_info);
5548 return ret;
5540 } 5549 }
5541 5550
5542 if (map_length < length) { 5551 if (map_length < length) {
@@ -5578,6 +5587,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5578 async_submit); 5587 async_submit);
5579 dev_nr++; 5588 dev_nr++;
5580 } 5589 }
5590 btrfs_bio_counter_dec(root->fs_info);
5581 return 0; 5591 return 0;
5582} 5592}
5583 5593
@@ -5666,7 +5676,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
5666 else 5676 else
5667 generate_random_uuid(dev->uuid); 5677 generate_random_uuid(dev->uuid);
5668 5678
5669 dev->work.func = pending_bios_fn; 5679 btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL);
5670 5680
5671 return dev; 5681 return dev;
5672} 5682}
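The volumes.c/volumes.h hunks above thread a per-filesystem bio counter through the mapping path so that dev-replace can wait for in-flight bios to drain. A lifetime sketch (the inc/dec helpers themselves are added elsewhere in this series):

	/* btrfs_map_bio(): one blocked count for the mapping itself. */
	btrfs_bio_counter_inc_blocked(root->fs_info);
	ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length,
				&bbio, mirror_num, &raid_map);
	if (ret) {
		btrfs_bio_counter_dec(root->fs_info);	/* error: undo */
		return ret;
	}
	bbio->fs_info = root->fs_info;	/* end_io needs it back */

	/* submit_stripe_bio(): one extra count per stripe bio queued. */
	btrfs_bio_counter_inc_noblocked(root->fs_info);

	/* btrfs_end_bio(): each completion drops one count. */
	btrfs_bio_counter_dec(bbio->fs_info);

	/* btrfs_map_bio(): drop its own count once all stripes are queued. */
	btrfs_bio_counter_dec(root->fs_info);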
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 8b3cd142b373..80754f9dd3df 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -192,6 +192,7 @@ typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
192 192
193struct btrfs_bio { 193struct btrfs_bio {
194 atomic_t stripes_pending; 194 atomic_t stripes_pending;
195 struct btrfs_fs_info *fs_info;
195 bio_end_io_t *end_io; 196 bio_end_io_t *end_io;
196 struct bio *orig_bio; 197 struct bio *orig_bio;
197 void *private; 198 void *private;