path: root/fs/btrfs
Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/async-thread.c   |  850
-rw-r--r--  fs/btrfs/async-thread.h   |  121
-rw-r--r--  fs/btrfs/backref.c        |  117
-rw-r--r--  fs/btrfs/btrfs_inode.h    |   14
-rw-r--r--  fs/btrfs/compression.c    |    2
-rw-r--r--  fs/btrfs/ctree.c          |  105
-rw-r--r--  fs/btrfs/ctree.h          |   86
-rw-r--r--  fs/btrfs/delayed-inode.c  |    6
-rw-r--r--  fs/btrfs/delayed-ref.c    |   29
-rw-r--r--  fs/btrfs/dev-replace.c    |   79
-rw-r--r--  fs/btrfs/disk-io.c        |  304
-rw-r--r--  fs/btrfs/extent-tree.c    |   93
-rw-r--r--  fs/btrfs/extent_io.c      |   23
-rw-r--r--  fs/btrfs/extent_io.h      |    1
-rw-r--r--  fs/btrfs/extent_map.c     |   56
-rw-r--r--  fs/btrfs/extent_map.h     |   10
-rw-r--r--  fs/btrfs/file.c           |  182
-rw-r--r--  fs/btrfs/inode-map.c      |   14
-rw-r--r--  fs/btrfs/inode.c          |  151
-rw-r--r--  fs/btrfs/ioctl.c          |  239
-rw-r--r--  fs/btrfs/ordered-data.c   |   68
-rw-r--r--  fs/btrfs/ordered-data.h   |    6
-rw-r--r--  fs/btrfs/qgroup.c         |   15
-rw-r--r--  fs/btrfs/raid56.c         |   21
-rw-r--r--  fs/btrfs/reada.c          |    4
-rw-r--r--  fs/btrfs/relocation.c     |   23
-rw-r--r--  fs/btrfs/root-tree.c      |    3
-rw-r--r--  fs/btrfs/scrub.c          |  205
-rw-r--r--  fs/btrfs/send.c           |  872
-rw-r--r--  fs/btrfs/super.c          |   60
-rw-r--r--  fs/btrfs/sysfs.c          |   33
-rw-r--r--  fs/btrfs/sysfs.h          |    5
-rw-r--r--  fs/btrfs/transaction.c    |   87
-rw-r--r--  fs/btrfs/transaction.h    |    3
-rw-r--r--  fs/btrfs/tree-log.c       |  236
-rw-r--r--  fs/btrfs/tree-log.h       |   18
-rw-r--r--  fs/btrfs/volumes.c        |   81
-rw-r--r--  fs/btrfs/volumes.h        |    1
38 files changed, 2344 insertions(+), 1879 deletions(-)
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c1e0b0caf9cc..5a201d81049c 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2007 Oracle.  All rights reserved.
+ * Copyright (C) 2014 Fujitsu.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
@@ -21,708 +22,315 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/freezer.h>
+#include <linux/workqueue.h>
 #include "async-thread.h"
+#include "ctree.h"
+
+#define WORK_DONE_BIT 0
+#define WORK_ORDER_DONE_BIT 1
+#define WORK_HIGH_PRIO_BIT 2
+
+#define NO_THRESHOLD (-1)
+#define DFT_THRESHOLD (32)
+
+struct __btrfs_workqueue {
+    struct workqueue_struct *normal_wq;
+    /* List head pointing to ordered work list */
+    struct list_head ordered_list;
+
+    /* Spinlock for ordered_list */
+    spinlock_t list_lock;
+
+    /* Thresholding related variants */
+    atomic_t pending;
+    int max_active;
+    int current_max;
+    int thresh;
+    unsigned int count;
+    spinlock_t thres_lock;
+};
 
-#define WORK_QUEUED_BIT 0
-#define WORK_DONE_BIT 1
-#define WORK_ORDER_DONE_BIT 2
-#define WORK_HIGH_PRIO_BIT 3
-
-/*
- * container for the kthread task pointer and the list of pending work
- * One of these is allocated per thread.
- */
-struct btrfs_worker_thread {
-    /* pool we belong to */
-    struct btrfs_workers *workers;
-
-    /* list of struct btrfs_work that are waiting for service */
-    struct list_head pending;
-    struct list_head prio_pending;
-
-    /* list of worker threads from struct btrfs_workers */
-    struct list_head worker_list;
-
-    /* kthread */
-    struct task_struct *task;
+struct btrfs_workqueue {
+    struct __btrfs_workqueue *normal;
+    struct __btrfs_workqueue *high;
+};
 
-    /* number of things on the pending list */
-    atomic_t num_pending;
+static inline struct __btrfs_workqueue
+*__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
+                         int thresh)
+{
+    struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
 
-    /* reference counter for this struct */
-    atomic_t refs;
+    if (unlikely(!ret))
+        return NULL;
 
-    unsigned long sequence;
+    ret->max_active = max_active;
+    atomic_set(&ret->pending, 0);
+    if (thresh == 0)
+        thresh = DFT_THRESHOLD;
+    /* For low threshold, disabling threshold is a better choice */
+    if (thresh < DFT_THRESHOLD) {
+        ret->current_max = max_active;
+        ret->thresh = NO_THRESHOLD;
+    } else {
+        ret->current_max = 1;
+        ret->thresh = thresh;
+    }
 
-    /* protects the pending list. */
-    spinlock_t lock;
+    if (flags & WQ_HIGHPRI)
+        ret->normal_wq = alloc_workqueue("%s-%s-high", flags,
+                                         ret->max_active,
+                                         "btrfs", name);
+    else
+        ret->normal_wq = alloc_workqueue("%s-%s", flags,
+                                         ret->max_active, "btrfs",
+                                         name);
+    if (unlikely(!ret->normal_wq)) {
+        kfree(ret);
+        return NULL;
+    }
 
-    /* set to non-zero when this thread is already awake and kicking */
-    int working;
+    INIT_LIST_HEAD(&ret->ordered_list);
+    spin_lock_init(&ret->list_lock);
+    spin_lock_init(&ret->thres_lock);
+    trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI);
+    return ret;
+}
 
-    /* are we currently idle */
-    int idle;
-};
+static inline void
+__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
 
-static int __btrfs_start_workers(struct btrfs_workers *workers);
+struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
+                                              int flags,
+                                              int max_active,
+                                              int thresh)
+{
+    struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
 
-/*
- * btrfs_start_workers uses kthread_run, which can block waiting for memory
- * for a very long time. It will actually throttle on page writeback,
- * and so it may not make progress until after our btrfs worker threads
- * process all of the pending work structs in their queue
- *
- * This means we can't use btrfs_start_workers from inside a btrfs worker
- * thread that is used as part of cleaning dirty memory, which pretty much
- * involves all of the worker threads.
- *
- * Instead we have a helper queue who never has more than one thread
- * where we scheduler thread start operations. This worker_start struct
- * is used to contain the work and hold a pointer to the queue that needs
- * another worker.
- */
-struct worker_start {
-    struct btrfs_work work;
-    struct btrfs_workers *queue;
-};
+    if (unlikely(!ret))
+        return NULL;
 
-static void start_new_worker_func(struct btrfs_work *work)
-{
-    struct worker_start *start;
-    start = container_of(work, struct worker_start, work);
-    __btrfs_start_workers(start->queue);
-    kfree(start);
-}
+    ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
+                                          max_active, thresh);
+    if (unlikely(!ret->normal)) {
+        kfree(ret);
+        return NULL;
+    }
 
-/*
- * helper function to move a thread onto the idle list after it
- * has finished some requests.
- */
-static void check_idle_worker(struct btrfs_worker_thread *worker)
-{
-    if (!worker->idle && atomic_read(&worker->num_pending) <
-        worker->workers->idle_thresh / 2) {
-        unsigned long flags;
-        spin_lock_irqsave(&worker->workers->lock, flags);
-        worker->idle = 1;
-
-        /* the list may be empty if the worker is just starting */
-        if (!list_empty(&worker->worker_list) &&
-            !worker->workers->stopping) {
-            list_move(&worker->worker_list,
-                      &worker->workers->idle_list);
+    if (flags & WQ_HIGHPRI) {
+        ret->high = __btrfs_alloc_workqueue(name, flags, max_active,
+                                            thresh);
+        if (unlikely(!ret->high)) {
+            __btrfs_destroy_workqueue(ret->normal);
+            kfree(ret);
+            return NULL;
         }
-        spin_unlock_irqrestore(&worker->workers->lock, flags);
     }
+    return ret;
 }
 
 /*
- * helper function to move a thread off the idle list after new
- * pending work is added.
+ * Hook for threshold which will be called in btrfs_queue_work.
+ * This hook WILL be called in IRQ handler context,
+ * so workqueue_set_max_active MUST NOT be called in this hook
  */
-static void check_busy_worker(struct btrfs_worker_thread *worker)
+static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
 {
-    if (worker->idle && atomic_read(&worker->num_pending) >=
-        worker->workers->idle_thresh) {
-        unsigned long flags;
-        spin_lock_irqsave(&worker->workers->lock, flags);
-        worker->idle = 0;
-
-        if (!list_empty(&worker->worker_list) &&
-            !worker->workers->stopping) {
-            list_move_tail(&worker->worker_list,
-                           &worker->workers->worker_list);
-        }
-        spin_unlock_irqrestore(&worker->workers->lock, flags);
-    }
+    if (wq->thresh == NO_THRESHOLD)
+        return;
+    atomic_inc(&wq->pending);
 }
 
-static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
+/*
+ * Hook for threshold which will be called before executing the work,
+ * This hook is called in kthread content.
+ * So workqueue_set_max_active is called here.
+ */
+static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
 {
-    struct btrfs_workers *workers = worker->workers;
-    struct worker_start *start;
-    unsigned long flags;
-
-    rmb();
-    if (!workers->atomic_start_pending)
-        return;
+    int new_max_active;
+    long pending;
+    int need_change = 0;
 
-    start = kzalloc(sizeof(*start), GFP_NOFS);
-    if (!start)
+    if (wq->thresh == NO_THRESHOLD)
         return;
 
-    start->work.func = start_new_worker_func;
-    start->queue = workers;
-
-    spin_lock_irqsave(&workers->lock, flags);
-    if (!workers->atomic_start_pending)
-        goto out;
-
-    workers->atomic_start_pending = 0;
-    if (workers->num_workers + workers->num_workers_starting >=
-        workers->max_workers)
-        goto out;
-
-    workers->num_workers_starting += 1;
-    spin_unlock_irqrestore(&workers->lock, flags);
-    btrfs_queue_worker(workers->atomic_worker_start, &start->work);
-    return;
+    atomic_dec(&wq->pending);
+    spin_lock(&wq->thres_lock);
+    /*
+     * Use wq->count to limit the calling frequency of
+     * workqueue_set_max_active.
+     */
+    wq->count++;
+    wq->count %= (wq->thresh / 4);
+    if (!wq->count)
+        goto out;
+    new_max_active = wq->current_max;
 
+    /*
+     * pending may be changed later, but it's OK since we really
+     * don't need it so accurate to calculate new_max_active.
+     */
+    pending = atomic_read(&wq->pending);
+    if (pending > wq->thresh)
+        new_max_active++;
+    if (pending < wq->thresh / 2)
+        new_max_active--;
+    new_max_active = clamp_val(new_max_active, 1, wq->max_active);
+    if (new_max_active != wq->current_max) {
+        need_change = 1;
+        wq->current_max = new_max_active;
+    }
 out:
-    kfree(start);
-    spin_unlock_irqrestore(&workers->lock, flags);
+    spin_unlock(&wq->thres_lock);
+
+    if (need_change) {
+        workqueue_set_max_active(wq->normal_wq, wq->current_max);
+    }
 }
 
-static noinline void run_ordered_completions(struct btrfs_workers *workers,
-                                             struct btrfs_work *work)
+static void run_ordered_work(struct __btrfs_workqueue *wq)
 {
-    if (!workers->ordered)
-        return;
-
-    set_bit(WORK_DONE_BIT, &work->flags);
-
-    spin_lock(&workers->order_lock);
+    struct list_head *list = &wq->ordered_list;
+    struct btrfs_work *work;
+    spinlock_t *lock = &wq->list_lock;
+    unsigned long flags;
 
     while (1) {
-        if (!list_empty(&workers->prio_order_list)) {
-            work = list_entry(workers->prio_order_list.next,
-                              struct btrfs_work, order_list);
-        } else if (!list_empty(&workers->order_list)) {
-            work = list_entry(workers->order_list.next,
-                              struct btrfs_work, order_list);
-        } else {
+        spin_lock_irqsave(lock, flags);
+        if (list_empty(list))
             break;
-        }
+        work = list_entry(list->next, struct btrfs_work,
+                          ordered_list);
         if (!test_bit(WORK_DONE_BIT, &work->flags))
             break;
 
-        /* we are going to call the ordered done function, but
+        /*
+         * we are going to call the ordered done function, but
          * we leave the work item on the list as a barrier so
          * that later work items that are done don't have their
          * functions called before this one returns
          */
         if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
             break;
-
-        spin_unlock(&workers->order_lock);
-
+        trace_btrfs_ordered_sched(work);
+        spin_unlock_irqrestore(lock, flags);
         work->ordered_func(work);
 
         /* now take the lock again and drop our item from the list */
-        spin_lock(&workers->order_lock);
-        list_del(&work->order_list);
-        spin_unlock(&workers->order_lock);
+        spin_lock_irqsave(lock, flags);
+        list_del(&work->ordered_list);
+        spin_unlock_irqrestore(lock, flags);
 
         /*
          * we don't want to call the ordered free functions
          * with the lock held though
          */
         work->ordered_free(work);
-        spin_lock(&workers->order_lock);
-    }
-
-    spin_unlock(&workers->order_lock);
-}
-
-static void put_worker(struct btrfs_worker_thread *worker)
-{
-    if (atomic_dec_and_test(&worker->refs))
-        kfree(worker);
-}
-
-static int try_worker_shutdown(struct btrfs_worker_thread *worker)
-{
-    int freeit = 0;
-
-    spin_lock_irq(&worker->lock);
-    spin_lock(&worker->workers->lock);
-    if (worker->workers->num_workers > 1 &&
-        worker->idle &&
-        !worker->working &&
-        !list_empty(&worker->worker_list) &&
-        list_empty(&worker->prio_pending) &&
-        list_empty(&worker->pending) &&
-        atomic_read(&worker->num_pending) == 0) {
-        freeit = 1;
-        list_del_init(&worker->worker_list);
-        worker->workers->num_workers--;
+        trace_btrfs_all_work_done(work);
     }
-    spin_unlock(&worker->workers->lock);
-    spin_unlock_irq(&worker->lock);
-
-    if (freeit)
-        put_worker(worker);
-    return freeit;
+    spin_unlock_irqrestore(lock, flags);
 }
 
-static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker,
-                                        struct list_head *prio_head,
-                                        struct list_head *head)
+static void normal_work_helper(struct work_struct *arg)
 {
-    struct btrfs_work *work = NULL;
-    struct list_head *cur = NULL;
-
-    if (!list_empty(prio_head))
-        cur = prio_head->next;
-
-    smp_mb();
-    if (!list_empty(&worker->prio_pending))
-        goto refill;
-
-    if (!list_empty(head))
-        cur = head->next;
-
-    if (cur)
-        goto out;
-
-refill:
-    spin_lock_irq(&worker->lock);
-    list_splice_tail_init(&worker->prio_pending, prio_head);
-    list_splice_tail_init(&worker->pending, head);
-
-    if (!list_empty(prio_head))
-        cur = prio_head->next;
-    else if (!list_empty(head))
-        cur = head->next;
-    spin_unlock_irq(&worker->lock);
-
-    if (!cur)
-        goto out_fail;
-
-out:
-    work = list_entry(cur, struct btrfs_work, list);
-
-out_fail:
-    return work;
-}
-
-/*
- * main loop for servicing work items
- */
-static int worker_loop(void *arg)
-{
-    struct btrfs_worker_thread *worker = arg;
-    struct list_head head;
-    struct list_head prio_head;
     struct btrfs_work *work;
+    struct __btrfs_workqueue *wq;
+    int need_order = 0;
 
-    INIT_LIST_HEAD(&head);
-    INIT_LIST_HEAD(&prio_head);
-
-    do {
-again:
-        while (1) {
-
-            work = get_next_work(worker, &prio_head, &head);
-            if (!work)
-                break;
-
-            list_del(&work->list);
-            clear_bit(WORK_QUEUED_BIT, &work->flags);
-
-            work->worker = worker;
-
-            work->func(work);
-
-            atomic_dec(&worker->num_pending);
-            /*
-             * unless this is an ordered work queue,
-             * 'work' was probably freed by func above.
-             */
-            run_ordered_completions(worker->workers, work);
-
-            check_pending_worker_creates(worker);
-            cond_resched();
-        }
-
-        spin_lock_irq(&worker->lock);
-        check_idle_worker(worker);
-
-        if (freezing(current)) {
-            worker->working = 0;
-            spin_unlock_irq(&worker->lock);
-            try_to_freeze();
-        } else {
-            spin_unlock_irq(&worker->lock);
-            if (!kthread_should_stop()) {
-                cpu_relax();
-                /*
-                 * we've dropped the lock, did someone else
-                 * jump_in?
-                 */
-                smp_mb();
-                if (!list_empty(&worker->pending) ||
-                    !list_empty(&worker->prio_pending))
-                    continue;
-
-                /*
-                 * this short schedule allows more work to
-                 * come in without the queue functions
-                 * needing to go through wake_up_process()
-                 *
-                 * worker->working is still 1, so nobody
-                 * is going to try and wake us up
-                 */
-                schedule_timeout(1);
-                smp_mb();
-                if (!list_empty(&worker->pending) ||
-                    !list_empty(&worker->prio_pending))
-                    continue;
-
-                if (kthread_should_stop())
-                    break;
-
-                /* still no more work?, sleep for real */
-                spin_lock_irq(&worker->lock);
-                set_current_state(TASK_INTERRUPTIBLE);
-                if (!list_empty(&worker->pending) ||
-                    !list_empty(&worker->prio_pending)) {
-                    spin_unlock_irq(&worker->lock);
-                    set_current_state(TASK_RUNNING);
-                    goto again;
-                }
-
-                /*
-                 * this makes sure we get a wakeup when someone
-                 * adds something new to the queue
-                 */
-                worker->working = 0;
-                spin_unlock_irq(&worker->lock);
-
-                if (!kthread_should_stop()) {
-                    schedule_timeout(HZ * 120);
-                    if (!worker->working &&
-                        try_worker_shutdown(worker)) {
-                        return 0;
-                    }
-                }
-            }
-            __set_current_state(TASK_RUNNING);
-        }
-    } while (!kthread_should_stop());
-    return 0;
-}
-
-/*
- * this will wait for all the worker threads to shutdown
- */
-void btrfs_stop_workers(struct btrfs_workers *workers)
-{
-    struct list_head *cur;
-    struct btrfs_worker_thread *worker;
-    int can_stop;
-
-    spin_lock_irq(&workers->lock);
-    workers->stopping = 1;
-    list_splice_init(&workers->idle_list, &workers->worker_list);
-    while (!list_empty(&workers->worker_list)) {
-        cur = workers->worker_list.next;
-        worker = list_entry(cur, struct btrfs_worker_thread,
-                            worker_list);
-
-        atomic_inc(&worker->refs);
-        workers->num_workers -= 1;
-        if (!list_empty(&worker->worker_list)) {
-            list_del_init(&worker->worker_list);
-            put_worker(worker);
-            can_stop = 1;
-        } else
-            can_stop = 0;
-        spin_unlock_irq(&workers->lock);
-        if (can_stop)
-            kthread_stop(worker->task);
-        spin_lock_irq(&workers->lock);
-        put_worker(worker);
+    work = container_of(arg, struct btrfs_work, normal_work);
+    /*
+     * We should not touch things inside work in the following cases:
+     * 1) after work->func() if it has no ordered_free
+     *    Since the struct is freed in work->func().
+     * 2) after setting WORK_DONE_BIT
+     *    The work may be freed in other threads almost instantly.
+     * So we save the needed things here.
+     */
+    if (work->ordered_func)
+        need_order = 1;
+    wq = work->wq;
+
+    trace_btrfs_work_sched(work);
+    thresh_exec_hook(wq);
+    work->func(work);
+    if (need_order) {
+        set_bit(WORK_DONE_BIT, &work->flags);
+        run_ordered_work(wq);
     }
-    spin_unlock_irq(&workers->lock);
+    if (!need_order)
+        trace_btrfs_all_work_done(work);
 }
 
-/*
- * simple init on struct btrfs_workers
- */
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
-                        struct btrfs_workers *async_helper)
+void btrfs_init_work(struct btrfs_work *work,
+                     btrfs_func_t func,
+                     btrfs_func_t ordered_func,
+                     btrfs_func_t ordered_free)
 {
-    workers->num_workers = 0;
-    workers->num_workers_starting = 0;
-    INIT_LIST_HEAD(&workers->worker_list);
-    INIT_LIST_HEAD(&workers->idle_list);
-    INIT_LIST_HEAD(&workers->order_list);
-    INIT_LIST_HEAD(&workers->prio_order_list);
-    spin_lock_init(&workers->lock);
-    spin_lock_init(&workers->order_lock);
-    workers->max_workers = max;
-    workers->idle_thresh = 32;
-    workers->name = name;
-    workers->ordered = 0;
-    workers->atomic_start_pending = 0;
-    workers->atomic_worker_start = async_helper;
-    workers->stopping = 0;
+    work->func = func;
+    work->ordered_func = ordered_func;
+    work->ordered_free = ordered_free;
+    INIT_WORK(&work->normal_work, normal_work_helper);
+    INIT_LIST_HEAD(&work->ordered_list);
+    work->flags = 0;
 }
 
-/*
- * starts new worker threads. This does not enforce the max worker
- * count in case you need to temporarily go past it.
- */
-static int __btrfs_start_workers(struct btrfs_workers *workers)
+static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
+                                      struct btrfs_work *work)
 {
-    struct btrfs_worker_thread *worker;
-    int ret = 0;
-
-    worker = kzalloc(sizeof(*worker), GFP_NOFS);
-    if (!worker) {
-        ret = -ENOMEM;
-        goto fail;
-    }
-
-    INIT_LIST_HEAD(&worker->pending);
-    INIT_LIST_HEAD(&worker->prio_pending);
-    INIT_LIST_HEAD(&worker->worker_list);
-    spin_lock_init(&worker->lock);
-
-    atomic_set(&worker->num_pending, 0);
-    atomic_set(&worker->refs, 1);
-    worker->workers = workers;
-    worker->task = kthread_create(worker_loop, worker,
-                                  "btrfs-%s-%d", workers->name,
-                                  workers->num_workers + 1);
-    if (IS_ERR(worker->task)) {
-        ret = PTR_ERR(worker->task);
-        goto fail;
-    }
+    unsigned long flags;
 
-    spin_lock_irq(&workers->lock);
-    if (workers->stopping) {
-        spin_unlock_irq(&workers->lock);
-        ret = -EINVAL;
-        goto fail_kthread;
+    work->wq = wq;
+    thresh_queue_hook(wq);
+    if (work->ordered_func) {
+        spin_lock_irqsave(&wq->list_lock, flags);
+        list_add_tail(&work->ordered_list, &wq->ordered_list);
+        spin_unlock_irqrestore(&wq->list_lock, flags);
     }
-    list_add_tail(&worker->worker_list, &workers->idle_list);
-    worker->idle = 1;
-    workers->num_workers++;
-    workers->num_workers_starting--;
-    WARN_ON(workers->num_workers_starting < 0);
-    spin_unlock_irq(&workers->lock);
-
-    wake_up_process(worker->task);
-    return 0;
-
-fail_kthread:
-    kthread_stop(worker->task);
-fail:
-    kfree(worker);
-    spin_lock_irq(&workers->lock);
-    workers->num_workers_starting--;
-    spin_unlock_irq(&workers->lock);
-    return ret;
+    queue_work(wq->normal_wq, &work->normal_work);
+    trace_btrfs_work_queued(work);
 }
 
-int btrfs_start_workers(struct btrfs_workers *workers)
-{
-    spin_lock_irq(&workers->lock);
-    workers->num_workers_starting++;
-    spin_unlock_irq(&workers->lock);
-    return __btrfs_start_workers(workers);
-}
-
-/*
- * run through the list and find a worker thread that doesn't have a lot
- * to do right now. This can return null if we aren't yet at the thread
- * count limit and all of the threads are busy.
- */
-static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
+void btrfs_queue_work(struct btrfs_workqueue *wq,
+                      struct btrfs_work *work)
 {
-    struct btrfs_worker_thread *worker;
-    struct list_head *next;
-    int enforce_min;
+    struct __btrfs_workqueue *dest_wq;
 
-    enforce_min = (workers->num_workers + workers->num_workers_starting) <
-        workers->max_workers;
-
-    /*
-     * if we find an idle thread, don't move it to the end of the
-     * idle list. This improves the chance that the next submission
-     * will reuse the same thread, and maybe catch it while it is still
-     * working
-     */
-    if (!list_empty(&workers->idle_list)) {
-        next = workers->idle_list.next;
-        worker = list_entry(next, struct btrfs_worker_thread,
-                            worker_list);
-        return worker;
-    }
-    if (enforce_min || list_empty(&workers->worker_list))
-        return NULL;
-
-    /*
-     * if we pick a busy task, move the task to the end of the list.
-     * hopefully this will keep things somewhat evenly balanced.
-     * Do the move in batches based on the sequence number. This groups
-     * requests submitted at roughly the same time onto the same worker.
-     */
-    next = workers->worker_list.next;
-    worker = list_entry(next, struct btrfs_worker_thread, worker_list);
-    worker->sequence++;
-
-    if (worker->sequence % workers->idle_thresh == 0)
-        list_move_tail(next, &workers->worker_list);
-    return worker;
+    if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high)
+        dest_wq = wq->high;
+    else
+        dest_wq = wq->normal;
+    __btrfs_queue_work(dest_wq, work);
 }
 
-/*
- * selects a worker thread to take the next job. This will either find
- * an idle worker, start a new worker up to the max count, or just return
- * one of the existing busy workers.
- */
-static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
+static inline void
+__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq)
 {
-    struct btrfs_worker_thread *worker;
-    unsigned long flags;
-    struct list_head *fallback;
-    int ret;
-
-    spin_lock_irqsave(&workers->lock, flags);
-again:
-    worker = next_worker(workers);
-
-    if (!worker) {
-        if (workers->num_workers + workers->num_workers_starting >=
-            workers->max_workers) {
-            goto fallback;
-        } else if (workers->atomic_worker_start) {
-            workers->atomic_start_pending = 1;
-            goto fallback;
-        } else {
-            workers->num_workers_starting++;
-            spin_unlock_irqrestore(&workers->lock, flags);
-            /* we're below the limit, start another worker */
-            ret = __btrfs_start_workers(workers);
-            spin_lock_irqsave(&workers->lock, flags);
-            if (ret)
-                goto fallback;
-            goto again;
-        }
-    }
-    goto found;
-
-fallback:
-    fallback = NULL;
-    /*
-     * we have failed to find any workers, just
-     * return the first one we can find.
-     */
-    if (!list_empty(&workers->worker_list))
-        fallback = workers->worker_list.next;
-    if (!list_empty(&workers->idle_list))
-        fallback = workers->idle_list.next;
-    BUG_ON(!fallback);
-    worker = list_entry(fallback,
-                        struct btrfs_worker_thread, worker_list);
-found:
-    /*
-     * this makes sure the worker doesn't exit before it is placed
-     * onto a busy/idle list
-     */
-    atomic_inc(&worker->num_pending);
-    spin_unlock_irqrestore(&workers->lock, flags);
-    return worker;
+    destroy_workqueue(wq->normal_wq);
+    trace_btrfs_workqueue_destroy(wq);
+    kfree(wq);
 }
 
-/*
- * btrfs_requeue_work just puts the work item back on the tail of the list
- * it was taken from. It is intended for use with long running work functions
- * that make some progress and want to give the cpu up for others.
- */
-void btrfs_requeue_work(struct btrfs_work *work)
+void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
 {
-    struct btrfs_worker_thread *worker = work->worker;
-    unsigned long flags;
-    int wake = 0;
-
-    if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
+    if (!wq)
         return;
-
-    spin_lock_irqsave(&worker->lock, flags);
-    if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
-        list_add_tail(&work->list, &worker->prio_pending);
-    else
-        list_add_tail(&work->list, &worker->pending);
-    atomic_inc(&worker->num_pending);
-
-    /* by definition we're busy, take ourselves off the idle
-     * list
-     */
-    if (worker->idle) {
-        spin_lock(&worker->workers->lock);
-        worker->idle = 0;
-        list_move_tail(&worker->worker_list,
-                       &worker->workers->worker_list);
-        spin_unlock(&worker->workers->lock);
-    }
-    if (!worker->working) {
-        wake = 1;
-        worker->working = 1;
-    }
-
-    if (wake)
-        wake_up_process(worker->task);
-    spin_unlock_irqrestore(&worker->lock, flags);
+    if (wq->high)
+        __btrfs_destroy_workqueue(wq->high);
+    __btrfs_destroy_workqueue(wq->normal);
+    kfree(wq);
 }
 
-void btrfs_set_work_high_prio(struct btrfs_work *work)
+void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max)
 {
-    set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+    if (!wq)
+        return;
+    wq->normal->max_active = max;
+    if (wq->high)
+        wq->high->max_active = max;
 }
 
-/*
- * places a struct btrfs_work into the pending queue of one of the kthreads
- */
-void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
+void btrfs_set_work_high_priority(struct btrfs_work *work)
 {
-    struct btrfs_worker_thread *worker;
-    unsigned long flags;
-    int wake = 0;
-
-    /* don't requeue something already on a list */
-    if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
-        return;
-
-    worker = find_worker(workers);
-    if (workers->ordered) {
-        /*
-         * you're not allowed to do ordered queues from an
-         * interrupt handler
-         */
-        spin_lock(&workers->order_lock);
-        if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
-            list_add_tail(&work->order_list,
-                          &workers->prio_order_list);
-        } else {
-            list_add_tail(&work->order_list, &workers->order_list);
-        }
-        spin_unlock(&workers->order_lock);
-    } else {
-        INIT_LIST_HEAD(&work->order_list);
-    }
-
-    spin_lock_irqsave(&worker->lock, flags);
-
-    if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
-        list_add_tail(&work->list, &worker->prio_pending);
-    else
-        list_add_tail(&work->list, &worker->pending);
-    check_busy_worker(worker);
-
-    /*
-     * avoid calling into wake_up_process if this thread has already
-     * been kicked
-     */
-    if (!worker->working)
-        wake = 1;
-    worker->working = 1;
-
-    if (wake)
-        wake_up_process(worker->task);
-    spin_unlock_irqrestore(&worker->lock, flags);
+    set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
 }
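
The conversion above reduces the hand-rolled kthread pool to a thin shim over the kernel's
generic workqueues. A minimal caller-side sketch of the new API follows; the my_work struct
and my_* functions are hypothetical, made up for illustration, while the btrfs_* calls and
their signatures are exactly the ones introduced by this diff:

    /* Sketch: embed a btrfs_work in your own struct, as btrfs callers do. */
    struct my_work {
            struct btrfs_work work;   /* recovered below via container_of() */
            u64 payload;              /* hypothetical per-item data */
    };

    static void my_func(struct btrfs_work *work)
    {
            struct my_work *w = container_of(work, struct my_work, work);
            /* unordered part: may run concurrently with other items */
    }

    static void my_ordered_func(struct btrfs_work *work)
    {
            /* runs strictly in queueing order, after my_func() finished */
    }

    static void my_ordered_free(struct btrfs_work *work)
    {
            kfree(container_of(work, struct my_work, work));
    }

    static int my_submit(struct btrfs_workqueue *wq)
    {
            struct my_work *w = kzalloc(sizeof(*w), GFP_NOFS);

            if (!w)
                    return -ENOMEM;
            btrfs_init_work(&w->work, my_func, my_ordered_func,
                            my_ordered_free);
            btrfs_queue_work(wq, &w->work);
            return 0;
    }

Passing NULL for ordered_func and ordered_free keeps the item off the ordered_list entirely,
giving plain unordered execution. At btrfs_alloc_workqueue() time, thresh == 0 falls back to
DFT_THRESHOLD (32), and any thresh below that disables the max_active auto-tuning altogether.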
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 1f26792683ed..9c6b66d15fb0 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2007 Oracle.  All rights reserved.
+ * Copyright (C) 2014 Fujitsu.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
@@ -19,103 +20,35 @@
 #ifndef __BTRFS_ASYNC_THREAD_
 #define __BTRFS_ASYNC_THREAD_
 
-struct btrfs_worker_thread;
+struct btrfs_workqueue;
+/* Internal use only */
+struct __btrfs_workqueue;
+struct btrfs_work;
+typedef void (*btrfs_func_t)(struct btrfs_work *arg);
 
-/*
- * This is similar to a workqueue, but it is meant to spread the operations
- * across all available cpus instead of just the CPU that was used to
- * queue the work. There is also some batching introduced to try and
- * cut down on context switches.
- *
- * By default threads are added on demand up to 2 * the number of cpus.
- * Changing struct btrfs_workers->max_workers is one way to prevent
- * demand creation of kthreads.
- *
- * the basic model of these worker threads is to embed a btrfs_work
- * structure in your own data struct, and use container_of in a
- * work function to get back to your data struct.
- */
 struct btrfs_work {
-    /*
-     * func should be set to the function you want called
-     * your work struct is passed as the only arg
-     *
-     * ordered_func must be set for work sent to an ordered work queue,
-     * and it is called to complete a given work item in the same
-     * order they were sent to the queue.
-     */
-    void (*func)(struct btrfs_work *work);
-    void (*ordered_func)(struct btrfs_work *work);
-    void (*ordered_free)(struct btrfs_work *work);
-
-    /*
-     * flags should be set to zero. It is used to make sure the
-     * struct is only inserted once into the list.
-     */
+    btrfs_func_t func;
+    btrfs_func_t ordered_func;
+    btrfs_func_t ordered_free;
+
+    /* Don't touch things below */
+    struct work_struct normal_work;
+    struct list_head ordered_list;
+    struct __btrfs_workqueue *wq;
     unsigned long flags;
-
-    /* don't touch these */
-    struct btrfs_worker_thread *worker;
-    struct list_head list;
-    struct list_head order_list;
-};
-
-struct btrfs_workers {
-    /* current number of running workers */
-    int num_workers;
-
-    int num_workers_starting;
-
-    /* max number of workers allowed. changed by btrfs_start_workers */
-    int max_workers;
-
-    /* once a worker has this many requests or fewer, it is idle */
-    int idle_thresh;
-
-    /* force completions in the order they were queued */
-    int ordered;
-
-    /* more workers required, but in an interrupt handler */
-    int atomic_start_pending;
-
-    /*
-     * are we allowed to sleep while starting workers or are we required
-     * to start them at a later time? If we can't sleep, this indicates
-     * which queue we need to use to schedule thread creation.
-     */
-    struct btrfs_workers *atomic_worker_start;
-
-    /* list with all the work threads. The workers on the idle thread
-     * may be actively servicing jobs, but they haven't yet hit the
-     * idle thresh limit above.
-     */
-    struct list_head worker_list;
-    struct list_head idle_list;
-
-    /*
-     * when operating in ordered mode, this maintains the list
-     * of work items waiting for completion
-     */
-    struct list_head order_list;
-    struct list_head prio_order_list;
-
-    /* lock for finding the next worker thread to queue on */
-    spinlock_t lock;
-
-    /* lock for the ordered lists */
-    spinlock_t order_lock;
-
-    /* extra name for this worker, used for current->name */
-    char *name;
-
-    int stopping;
 };
 
-void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
-int btrfs_start_workers(struct btrfs_workers *workers);
-void btrfs_stop_workers(struct btrfs_workers *workers);
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
-            struct btrfs_workers *async_starter);
-void btrfs_requeue_work(struct btrfs_work *work);
-void btrfs_set_work_high_prio(struct btrfs_work *work);
+struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
+                                              int flags,
+                                              int max_active,
+                                              int thresh);
+void btrfs_init_work(struct btrfs_work *work,
+                     btrfs_func_t func,
+                     btrfs_func_t ordered_func,
+                     btrfs_func_t ordered_free);
+void btrfs_queue_work(struct btrfs_workqueue *wq,
+                      struct btrfs_work *work);
+void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
+void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
+void btrfs_set_work_high_priority(struct btrfs_work *work);
 #endif
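
The whole old tuning surface collapses into the six declarations above. A short hedged sketch
of the two remaining knobs, reusing the hypothetical my_work from the earlier sketch and
assuming the queue was allocated with WQ_HIGHPRI so that the internal high-priority sub-queue
exists (wq and new_max are placeholders):

    /* Route one urgent item to the high-priority sub-queue. */
    btrfs_set_work_high_priority(&w->work);
    btrfs_queue_work(wq, &w->work);      /* lands on wq->high, not wq->normal */

    /* Re-tune concurrency, e.g. on remount with a new thread_pool= value. */
    btrfs_workqueue_set_max(wq, new_max);

Note that btrfs_workqueue_set_max() only updates max_active, the upper clamp used by
thresh_exec_hook(); the actual workqueue_set_max_active() call still happens lazily from that
threshold hook as work executes.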
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index aded3ef3d3d4..10db21fa0926 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -220,7 +220,8 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
 
 static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
                            struct ulist *parents, struct __prelim_ref *ref,
-                           int level, u64 time_seq, const u64 *extent_item_pos)
+                           int level, u64 time_seq, const u64 *extent_item_pos,
+                           u64 total_refs)
 {
     int ret = 0;
     int slot;
@@ -249,7 +250,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
     if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
         ret = btrfs_next_old_leaf(root, path, time_seq);
 
-    while (!ret && count < ref->count) {
+    while (!ret && count < total_refs) {
         eb = path->nodes[0];
         slot = path->slots[0];
 
@@ -306,7 +307,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
                                   struct btrfs_path *path, u64 time_seq,
                                   struct __prelim_ref *ref,
                                   struct ulist *parents,
-                                  const u64 *extent_item_pos)
+                                  const u64 *extent_item_pos, u64 total_refs)
 {
     struct btrfs_root *root;
     struct btrfs_key root_key;
@@ -329,7 +330,10 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
         goto out;
     }
 
-    root_level = btrfs_old_root_level(root, time_seq);
+    if (path->search_commit_root)
+        root_level = btrfs_header_level(root->commit_root);
+    else
+        root_level = btrfs_old_root_level(root, time_seq);
 
     if (root_level + 1 == level) {
         srcu_read_unlock(&fs_info->subvol_srcu, index);
@@ -361,7 +365,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
     }
 
     ret = add_all_parents(root, path, parents, ref, level, time_seq,
-                          extent_item_pos);
+                          extent_item_pos, total_refs);
 out:
     path->lowest_level = 0;
     btrfs_release_path(path);
@@ -374,7 +378,7 @@ out:
 static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
                                    struct btrfs_path *path, u64 time_seq,
                                    struct list_head *head,
-                                   const u64 *extent_item_pos)
+                                   const u64 *extent_item_pos, u64 total_refs)
 {
     int err;
     int ret = 0;
@@ -400,7 +404,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
         if (ref->count == 0)
             continue;
         err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
-                                     parents, extent_item_pos);
+                                     parents, extent_item_pos,
+                                     total_refs);
         /*
          * we can only tolerate ENOENT,otherwise,we should catch error
          * and return directly.
@@ -557,7 +562,7 @@ static void __merge_refs(struct list_head *head, int mode)
  * smaller or equal that seq to the list
  */
 static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
-                              struct list_head *prefs)
+                              struct list_head *prefs, u64 *total_refs)
 {
     struct btrfs_delayed_extent_op *extent_op = head->extent_op;
     struct rb_node *n = &head->node.rb_node;
@@ -593,6 +598,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
         default:
             BUG_ON(1);
         }
+        *total_refs += (node->ref_mod * sgn);
         switch (node->type) {
         case BTRFS_TREE_BLOCK_REF_KEY: {
             struct btrfs_delayed_tree_ref *ref;
@@ -653,7 +659,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
  */
 static int __add_inline_refs(struct btrfs_fs_info *fs_info,
                              struct btrfs_path *path, u64 bytenr,
-                             int *info_level, struct list_head *prefs)
+                             int *info_level, struct list_head *prefs,
+                             u64 *total_refs)
 {
     int ret = 0;
     int slot;
@@ -677,6 +684,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 
     ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
     flags = btrfs_extent_flags(leaf, ei);
+    *total_refs += btrfs_extent_refs(leaf, ei);
     btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
     ptr = (unsigned long)(ei + 1);
@@ -859,6 +867,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
     struct list_head prefs;
     struct __prelim_ref *ref;
     struct extent_inode_elem *eie = NULL;
+    u64 total_refs = 0;
 
     INIT_LIST_HEAD(&prefs);
     INIT_LIST_HEAD(&prefs_delayed);
@@ -873,8 +882,10 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
     path = btrfs_alloc_path();
     if (!path)
         return -ENOMEM;
-    if (!trans)
+    if (!trans) {
         path->search_commit_root = 1;
+        path->skip_locking = 1;
+    }
 
     /*
      * grab both a lock on the path and a lock on the delayed ref head.
@@ -915,7 +926,7 @@ again:
         }
         spin_unlock(&delayed_refs->lock);
         ret = __add_delayed_refs(head, time_seq,
-                                 &prefs_delayed);
+                                 &prefs_delayed, &total_refs);
         mutex_unlock(&head->mutex);
         if (ret)
             goto out;
@@ -936,7 +947,8 @@ again:
         (key.type == BTRFS_EXTENT_ITEM_KEY ||
          key.type == BTRFS_METADATA_ITEM_KEY)) {
         ret = __add_inline_refs(fs_info, path, bytenr,
-                                &info_level, &prefs);
+                                &info_level, &prefs,
+                                &total_refs);
         if (ret)
             goto out;
         ret = __add_keyed_refs(fs_info, path, bytenr,
@@ -956,7 +968,7 @@ again:
     __merge_refs(&prefs, 1);
 
     ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
-                                  extent_item_pos);
+                                  extent_item_pos, total_refs);
     if (ret)
         goto out;
 
@@ -965,7 +977,7 @@ again:
     while (!list_empty(&prefs)) {
         ref = list_first_entry(&prefs, struct __prelim_ref, list);
         WARN_ON(ref->count < 0);
-        if (ref->count && ref->root_id && ref->parent == 0) {
+        if (roots && ref->count && ref->root_id && ref->parent == 0) {
             /* no parent == root of tree */
             ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
             if (ret < 0)
@@ -1061,22 +1073,14 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
                                 u64 time_seq, struct ulist **leafs,
                                 const u64 *extent_item_pos)
 {
-    struct ulist *tmp;
     int ret;
 
-    tmp = ulist_alloc(GFP_NOFS);
-    if (!tmp)
-        return -ENOMEM;
     *leafs = ulist_alloc(GFP_NOFS);
-    if (!*leafs) {
-        ulist_free(tmp);
+    if (!*leafs)
         return -ENOMEM;
-    }
 
     ret = find_parent_nodes(trans, fs_info, bytenr,
-                            time_seq, *leafs, tmp, extent_item_pos);
-    ulist_free(tmp);
-
+                            time_seq, *leafs, NULL, extent_item_pos);
     if (ret < 0 && ret != -ENOENT) {
         free_leaf_list(*leafs);
         return ret;
@@ -1098,9 +1102,9 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
  *
  * returns 0 on success, < 0 on error.
  */
-int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
-                         struct btrfs_fs_info *fs_info, u64 bytenr,
-                         u64 time_seq, struct ulist **roots)
+static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+                                  struct btrfs_fs_info *fs_info, u64 bytenr,
+                                  u64 time_seq, struct ulist **roots)
 {
     struct ulist *tmp;
     struct ulist_node *node = NULL;
@@ -1136,6 +1140,20 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
     return 0;
 }
 
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+                         struct btrfs_fs_info *fs_info, u64 bytenr,
+                         u64 time_seq, struct ulist **roots)
+{
+    int ret;
+
+    if (!trans)
+        down_read(&fs_info->commit_root_sem);
+    ret = __btrfs_find_all_roots(trans, fs_info, bytenr, time_seq, roots);
+    if (!trans)
+        up_read(&fs_info->commit_root_sem);
+    return ret;
+}
+
 /*
  * this makes the path point to (inum INODE_ITEM ioff)
  */
@@ -1333,38 +1351,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
     if (ret < 0)
         return ret;
 
-    while (1) {
-        u32 nritems;
-        if (path->slots[0] == 0) {
-            btrfs_set_path_blocking(path);
-            ret = btrfs_prev_leaf(fs_info->extent_root, path);
-            if (ret != 0) {
-                if (ret > 0) {
-                    pr_debug("logical %llu is not within "
-                             "any extent\n", logical);
-                    ret = -ENOENT;
-                }
-                return ret;
-            }
-        } else {
-            path->slots[0]--;
-        }
-        nritems = btrfs_header_nritems(path->nodes[0]);
-        if (nritems == 0) {
-            pr_debug("logical %llu is not within any extent\n",
-                     logical);
-            return -ENOENT;
-        }
-        if (path->slots[0] == nritems)
-            path->slots[0]--;
-
-        btrfs_item_key_to_cpu(path->nodes[0], found_key,
-                              path->slots[0]);
-        if (found_key->type == BTRFS_EXTENT_ITEM_KEY ||
-            found_key->type == BTRFS_METADATA_ITEM_KEY)
-            break;
+    ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0);
+    if (ret) {
+        if (ret > 0)
+            ret = -ENOENT;
+        return ret;
     }
-
+    btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
     if (found_key->type == BTRFS_METADATA_ITEM_KEY)
         size = fs_info->extent_root->leafsize;
     else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
@@ -1540,6 +1533,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
         if (IS_ERR(trans))
             return PTR_ERR(trans);
         btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+    } else {
+        down_read(&fs_info->commit_root_sem);
     }
 
     ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
@@ -1550,8 +1545,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 
     ULIST_ITER_INIT(&ref_uiter);
     while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
-        ret = btrfs_find_all_roots(trans, fs_info, ref_node->val,
-                                   tree_mod_seq_elem.seq, &roots);
+        ret = __btrfs_find_all_roots(trans, fs_info, ref_node->val,
+                                     tree_mod_seq_elem.seq, &roots);
         if (ret)
             break;
         ULIST_ITER_INIT(&root_uiter);
@@ -1573,6 +1568,8 @@ out:
     if (!search_commit_root) {
         btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
         btrfs_end_transaction(trans, fs_info->extent_root);
+    } else {
+        up_read(&fs_info->commit_root_sem);
     }
 
     return ret;
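
Two themes run through the backref changes: total_refs is accumulated by __add_inline_refs()
and __add_delayed_refs() and threaded down to add_all_parents(), so the parent walk is bounded
by the extent's full reference count rather than by a single prelim ref's count; and lockless
commit-root searches are now serialized against transaction commit by commit_root_sem instead
of by joining a transaction. A hedged sketch of the locking rule a caller without a
transaction handle now relies on (the wrapper name is hypothetical; it mirrors the
btrfs_find_all_roots() wrapper added above):

    static int lookup_roots_locked(struct btrfs_fs_info *fs_info, u64 bytenr,
                                   u64 seq, struct ulist **roots)
    {
            int ret;

            /* no trans handle: the walk reads commit roots, so hold the
             * semaphore shared to keep a commit from swapping them out */
            down_read(&fs_info->commit_root_sem);
            ret = __btrfs_find_all_roots(NULL, fs_info, bytenr, seq, roots);
            up_read(&fs_info->commit_root_sem);
            return ret;
    }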
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 8fed2125689e..c9a24444ec9a 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -109,14 +109,17 @@ struct btrfs_inode {
     u64 last_trans;
 
     /*
-     * log transid when this inode was last modified
+     * transid that last logged this inode
      */
-    u64 last_sub_trans;
+    u64 logged_trans;
 
     /*
-     * transid that last logged this inode
+     * log transid when this inode was last modified
      */
-    u64 logged_trans;
+    int last_sub_trans;
+
+    /* a local copy of root's last_log_commit */
+    int last_log_commit;
 
     /* total number of bytes pending delalloc, used by stat to calc the
      * real block usage of the file
@@ -155,9 +158,6 @@ struct btrfs_inode {
     /* flags field from the on disk inode */
     u32 flags;
 
-    /* a local copy of root's last_log_commit */
-    unsigned long last_log_commit;
-
     /*
      * Counters to keep track of the number of extent item's we may use due
      * to delalloc and such. outstanding_extents is the number of extent
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b01fb6c527e3..d43c544d3b68 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -472,7 +472,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
         rcu_read_lock();
         page = radix_tree_lookup(&mapping->page_tree, pg_index);
         rcu_read_unlock();
-        if (page) {
+        if (page && !radix_tree_exceptional_entry(page)) {
             misses++;
             if (misses > 4)
                 break;
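
This one-line fix matters because a page-cache radix tree slot can now hold a shadow
(exceptional) entry left behind by a reclaimed page rather than a real page pointer, so a
bare non-NULL result no longer means "page resident"; treating a shadow entry as a cache hit
would make add_ra_bio_pages() back off compressed readahead for pages that are not actually
cached. A hedged sketch of the lookup idiom the fix establishes (locals abbreviated from the
function above):

    rcu_read_lock();
    page = radix_tree_lookup(&mapping->page_tree, pg_index);
    rcu_read_unlock();
    /* non-NULL alone is not enough: filter out shadow entries */
    if (page && !radix_tree_exceptional_entry(page)) {
            /* a genuine cached page: count it as a readahead miss */
    }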
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cbd3a7d6fa68..1bcfcdb23cf4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2769,9 +2769,13 @@ again:
2769 * the commit roots are read only 2769 * the commit roots are read only
2770 * so we always do read locks 2770 * so we always do read locks
2771 */ 2771 */
2772 if (p->need_commit_sem)
2773 down_read(&root->fs_info->commit_root_sem);
2772 b = root->commit_root; 2774 b = root->commit_root;
2773 extent_buffer_get(b); 2775 extent_buffer_get(b);
2774 level = btrfs_header_level(b); 2776 level = btrfs_header_level(b);
2777 if (p->need_commit_sem)
2778 up_read(&root->fs_info->commit_root_sem);
2775 if (!p->skip_locking) 2779 if (!p->skip_locking)
2776 btrfs_tree_read_lock(b); 2780 btrfs_tree_read_lock(b);
2777 } else { 2781 } else {
@@ -5360,7 +5364,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5360{ 5364{
5361 int ret; 5365 int ret;
5362 int cmp; 5366 int cmp;
5363 struct btrfs_trans_handle *trans = NULL;
5364 struct btrfs_path *left_path = NULL; 5367 struct btrfs_path *left_path = NULL;
5365 struct btrfs_path *right_path = NULL; 5368 struct btrfs_path *right_path = NULL;
5366 struct btrfs_key left_key; 5369 struct btrfs_key left_key;
@@ -5376,9 +5379,8 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5376 int advance_right; 5379 int advance_right;
5377 u64 left_blockptr; 5380 u64 left_blockptr;
5378 u64 right_blockptr; 5381 u64 right_blockptr;
5379 u64 left_start_ctransid; 5382 u64 left_gen;
5380 u64 right_start_ctransid; 5383 u64 right_gen;
5381 u64 ctransid;
5382 5384
5383 left_path = btrfs_alloc_path(); 5385 left_path = btrfs_alloc_path();
5384 if (!left_path) { 5386 if (!left_path) {
@@ -5402,21 +5404,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5402 right_path->search_commit_root = 1; 5404 right_path->search_commit_root = 1;
5403 right_path->skip_locking = 1; 5405 right_path->skip_locking = 1;
5404 5406
5405 spin_lock(&left_root->root_item_lock);
5406 left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
5407 spin_unlock(&left_root->root_item_lock);
5408
5409 spin_lock(&right_root->root_item_lock);
5410 right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
5411 spin_unlock(&right_root->root_item_lock);
5412
5413 trans = btrfs_join_transaction(left_root);
5414 if (IS_ERR(trans)) {
5415 ret = PTR_ERR(trans);
5416 trans = NULL;
5417 goto out;
5418 }
5419
5420 /* 5407 /*
5421 * Strategy: Go to the first items of both trees. Then do 5408 * Strategy: Go to the first items of both trees. Then do
5422 * 5409 *
@@ -5453,6 +5440,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5453 * the right if possible or go up and right. 5440 * the right if possible or go up and right.
5454 */ 5441 */
5455 5442
5443 down_read(&left_root->fs_info->commit_root_sem);
5456 left_level = btrfs_header_level(left_root->commit_root); 5444 left_level = btrfs_header_level(left_root->commit_root);
5457 left_root_level = left_level; 5445 left_root_level = left_level;
5458 left_path->nodes[left_level] = left_root->commit_root; 5446 left_path->nodes[left_level] = left_root->commit_root;
@@ -5462,6 +5450,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5462 right_root_level = right_level; 5450 right_root_level = right_level;
5463 right_path->nodes[right_level] = right_root->commit_root; 5451 right_path->nodes[right_level] = right_root->commit_root;
5464 extent_buffer_get(right_path->nodes[right_level]); 5452 extent_buffer_get(right_path->nodes[right_level]);
5453 up_read(&left_root->fs_info->commit_root_sem);
5465 5454
5466 if (left_level == 0) 5455 if (left_level == 0)
5467 btrfs_item_key_to_cpu(left_path->nodes[left_level], 5456 btrfs_item_key_to_cpu(left_path->nodes[left_level],
@@ -5480,67 +5469,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5480 advance_left = advance_right = 0; 5469 advance_left = advance_right = 0;
5481 5470
5482 while (1) { 5471 while (1) {
5483 /*
5484 * We need to make sure the transaction does not get committed
5485 * while we do anything on commit roots. This means, we need to
5486 * join and leave transactions for every item that we process.
5487 */
5488 if (trans && btrfs_should_end_transaction(trans, left_root)) {
5489 btrfs_release_path(left_path);
5490 btrfs_release_path(right_path);
5491
5492 ret = btrfs_end_transaction(trans, left_root);
5493 trans = NULL;
5494 if (ret < 0)
5495 goto out;
5496 }
5497 /* now rejoin the transaction */
5498 if (!trans) {
5499 trans = btrfs_join_transaction(left_root);
5500 if (IS_ERR(trans)) {
5501 ret = PTR_ERR(trans);
5502 trans = NULL;
5503 goto out;
5504 }
5505
5506 spin_lock(&left_root->root_item_lock);
5507 ctransid = btrfs_root_ctransid(&left_root->root_item);
5508 spin_unlock(&left_root->root_item_lock);
5509 if (ctransid != left_start_ctransid)
5510 left_start_ctransid = 0;
5511
5512 spin_lock(&right_root->root_item_lock);
5513 ctransid = btrfs_root_ctransid(&right_root->root_item);
5514 spin_unlock(&right_root->root_item_lock);
5515 if (ctransid != right_start_ctransid)
5516 right_start_ctransid = 0;
5517
5518 if (!left_start_ctransid || !right_start_ctransid) {
5519 WARN(1, KERN_WARNING
5520 "BTRFS: btrfs_compare_tree detected "
5521 "a change in one of the trees while "
5522 "iterating. This is probably a "
5523 "bug.\n");
5524 ret = -EIO;
5525 goto out;
5526 }
5527
5528 /*
5529 * the commit root may have changed, so start again
5530 * where we stopped
5531 */
5532 left_path->lowest_level = left_level;
5533 right_path->lowest_level = right_level;
5534 ret = btrfs_search_slot(NULL, left_root,
5535 &left_key, left_path, 0, 0);
5536 if (ret < 0)
5537 goto out;
5538 ret = btrfs_search_slot(NULL, right_root,
5539 &right_key, right_path, 0, 0);
5540 if (ret < 0)
5541 goto out;
5542 }
5543
5544 if (advance_left && !left_end_reached) { 5472 if (advance_left && !left_end_reached) {
5545 ret = tree_advance(left_root, left_path, &left_level, 5473 ret = tree_advance(left_root, left_path, &left_level,
5546 left_root_level, 5474 left_root_level,
@@ -5640,7 +5568,14 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5640 right_blockptr = btrfs_node_blockptr( 5568 right_blockptr = btrfs_node_blockptr(
5641 right_path->nodes[right_level], 5569 right_path->nodes[right_level],
5642 right_path->slots[right_level]); 5570 right_path->slots[right_level]);
5643 if (left_blockptr == right_blockptr) { 5571 left_gen = btrfs_node_ptr_generation(
5572 left_path->nodes[left_level],
5573 left_path->slots[left_level]);
5574 right_gen = btrfs_node_ptr_generation(
5575 right_path->nodes[right_level],
5576 right_path->slots[right_level]);
5577 if (left_blockptr == right_blockptr &&
5578 left_gen == right_gen) {
5644 /* 5579 /*
5645 * As we're on a shared block, don't 5580 * As we're on a shared block, don't
5646 * allow to go deeper. 5581 * allow to go deeper.
@@ -5663,14 +5598,6 @@ out:
5663 btrfs_free_path(left_path); 5598 btrfs_free_path(left_path);
5664 btrfs_free_path(right_path); 5599 btrfs_free_path(right_path);
5665 kfree(tmp_buf); 5600 kfree(tmp_buf);
5666
5667 if (trans) {
5668 if (!ret)
5669 ret = btrfs_end_transaction(trans, left_root);
5670 else
5671 btrfs_end_transaction(trans, left_root);
5672 }
5673
5674 return ret; 5601 return ret;
5675} 5602}
5676 5603
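Why the hunk above also compares generations: with the per-item transaction join/leave dance removed, the two trees are only pinned under commit_root_sem while the root nodes are grabbed, so a node can be freed and reallocated at the same bytenr while the compare runs. Equal bytenrs alone no longer prove a shared subtree; an equal (bytenr, generation) pair does. A condensed restatement of the test, not the verbatim hunk (ADVANCE_ONLY_NEXT is the existing subtree-skip marker in ctree.c):

	left_blockptr = btrfs_node_blockptr(left_path->nodes[left_level],
					    left_path->slots[left_level]);
	left_gen = btrfs_node_ptr_generation(left_path->nodes[left_level],
					     left_path->slots[left_level]);
	right_blockptr = btrfs_node_blockptr(right_path->nodes[right_level],
					     right_path->slots[right_level]);
	right_gen = btrfs_node_ptr_generation(right_path->nodes[right_level],
					      right_path->slots[right_level]);
	/* Same block on disk, written in the same transaction: shared. */
	if (left_blockptr == right_blockptr && left_gen == right_gen) {
		advance_left = ADVANCE_ONLY_NEXT;
		advance_right = ADVANCE_ONLY_NEXT;
	}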
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2c1a42ca519f..4c48df572bd6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -351,6 +351,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
351#define BTRFS_FS_STATE_ERROR 0 351#define BTRFS_FS_STATE_ERROR 0
352#define BTRFS_FS_STATE_REMOUNTING 1 352#define BTRFS_FS_STATE_REMOUNTING 1
353#define BTRFS_FS_STATE_TRANS_ABORTED 2 353#define BTRFS_FS_STATE_TRANS_ABORTED 2
354#define BTRFS_FS_STATE_DEV_REPLACING 3
354 355
355/* Super block flags */ 356/* Super block flags */
356/* Errors detected */ 357/* Errors detected */
@@ -608,6 +609,7 @@ struct btrfs_path {
608 unsigned int skip_locking:1; 609 unsigned int skip_locking:1;
609 unsigned int leave_spinning:1; 610 unsigned int leave_spinning:1;
610 unsigned int search_commit_root:1; 611 unsigned int search_commit_root:1;
612 unsigned int need_commit_sem:1;
611}; 613};
612 614
613/* 615/*
@@ -985,7 +987,8 @@ struct btrfs_dev_replace_item {
985#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) 987#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
986#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7) 988#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7)
987#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) 989#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8)
988#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE 990#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \
991 BTRFS_SPACE_INFO_GLOBAL_RSV)
989 992
990enum btrfs_raid_types { 993enum btrfs_raid_types {
991 BTRFS_RAID_RAID10, 994 BTRFS_RAID_RAID10,
@@ -1017,6 +1020,12 @@ enum btrfs_raid_types {
1017 */ 1020 */
1018#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) 1021#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48)
1019 1022
1023/*
1024 * A fake block group type that is used to communicate global block reserve
1025 * size to userspace via the SPACE_INFO ioctl.
1026 */
1027#define BTRFS_SPACE_INFO_GLOBAL_RSV (1ULL << 49)
1028
1020#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \ 1029#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \
1021 BTRFS_AVAIL_ALLOC_BIT_SINGLE) 1030 BTRFS_AVAIL_ALLOC_BIT_SINGLE)
1022 1031
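BTRFS_SPACE_INFO_GLOBAL_RSV is purely a reporting flag, so the natural consumer is the SPACE_INFO ioctl. A hedged userspace sketch: the two-call slot-probing pattern and struct layout come from linux/btrfs.h; which of total_bytes/used_bytes carries the reserve size follows the ioctl.c side of this series, and error handling is trimmed:

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

#define BTRFS_SPACE_INFO_GLOBAL_RSV (1ULL << 49)

int main(int argc, char **argv)
{
	struct btrfs_ioctl_space_args probe = { .space_slots = 0 };
	struct btrfs_ioctl_space_args *args;
	int fd = open(argv[1], O_RDONLY);
	__u64 i;

	/* First call with zero slots just reports how many entries exist. */
	ioctl(fd, BTRFS_IOC_SPACE_INFO, &probe);
	args = calloc(1, sizeof(*args) + probe.total_spaces *
		      sizeof(struct btrfs_ioctl_space_info));
	args->space_slots = probe.total_spaces;
	ioctl(fd, BTRFS_IOC_SPACE_INFO, args);

	for (i = 0; i < args->total_spaces; i++)
		if (args->spaces[i].flags & BTRFS_SPACE_INFO_GLOBAL_RSV)
			printf("global rsv: total %llu, used %llu\n",
			       (unsigned long long)args->spaces[i].total_bytes,
			       (unsigned long long)args->spaces[i].used_bytes);
	free(args);
	return 0;
}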
@@ -1439,7 +1448,7 @@ struct btrfs_fs_info {
1439 */ 1448 */
1440 struct mutex ordered_extent_flush_mutex; 1449 struct mutex ordered_extent_flush_mutex;
1441 1450
1442 struct rw_semaphore extent_commit_sem; 1451 struct rw_semaphore commit_root_sem;
1443 1452
1444 struct rw_semaphore cleanup_work_sem; 1453 struct rw_semaphore cleanup_work_sem;
1445 1454
@@ -1489,6 +1498,7 @@ struct btrfs_fs_info {
1489 */ 1498 */
1490 struct list_head ordered_roots; 1499 struct list_head ordered_roots;
1491 1500
1501 struct mutex delalloc_root_mutex;
1492 spinlock_t delalloc_root_lock; 1502 spinlock_t delalloc_root_lock;
1493 /* all fs/file tree roots that have delalloc inodes. */ 1503 /* all fs/file tree roots that have delalloc inodes. */
1494 struct list_head delalloc_roots; 1504 struct list_head delalloc_roots;
@@ -1503,28 +1513,27 @@ struct btrfs_fs_info {
1503 * A third pool does submit_bio to avoid deadlocking with the other 1513 * A third pool does submit_bio to avoid deadlocking with the other
1504 * two 1514 * two
1505 */ 1515 */
1506 struct btrfs_workers generic_worker; 1516 struct btrfs_workqueue *workers;
1507 struct btrfs_workers workers; 1517 struct btrfs_workqueue *delalloc_workers;
1508 struct btrfs_workers delalloc_workers; 1518 struct btrfs_workqueue *flush_workers;
1509 struct btrfs_workers flush_workers; 1519 struct btrfs_workqueue *endio_workers;
1510 struct btrfs_workers endio_workers; 1520 struct btrfs_workqueue *endio_meta_workers;
1511 struct btrfs_workers endio_meta_workers; 1521 struct btrfs_workqueue *endio_raid56_workers;
1512 struct btrfs_workers endio_raid56_workers; 1522 struct btrfs_workqueue *rmw_workers;
1513 struct btrfs_workers rmw_workers; 1523 struct btrfs_workqueue *endio_meta_write_workers;
1514 struct btrfs_workers endio_meta_write_workers; 1524 struct btrfs_workqueue *endio_write_workers;
1515 struct btrfs_workers endio_write_workers; 1525 struct btrfs_workqueue *endio_freespace_worker;
1516 struct btrfs_workers endio_freespace_worker; 1526 struct btrfs_workqueue *submit_workers;
1517 struct btrfs_workers submit_workers; 1527 struct btrfs_workqueue *caching_workers;
1518 struct btrfs_workers caching_workers; 1528 struct btrfs_workqueue *readahead_workers;
1519 struct btrfs_workers readahead_workers;
1520 1529
1521 /* 1530 /*
1522 * fixup workers take dirty pages that didn't properly go through 1531 * fixup workers take dirty pages that didn't properly go through
1523 * the cow mechanism and make them safe to write. It happens 1532 * the cow mechanism and make them safe to write. It happens
1524 * for the sys_munmap function call path 1533 * for the sys_munmap function call path
1525 */ 1534 */
1526 struct btrfs_workers fixup_workers; 1535 struct btrfs_workqueue *fixup_workers;
1527 struct btrfs_workers delayed_workers; 1536 struct btrfs_workqueue *delayed_workers;
1528 struct task_struct *transaction_kthread; 1537 struct task_struct *transaction_kthread;
1529 struct task_struct *cleaner_kthread; 1538 struct task_struct *cleaner_kthread;
1530 int thread_pool_size; 1539 int thread_pool_size;
@@ -1604,9 +1613,9 @@ struct btrfs_fs_info {
1604 atomic_t scrub_cancel_req; 1613 atomic_t scrub_cancel_req;
1605 wait_queue_head_t scrub_pause_wait; 1614 wait_queue_head_t scrub_pause_wait;
1606 int scrub_workers_refcnt; 1615 int scrub_workers_refcnt;
1607 struct btrfs_workers scrub_workers; 1616 struct btrfs_workqueue *scrub_workers;
1608 struct btrfs_workers scrub_wr_completion_workers; 1617 struct btrfs_workqueue *scrub_wr_completion_workers;
1609 struct btrfs_workers scrub_nocow_workers; 1618 struct btrfs_workqueue *scrub_nocow_workers;
1610 1619
1611#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 1620#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1612 u32 check_integrity_print_mask; 1621 u32 check_integrity_print_mask;
@@ -1647,7 +1656,7 @@ struct btrfs_fs_info {
1647 /* qgroup rescan items */ 1656 /* qgroup rescan items */
1648 struct mutex qgroup_rescan_lock; /* protects the progress item */ 1657 struct mutex qgroup_rescan_lock; /* protects the progress item */
1649 struct btrfs_key qgroup_rescan_progress; 1658 struct btrfs_key qgroup_rescan_progress;
1650 struct btrfs_workers qgroup_rescan_workers; 1659 struct btrfs_workqueue *qgroup_rescan_workers;
1651 struct completion qgroup_rescan_completion; 1660 struct completion qgroup_rescan_completion;
1652 struct btrfs_work qgroup_rescan_work; 1661 struct btrfs_work qgroup_rescan_work;
1653 1662
@@ -1674,10 +1683,18 @@ struct btrfs_fs_info {
1674 1683
1675 atomic_t mutually_exclusive_operation_running; 1684 atomic_t mutually_exclusive_operation_running;
1676 1685
1686 struct percpu_counter bio_counter;
1687 wait_queue_head_t replace_wait;
1688
1677 struct semaphore uuid_tree_rescan_sem; 1689 struct semaphore uuid_tree_rescan_sem;
1678 unsigned int update_uuid_tree_gen:1; 1690 unsigned int update_uuid_tree_gen:1;
1679}; 1691};
1680 1692
1693struct btrfs_subvolume_writers {
1694 struct percpu_counter counter;
1695 wait_queue_head_t wait;
1696};
1697
1681/* 1698/*
1682 * in ram representation of the tree. extent_root is used for all allocations 1699 * in ram representation of the tree. extent_root is used for all allocations
1683 * and for the extent tree extent_root root. 1700 * and for the extent tree extent_root root.
@@ -1702,7 +1719,6 @@ struct btrfs_root {
1702 struct btrfs_block_rsv *block_rsv; 1719 struct btrfs_block_rsv *block_rsv;
1703 1720
1704 /* free ino cache stuff */ 1721 /* free ino cache stuff */
1705 struct mutex fs_commit_mutex;
1706 struct btrfs_free_space_ctl *free_ino_ctl; 1722 struct btrfs_free_space_ctl *free_ino_ctl;
1707 enum btrfs_caching_type cached; 1723 enum btrfs_caching_type cached;
1708 spinlock_t cache_lock; 1724 spinlock_t cache_lock;
@@ -1714,11 +1730,15 @@ struct btrfs_root {
1714 struct mutex log_mutex; 1730 struct mutex log_mutex;
1715 wait_queue_head_t log_writer_wait; 1731 wait_queue_head_t log_writer_wait;
1716 wait_queue_head_t log_commit_wait[2]; 1732 wait_queue_head_t log_commit_wait[2];
1733 struct list_head log_ctxs[2];
1717 atomic_t log_writers; 1734 atomic_t log_writers;
1718 atomic_t log_commit[2]; 1735 atomic_t log_commit[2];
1719 atomic_t log_batch; 1736 atomic_t log_batch;
1720 unsigned long log_transid; 1737 int log_transid;
1721 unsigned long last_log_commit; 1738 /* No matter the commit succeeds or not*/
1739 int log_transid_committed;
1740 /* Just be updated when the commit succeeds. */
1741 int last_log_commit;
1722 pid_t log_start_pid; 1742 pid_t log_start_pid;
1723 bool log_multiple_pids; 1743 bool log_multiple_pids;
1724 1744
@@ -1793,6 +1813,7 @@ struct btrfs_root {
1793 spinlock_t root_item_lock; 1813 spinlock_t root_item_lock;
1794 atomic_t refs; 1814 atomic_t refs;
1795 1815
1816 struct mutex delalloc_mutex;
1796 spinlock_t delalloc_lock; 1817 spinlock_t delalloc_lock;
1797 /* 1818 /*
1798 * all of the inodes that have delalloc bytes. It is possible for 1819 * all of the inodes that have delalloc bytes. It is possible for
@@ -1802,6 +1823,8 @@ struct btrfs_root {
1802 struct list_head delalloc_inodes; 1823 struct list_head delalloc_inodes;
1803 struct list_head delalloc_root; 1824 struct list_head delalloc_root;
1804 u64 nr_delalloc_inodes; 1825 u64 nr_delalloc_inodes;
1826
1827 struct mutex ordered_extent_mutex;
1805 /* 1828 /*
1806 * this is used by the balancing code to wait for all the pending 1829 * this is used by the balancing code to wait for all the pending
1807 * ordered extents 1830 * ordered extents
@@ -1822,6 +1845,8 @@ struct btrfs_root {
1822 * manipulation with the read-only status via SUBVOL_SETFLAGS 1845 * manipulation with the read-only status via SUBVOL_SETFLAGS
1823 */ 1846 */
1824 int send_in_progress; 1847 int send_in_progress;
1848 struct btrfs_subvolume_writers *subv_writers;
1849 atomic_t will_be_snapshoted;
1825}; 1850};
1826 1851
1827struct btrfs_ioctl_defrag_range_args { 1852struct btrfs_ioctl_defrag_range_args {
@@ -3346,6 +3371,9 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
3346int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, 3371int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
3347 struct btrfs_fs_info *fs_info); 3372 struct btrfs_fs_info *fs_info);
3348int __get_raid_index(u64 flags); 3373int __get_raid_index(u64 flags);
3374
3375int btrfs_start_nocow_write(struct btrfs_root *root);
3376void btrfs_end_nocow_write(struct btrfs_root *root);
3349/* ctree.c */ 3377/* ctree.c */
3350int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 3378int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
3351 int level, int *slot); 3379 int level, int *slot);
@@ -3723,7 +3751,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3723 u32 min_type); 3751 u32 min_type);
3724 3752
3725int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); 3753int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
3726int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput); 3754int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
3755 int nr);
3727int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 3756int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
3728 struct extent_state **cached_state); 3757 struct extent_state **cached_state);
3729int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 3758int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -4005,6 +4034,11 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
4005int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, 4034int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
4006 struct btrfs_scrub_progress *progress); 4035 struct btrfs_scrub_progress *progress);
4007 4036
4037/* dev-replace.c */
4038void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
4039void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
4040void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info);
4041
4008/* reada.c */ 4042/* reada.c */
4009struct reada_control { 4043struct reada_control {
4010 struct btrfs_root *root; /* tree to prefetch */ 4044 struct btrfs_root *root; /* tree to prefetch */
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 451b00c86f6c..33e561a84013 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1392,11 +1392,11 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
1392 return -ENOMEM; 1392 return -ENOMEM;
1393 1393
1394 async_work->delayed_root = delayed_root; 1394 async_work->delayed_root = delayed_root;
1395 async_work->work.func = btrfs_async_run_delayed_root; 1395 btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root,
1396 async_work->work.flags = 0; 1396 NULL, NULL);
1397 async_work->nr = nr; 1397 async_work->nr = nr;
1398 1398
1399 btrfs_queue_worker(&root->fs_info->delayed_workers, &async_work->work); 1399 btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work);
1400 return 0; 1400 return 0;
1401} 1401}
1402 1402
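This is the conversion pattern repeated across the series: instead of poking func/flags into struct btrfs_work by hand, callers fill the item with btrfs_init_work() and hand it to a btrfs_workqueue. A sketch of the shape, with my_async_work and my_work_fn as hypothetical stand-ins:

	struct my_async_work {
		struct btrfs_work work;
		/* caller-private state */
	};

	static void my_work_fn(struct btrfs_work *work)
	{
		struct my_async_work *w =
			container_of(work, struct my_async_work, work);
		/* heavy lifting; may run concurrently with other items */
	}

	/* at the submission site */
	btrfs_init_work(&w->work, my_work_fn, NULL, NULL);
	btrfs_queue_work(root->fs_info->delayed_workers, &w->work);

The two NULLs are the ordered_func/ordered_free hooks for work that must complete in queueing order (see the run_one_async_* conversion in disk-io.c below), and btrfs_set_work_high_priority() before queueing routes an item to the high-priority queue.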
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index f3bff89eecf0..31299646024d 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -199,44 +199,31 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
199 */ 199 */
200static struct btrfs_delayed_ref_head * 200static struct btrfs_delayed_ref_head *
201find_ref_head(struct rb_root *root, u64 bytenr, 201find_ref_head(struct rb_root *root, u64 bytenr,
202 struct btrfs_delayed_ref_head **last, int return_bigger) 202 int return_bigger)
203{ 203{
204 struct rb_node *n; 204 struct rb_node *n;
205 struct btrfs_delayed_ref_head *entry; 205 struct btrfs_delayed_ref_head *entry;
206 int cmp = 0;
207 206
208again:
209 n = root->rb_node; 207 n = root->rb_node;
210 entry = NULL; 208 entry = NULL;
211 while (n) { 209 while (n) {
212 entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); 210 entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
213 if (last)
214 *last = entry;
215 211
216 if (bytenr < entry->node.bytenr) 212 if (bytenr < entry->node.bytenr)
217 cmp = -1;
218 else if (bytenr > entry->node.bytenr)
219 cmp = 1;
220 else
221 cmp = 0;
222
223 if (cmp < 0)
224 n = n->rb_left; 213 n = n->rb_left;
225 else if (cmp > 0) 214 else if (bytenr > entry->node.bytenr)
226 n = n->rb_right; 215 n = n->rb_right;
227 else 216 else
228 return entry; 217 return entry;
229 } 218 }
230 if (entry && return_bigger) { 219 if (entry && return_bigger) {
231 if (cmp > 0) { 220 if (bytenr > entry->node.bytenr) {
232 n = rb_next(&entry->href_node); 221 n = rb_next(&entry->href_node);
233 if (!n) 222 if (!n)
234 n = rb_first(root); 223 n = rb_first(root);
235 entry = rb_entry(n, struct btrfs_delayed_ref_head, 224 entry = rb_entry(n, struct btrfs_delayed_ref_head,
236 href_node); 225 href_node);
237 bytenr = entry->node.bytenr; 226 return entry;
238 return_bigger = 0;
239 goto again;
240 } 227 }
241 return entry; 228 return entry;
242 } 229 }
@@ -415,12 +402,12 @@ btrfs_select_ref_head(struct btrfs_trans_handle *trans)
415 402
416again: 403again:
417 start = delayed_refs->run_delayed_start; 404 start = delayed_refs->run_delayed_start;
418 head = find_ref_head(&delayed_refs->href_root, start, NULL, 1); 405 head = find_ref_head(&delayed_refs->href_root, start, 1);
419 if (!head && !loop) { 406 if (!head && !loop) {
420 delayed_refs->run_delayed_start = 0; 407 delayed_refs->run_delayed_start = 0;
421 start = 0; 408 start = 0;
422 loop = true; 409 loop = true;
423 head = find_ref_head(&delayed_refs->href_root, start, NULL, 1); 410 head = find_ref_head(&delayed_refs->href_root, start, 1);
424 if (!head) 411 if (!head)
425 return NULL; 412 return NULL;
426 } else if (!head && loop) { 413 } else if (!head && loop) {
@@ -508,6 +495,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
508 ref = btrfs_delayed_node_to_head(update); 495 ref = btrfs_delayed_node_to_head(update);
509 BUG_ON(existing_ref->is_data != ref->is_data); 496 BUG_ON(existing_ref->is_data != ref->is_data);
510 497
498 spin_lock(&existing_ref->lock);
511 if (ref->must_insert_reserved) { 499 if (ref->must_insert_reserved) {
512 /* if the extent was freed and then 500 /* if the extent was freed and then
513 * reallocated before the delayed ref 501 * reallocated before the delayed ref
@@ -549,7 +537,6 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
549 * only need the lock for this case cause we could be processing it 537 * only need the lock for this case cause we could be processing it
550 * currently, for refs we just added we know we're a-ok. 538 * currently, for refs we just added we know we're a-ok.
551 */ 539 */
552 spin_lock(&existing_ref->lock);
553 existing->ref_mod += update->ref_mod; 540 existing->ref_mod += update->ref_mod;
554 spin_unlock(&existing_ref->lock); 541 spin_unlock(&existing_ref->lock);
555} 542}
@@ -898,7 +885,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
898 struct btrfs_delayed_ref_root *delayed_refs; 885 struct btrfs_delayed_ref_root *delayed_refs;
899 886
900 delayed_refs = &trans->transaction->delayed_refs; 887 delayed_refs = &trans->transaction->delayed_refs;
901 return find_ref_head(&delayed_refs->href_root, bytenr, NULL, 0); 888 return find_ref_head(&delayed_refs->href_root, bytenr, 0);
902} 889}
903 890
904void btrfs_delayed_ref_exit(void) 891void btrfs_delayed_ref_exit(void)
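With return_bigger set, the rewritten find_ref_head() has plain "first entry at or after bytenr, wrapping to the tree minimum" semantics, with no re-search loop; that is exactly what the btrfs_select_ref_head() retry above relies on. A toy restatement over a sorted array instead of an rbtree:

	/* toy model: bytenrs[] sorted ascending, n > 0 */
	static u64 pick_next(const u64 *bytenrs, int n, u64 start)
	{
		int i;

		for (i = 0; i < n; i++)
			if (bytenrs[i] >= start)
				return bytenrs[i];
		return bytenrs[0];	/* past the end: wrap, like rb_first() */
	}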
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 564c92638b20..9f2290509aca 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -431,6 +431,35 @@ leave_no_lock:
431 return ret; 431 return ret;
432} 432}
433 433
434/*
435 * Block until all in-flight bios are finished.
436 */
437static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
438{
439 s64 writers;
440 DEFINE_WAIT(wait);
441
442 set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
443 do {
444 prepare_to_wait(&fs_info->replace_wait, &wait,
445 TASK_UNINTERRUPTIBLE);
446 writers = percpu_counter_sum(&fs_info->bio_counter);
447 if (writers)
448 schedule();
449 finish_wait(&fs_info->replace_wait, &wait);
450 } while (writers);
451}
452
453/*
454 * We have removed the target device, so it is safe to allow new bio requests.
455 */
456static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
457{
458 clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
459 if (waitqueue_active(&fs_info->replace_wait))
460 wake_up(&fs_info->replace_wait);
461}
462
434static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, 463static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
435 int scrub_ret) 464 int scrub_ret)
436{ 465{
@@ -458,17 +487,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
458 src_device = dev_replace->srcdev; 487 src_device = dev_replace->srcdev;
459 btrfs_dev_replace_unlock(dev_replace); 488 btrfs_dev_replace_unlock(dev_replace);
460 489
461 /* replace old device with new one in mapping tree */
462 if (!scrub_ret)
463 btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
464 src_device,
465 tgt_device);
466
467 /* 490 /*
468 * flush all outstanding I/O and inode extent mappings before the 491 * flush all outstanding I/O and inode extent mappings before the
469 * copy operation is declared as being finished 492 * copy operation is declared as being finished
470 */ 493 */
471 ret = btrfs_start_delalloc_roots(root->fs_info, 0); 494 ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
472 if (ret) { 495 if (ret) {
473 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 496 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
474 return ret; 497 return ret;
@@ -484,6 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
484 WARN_ON(ret); 507 WARN_ON(ret);
485 508
486 /* keep away write_all_supers() during the finishing procedure */ 509 /* keep away write_all_supers() during the finishing procedure */
510 mutex_lock(&root->fs_info->chunk_mutex);
487 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 511 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
488 btrfs_dev_replace_lock(dev_replace); 512 btrfs_dev_replace_lock(dev_replace);
489 dev_replace->replace_state = 513 dev_replace->replace_state =
@@ -494,7 +518,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
494 dev_replace->time_stopped = get_seconds(); 518 dev_replace->time_stopped = get_seconds();
495 dev_replace->item_needs_writeback = 1; 519 dev_replace->item_needs_writeback = 1;
496 520
497 if (scrub_ret) { 521 /* replace old device with new one in mapping tree */
522 if (!scrub_ret) {
523 btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
524 src_device,
525 tgt_device);
526 } else {
498 printk_in_rcu(KERN_ERR 527 printk_in_rcu(KERN_ERR
499 "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", 528 "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
500 src_device->missing ? "<missing disk>" : 529 src_device->missing ? "<missing disk>" :
@@ -503,6 +532,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
503 rcu_str_deref(tgt_device->name), scrub_ret); 532 rcu_str_deref(tgt_device->name), scrub_ret);
504 btrfs_dev_replace_unlock(dev_replace); 533 btrfs_dev_replace_unlock(dev_replace);
505 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 534 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
535 mutex_unlock(&root->fs_info->chunk_mutex);
506 if (tgt_device) 536 if (tgt_device)
507 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); 537 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
508 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 538 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
@@ -532,8 +562,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
532 fs_info->fs_devices->latest_bdev = tgt_device->bdev; 562 fs_info->fs_devices->latest_bdev = tgt_device->bdev;
533 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); 563 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
534 564
565 btrfs_rm_dev_replace_blocked(fs_info);
566
535 btrfs_rm_dev_replace_srcdev(fs_info, src_device); 567 btrfs_rm_dev_replace_srcdev(fs_info, src_device);
536 568
569 btrfs_rm_dev_replace_unblocked(fs_info);
570
537 /* 571 /*
538 * this is again a consistent state where no dev_replace procedure 572 * this is again a consistent state where no dev_replace procedure
539 * is running, the target device is part of the filesystem, the 573 * is running, the target device is part of the filesystem, the
@@ -543,6 +577,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
543 */ 577 */
544 btrfs_dev_replace_unlock(dev_replace); 578 btrfs_dev_replace_unlock(dev_replace);
545 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 579 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
580 mutex_unlock(&root->fs_info->chunk_mutex);
546 581
547 /* write back the superblocks */ 582 /* write back the superblocks */
548 trans = btrfs_start_transaction(root, 0); 583 trans = btrfs_start_transaction(root, 0);
@@ -862,3 +897,31 @@ void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
862 mutex_unlock(&dev_replace->lock_management_lock); 897 mutex_unlock(&dev_replace->lock_management_lock);
863 } 898 }
864} 899}
900
901void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
902{
903 percpu_counter_inc(&fs_info->bio_counter);
904}
905
906void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
907{
908 percpu_counter_dec(&fs_info->bio_counter);
909
910 if (waitqueue_active(&fs_info->replace_wait))
911 wake_up(&fs_info->replace_wait);
912}
913
914void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
915{
916 DEFINE_WAIT(wait);
917again:
918 percpu_counter_inc(&fs_info->bio_counter);
919 if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) {
920 btrfs_bio_counter_dec(fs_info);
921 wait_event(fs_info->replace_wait,
922 !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
923 &fs_info->fs_state));
924 goto again;
925 }
926
927}
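The counter's intended use on the I/O side (the call sites land in volumes.c per the diffstat; this is a sketch of the shape with approximate signatures, not the verbatim hunk): every bio entering the mapping code holds a reference for its whole lifetime, so btrfs_rm_dev_replace_blocked() above can wait for the count to drain before the source device is torn down.

	/* in the submission path */
	btrfs_bio_counter_inc_blocked(fs_info);	/* blocks while a replace finishes */
	ret = __btrfs_map_block(fs_info, rw, logical, &map_length,
				&bbio, mirror_num);
	if (ret) {
		btrfs_bio_counter_dec(fs_info);
		return ret;
	}
	/* submit the bios; the final completion path drops the reference */
	btrfs_bio_counter_dec(fs_info);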
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 81ea55314b1f..029d46c2e170 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -329,6 +329,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
329{ 329{
330 struct extent_state *cached_state = NULL; 330 struct extent_state *cached_state = NULL;
331 int ret; 331 int ret;
332 bool need_lock = (current->journal_info ==
333 (void *)BTRFS_SEND_TRANS_STUB);
332 334
333 if (!parent_transid || btrfs_header_generation(eb) == parent_transid) 335 if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
334 return 0; 336 return 0;
@@ -336,6 +338,11 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
336 if (atomic) 338 if (atomic)
337 return -EAGAIN; 339 return -EAGAIN;
338 340
341 if (need_lock) {
342 btrfs_tree_read_lock(eb);
343 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
344 }
345
339 lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, 346 lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
340 0, &cached_state); 347 0, &cached_state);
341 if (extent_buffer_uptodate(eb) && 348 if (extent_buffer_uptodate(eb) &&
@@ -347,10 +354,21 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
347 "found %llu\n", 354 "found %llu\n",
348 eb->start, parent_transid, btrfs_header_generation(eb)); 355 eb->start, parent_transid, btrfs_header_generation(eb));
349 ret = 1; 356 ret = 1;
350 clear_extent_buffer_uptodate(eb); 357
358 /*
359 * Things reading via commit roots that don't have normal protection,
360 * like send, can have a really old block in cache that may point at a
361 * block that has been free'd and re-allocated. So don't clear uptodate
362 * if we find an eb that is under IO (dirty/writeback) because we could
363 * end up reading in the stale data and then writing it back out and
364 * making everybody very sad.
365 */
366 if (!extent_buffer_under_io(eb))
367 clear_extent_buffer_uptodate(eb);
351out: 368out:
352 unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, 369 unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
353 &cached_state, GFP_NOFS); 370 &cached_state, GFP_NOFS);
371 btrfs_tree_read_unlock_blocking(eb);
354 return ret; 372 return ret;
355} 373}
356 374
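One pairing worth making explicit: the blocking read lock is only taken on the send path (journal_info set to BTRFS_SEND_TRANS_STUB), so the unlock at out: needs the same need_lock guard or a lock that was never taken gets released. The intended shape:

	bool need_lock = (current->journal_info ==
			  (void *)BTRFS_SEND_TRANS_STUB);

	if (need_lock) {
		btrfs_tree_read_lock(eb);
		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
	}
	/* generation check under the extent lock goes here */
out:
	unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
			     &cached_state, GFP_NOFS);
	if (need_lock)
		btrfs_tree_read_unlock_blocking(eb);
	return ret;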
@@ -678,32 +696,31 @@ static void end_workqueue_bio(struct bio *bio, int err)
678 696
679 fs_info = end_io_wq->info; 697 fs_info = end_io_wq->info;
680 end_io_wq->error = err; 698 end_io_wq->error = err;
681 end_io_wq->work.func = end_workqueue_fn; 699 btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
682 end_io_wq->work.flags = 0;
683 700
684 if (bio->bi_rw & REQ_WRITE) { 701 if (bio->bi_rw & REQ_WRITE) {
685 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) 702 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
686 btrfs_queue_worker(&fs_info->endio_meta_write_workers, 703 btrfs_queue_work(fs_info->endio_meta_write_workers,
687 &end_io_wq->work); 704 &end_io_wq->work);
688 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) 705 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
689 btrfs_queue_worker(&fs_info->endio_freespace_worker, 706 btrfs_queue_work(fs_info->endio_freespace_worker,
690 &end_io_wq->work); 707 &end_io_wq->work);
691 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) 708 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
692 btrfs_queue_worker(&fs_info->endio_raid56_workers, 709 btrfs_queue_work(fs_info->endio_raid56_workers,
693 &end_io_wq->work); 710 &end_io_wq->work);
694 else 711 else
695 btrfs_queue_worker(&fs_info->endio_write_workers, 712 btrfs_queue_work(fs_info->endio_write_workers,
696 &end_io_wq->work); 713 &end_io_wq->work);
697 } else { 714 } else {
698 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) 715 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
699 btrfs_queue_worker(&fs_info->endio_raid56_workers, 716 btrfs_queue_work(fs_info->endio_raid56_workers,
700 &end_io_wq->work); 717 &end_io_wq->work);
701 else if (end_io_wq->metadata) 718 else if (end_io_wq->metadata)
702 btrfs_queue_worker(&fs_info->endio_meta_workers, 719 btrfs_queue_work(fs_info->endio_meta_workers,
703 &end_io_wq->work); 720 &end_io_wq->work);
704 else 721 else
705 btrfs_queue_worker(&fs_info->endio_workers, 722 btrfs_queue_work(fs_info->endio_workers,
706 &end_io_wq->work); 723 &end_io_wq->work);
707 } 724 }
708} 725}
709 726
@@ -738,7 +755,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
738unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) 755unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
739{ 756{
740 unsigned long limit = min_t(unsigned long, 757 unsigned long limit = min_t(unsigned long,
741 info->workers.max_workers, 758 info->thread_pool_size,
742 info->fs_devices->open_devices); 759 info->fs_devices->open_devices);
743 return 256 * limit; 760 return 256 * limit;
744} 761}
@@ -811,11 +828,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
811 async->submit_bio_start = submit_bio_start; 828 async->submit_bio_start = submit_bio_start;
812 async->submit_bio_done = submit_bio_done; 829 async->submit_bio_done = submit_bio_done;
813 830
814 async->work.func = run_one_async_start; 831 btrfs_init_work(&async->work, run_one_async_start,
815 async->work.ordered_func = run_one_async_done; 832 run_one_async_done, run_one_async_free);
816 async->work.ordered_free = run_one_async_free;
817 833
818 async->work.flags = 0;
819 async->bio_flags = bio_flags; 834 async->bio_flags = bio_flags;
820 async->bio_offset = bio_offset; 835 async->bio_offset = bio_offset;
821 836
@@ -824,9 +839,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
824 atomic_inc(&fs_info->nr_async_submits); 839 atomic_inc(&fs_info->nr_async_submits);
825 840
826 if (rw & REQ_SYNC) 841 if (rw & REQ_SYNC)
827 btrfs_set_work_high_prio(&async->work); 842 btrfs_set_work_high_priority(&async->work);
828 843
829 btrfs_queue_worker(&fs_info->workers, &async->work); 844 btrfs_queue_work(fs_info->workers, &async->work);
830 845
831 while (atomic_read(&fs_info->async_submit_draining) && 846 while (atomic_read(&fs_info->async_submit_draining) &&
832 atomic_read(&fs_info->nr_async_submits)) { 847 atomic_read(&fs_info->nr_async_submits)) {
@@ -1149,6 +1164,32 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1149 } 1164 }
1150} 1165}
1151 1166
1167static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
1168{
1169 struct btrfs_subvolume_writers *writers;
1170 int ret;
1171
1172 writers = kmalloc(sizeof(*writers), GFP_NOFS);
1173 if (!writers)
1174 return ERR_PTR(-ENOMEM);
1175
1176 ret = percpu_counter_init(&writers->counter, 0);
1177 if (ret < 0) {
1178 kfree(writers);
1179 return ERR_PTR(ret);
1180 }
1181
1182 init_waitqueue_head(&writers->wait);
1183 return writers;
1184}
1185
1186static void
1187btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
1188{
1189 percpu_counter_destroy(&writers->counter);
1190 kfree(writers);
1191}
1192
1152static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, 1193static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1153 u32 stripesize, struct btrfs_root *root, 1194 u32 stripesize, struct btrfs_root *root,
1154 struct btrfs_fs_info *fs_info, 1195 struct btrfs_fs_info *fs_info,
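subv_writers pairs with the btrfs_start_nocow_write()/btrfs_end_nocow_write() declarations added to ctree.h above: nocow writers take a per-cpu reference, and snapshot creation sets will_be_snapshoted and waits for the counter to drain. A hedged sketch of the gate, not the verbatim implementation:

	int btrfs_start_nocow_write(struct btrfs_root *root)
	{
		if (atomic_read(&root->will_be_snapshoted))
			return 0;

		percpu_counter_inc(&root->subv_writers->counter);
		/*
		 * Pair the bump with a re-check so a snapshot starting in
		 * between either sees our count or makes us back off.
		 */
		smp_mb();
		if (atomic_read(&root->will_be_snapshoted)) {
			btrfs_end_nocow_write(root);
			return 0;
		}
		return 1;
	}

	void btrfs_end_nocow_write(struct btrfs_root *root)
	{
		percpu_counter_dec(&root->subv_writers->counter);
		if (waitqueue_active(&root->subv_writers->wait))
			wake_up(&root->subv_writers->wait);
	}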
@@ -1194,16 +1235,22 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1194 spin_lock_init(&root->log_extents_lock[1]); 1235 spin_lock_init(&root->log_extents_lock[1]);
1195 mutex_init(&root->objectid_mutex); 1236 mutex_init(&root->objectid_mutex);
1196 mutex_init(&root->log_mutex); 1237 mutex_init(&root->log_mutex);
1238 mutex_init(&root->ordered_extent_mutex);
1239 mutex_init(&root->delalloc_mutex);
1197 init_waitqueue_head(&root->log_writer_wait); 1240 init_waitqueue_head(&root->log_writer_wait);
1198 init_waitqueue_head(&root->log_commit_wait[0]); 1241 init_waitqueue_head(&root->log_commit_wait[0]);
1199 init_waitqueue_head(&root->log_commit_wait[1]); 1242 init_waitqueue_head(&root->log_commit_wait[1]);
1243 INIT_LIST_HEAD(&root->log_ctxs[0]);
1244 INIT_LIST_HEAD(&root->log_ctxs[1]);
1200 atomic_set(&root->log_commit[0], 0); 1245 atomic_set(&root->log_commit[0], 0);
1201 atomic_set(&root->log_commit[1], 0); 1246 atomic_set(&root->log_commit[1], 0);
1202 atomic_set(&root->log_writers, 0); 1247 atomic_set(&root->log_writers, 0);
1203 atomic_set(&root->log_batch, 0); 1248 atomic_set(&root->log_batch, 0);
1204 atomic_set(&root->orphan_inodes, 0); 1249 atomic_set(&root->orphan_inodes, 0);
1205 atomic_set(&root->refs, 1); 1250 atomic_set(&root->refs, 1);
1251 atomic_set(&root->will_be_snapshoted, 0);
1206 root->log_transid = 0; 1252 root->log_transid = 0;
1253 root->log_transid_committed = -1;
1207 root->last_log_commit = 0; 1254 root->last_log_commit = 0;
1208 if (fs_info) 1255 if (fs_info)
1209 extent_io_tree_init(&root->dirty_log_pages, 1256 extent_io_tree_init(&root->dirty_log_pages,
@@ -1417,6 +1464,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1417 WARN_ON(root->log_root); 1464 WARN_ON(root->log_root);
1418 root->log_root = log_root; 1465 root->log_root = log_root;
1419 root->log_transid = 0; 1466 root->log_transid = 0;
1467 root->log_transid_committed = -1;
1420 root->last_log_commit = 0; 1468 root->last_log_commit = 0;
1421 return 0; 1469 return 0;
1422} 1470}
@@ -1498,6 +1546,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
1498int btrfs_init_fs_root(struct btrfs_root *root) 1546int btrfs_init_fs_root(struct btrfs_root *root)
1499{ 1547{
1500 int ret; 1548 int ret;
1549 struct btrfs_subvolume_writers *writers;
1501 1550
1502 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); 1551 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
1503 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), 1552 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
@@ -1507,15 +1556,24 @@ int btrfs_init_fs_root(struct btrfs_root *root)
1507 goto fail; 1556 goto fail;
1508 } 1557 }
1509 1558
1559 writers = btrfs_alloc_subvolume_writers();
1560 if (IS_ERR(writers)) {
1561 ret = PTR_ERR(writers);
1562 goto fail;
1563 }
1564 root->subv_writers = writers;
1565
1510 btrfs_init_free_ino_ctl(root); 1566 btrfs_init_free_ino_ctl(root);
1511 mutex_init(&root->fs_commit_mutex);
1512 spin_lock_init(&root->cache_lock); 1567 spin_lock_init(&root->cache_lock);
1513 init_waitqueue_head(&root->cache_wait); 1568 init_waitqueue_head(&root->cache_wait);
1514 1569
1515 ret = get_anon_bdev(&root->anon_dev); 1570 ret = get_anon_bdev(&root->anon_dev);
1516 if (ret) 1571 if (ret)
1517 goto fail; 1572 goto free_writers;
1518 return 0; 1573 return 0;
1574
1575free_writers:
1576 btrfs_free_subvolume_writers(root->subv_writers);
1519fail: 1577fail:
1520 kfree(root->free_ino_ctl); 1578 kfree(root->free_ino_ctl);
1521 kfree(root->free_ino_pinned); 1579 kfree(root->free_ino_pinned);
@@ -1990,23 +2048,22 @@ static noinline int next_root_backup(struct btrfs_fs_info *info,
1990/* helper to cleanup workers */ 2048/* helper to cleanup workers */
1991static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) 2049static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
1992{ 2050{
1993 btrfs_stop_workers(&fs_info->generic_worker); 2051 btrfs_destroy_workqueue(fs_info->fixup_workers);
1994 btrfs_stop_workers(&fs_info->fixup_workers); 2052 btrfs_destroy_workqueue(fs_info->delalloc_workers);
1995 btrfs_stop_workers(&fs_info->delalloc_workers); 2053 btrfs_destroy_workqueue(fs_info->workers);
1996 btrfs_stop_workers(&fs_info->workers); 2054 btrfs_destroy_workqueue(fs_info->endio_workers);
1997 btrfs_stop_workers(&fs_info->endio_workers); 2055 btrfs_destroy_workqueue(fs_info->endio_meta_workers);
1998 btrfs_stop_workers(&fs_info->endio_meta_workers); 2056 btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
1999 btrfs_stop_workers(&fs_info->endio_raid56_workers); 2057 btrfs_destroy_workqueue(fs_info->rmw_workers);
2000 btrfs_stop_workers(&fs_info->rmw_workers); 2058 btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
2001 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2059 btrfs_destroy_workqueue(fs_info->endio_write_workers);
2002 btrfs_stop_workers(&fs_info->endio_write_workers); 2060 btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
2003 btrfs_stop_workers(&fs_info->endio_freespace_worker); 2061 btrfs_destroy_workqueue(fs_info->submit_workers);
2004 btrfs_stop_workers(&fs_info->submit_workers); 2062 btrfs_destroy_workqueue(fs_info->delayed_workers);
2005 btrfs_stop_workers(&fs_info->delayed_workers); 2063 btrfs_destroy_workqueue(fs_info->caching_workers);
2006 btrfs_stop_workers(&fs_info->caching_workers); 2064 btrfs_destroy_workqueue(fs_info->readahead_workers);
2007 btrfs_stop_workers(&fs_info->readahead_workers); 2065 btrfs_destroy_workqueue(fs_info->flush_workers);
2008 btrfs_stop_workers(&fs_info->flush_workers); 2066 btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
2009 btrfs_stop_workers(&fs_info->qgroup_rescan_workers);
2010} 2067}
2011 2068
2012static void free_root_extent_buffers(struct btrfs_root *root) 2069static void free_root_extent_buffers(struct btrfs_root *root)
@@ -2097,6 +2154,8 @@ int open_ctree(struct super_block *sb,
2097 int err = -EINVAL; 2154 int err = -EINVAL;
2098 int num_backups_tried = 0; 2155 int num_backups_tried = 0;
2099 int backup_index = 0; 2156 int backup_index = 0;
2157 int max_active;
2158 int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
2100 bool create_uuid_tree; 2159 bool create_uuid_tree;
2101 bool check_uuid_tree; 2160 bool check_uuid_tree;
2102 2161
@@ -2133,10 +2192,16 @@ int open_ctree(struct super_block *sb,
2133 goto fail_dirty_metadata_bytes; 2192 goto fail_dirty_metadata_bytes;
2134 } 2193 }
2135 2194
2195 ret = percpu_counter_init(&fs_info->bio_counter, 0);
2196 if (ret) {
2197 err = ret;
2198 goto fail_delalloc_bytes;
2199 }
2200
2136 fs_info->btree_inode = new_inode(sb); 2201 fs_info->btree_inode = new_inode(sb);
2137 if (!fs_info->btree_inode) { 2202 if (!fs_info->btree_inode) {
2138 err = -ENOMEM; 2203 err = -ENOMEM;
2139 goto fail_delalloc_bytes; 2204 goto fail_bio_counter;
2140 } 2205 }
2141 2206
2142 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); 2207 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -2159,6 +2224,7 @@ int open_ctree(struct super_block *sb,
2159 spin_lock_init(&fs_info->buffer_lock); 2224 spin_lock_init(&fs_info->buffer_lock);
2160 rwlock_init(&fs_info->tree_mod_log_lock); 2225 rwlock_init(&fs_info->tree_mod_log_lock);
2161 mutex_init(&fs_info->reloc_mutex); 2226 mutex_init(&fs_info->reloc_mutex);
2227 mutex_init(&fs_info->delalloc_root_mutex);
2162 seqlock_init(&fs_info->profiles_lock); 2228 seqlock_init(&fs_info->profiles_lock);
2163 2229
2164 init_completion(&fs_info->kobj_unregister); 2230 init_completion(&fs_info->kobj_unregister);
@@ -2211,6 +2277,7 @@ int open_ctree(struct super_block *sb,
2211 atomic_set(&fs_info->scrub_pause_req, 0); 2277 atomic_set(&fs_info->scrub_pause_req, 0);
2212 atomic_set(&fs_info->scrubs_paused, 0); 2278 atomic_set(&fs_info->scrubs_paused, 0);
2213 atomic_set(&fs_info->scrub_cancel_req, 0); 2279 atomic_set(&fs_info->scrub_cancel_req, 0);
2280 init_waitqueue_head(&fs_info->replace_wait);
2214 init_waitqueue_head(&fs_info->scrub_pause_wait); 2281 init_waitqueue_head(&fs_info->scrub_pause_wait);
2215 fs_info->scrub_workers_refcnt = 0; 2282 fs_info->scrub_workers_refcnt = 0;
2216#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 2283#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
@@ -2274,7 +2341,7 @@ int open_ctree(struct super_block *sb,
2274 mutex_init(&fs_info->transaction_kthread_mutex); 2341 mutex_init(&fs_info->transaction_kthread_mutex);
2275 mutex_init(&fs_info->cleaner_mutex); 2342 mutex_init(&fs_info->cleaner_mutex);
2276 mutex_init(&fs_info->volume_mutex); 2343 mutex_init(&fs_info->volume_mutex);
2277 init_rwsem(&fs_info->extent_commit_sem); 2344 init_rwsem(&fs_info->commit_root_sem);
2278 init_rwsem(&fs_info->cleanup_work_sem); 2345 init_rwsem(&fs_info->cleanup_work_sem);
2279 init_rwsem(&fs_info->subvol_sem); 2346 init_rwsem(&fs_info->subvol_sem);
2280 sema_init(&fs_info->uuid_tree_rescan_sem, 1); 2347 sema_init(&fs_info->uuid_tree_rescan_sem, 1);
@@ -2458,104 +2525,68 @@ int open_ctree(struct super_block *sb,
2458 goto fail_alloc; 2525 goto fail_alloc;
2459 } 2526 }
2460 2527
2461 btrfs_init_workers(&fs_info->generic_worker, 2528 max_active = fs_info->thread_pool_size;
2462 "genwork", 1, NULL);
2463
2464 btrfs_init_workers(&fs_info->workers, "worker",
2465 fs_info->thread_pool_size,
2466 &fs_info->generic_worker);
2467 2529
2468 btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", 2530 fs_info->workers =
2469 fs_info->thread_pool_size, NULL); 2531 btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
2532 max_active, 16);
2470 2533
2471 btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc", 2534 fs_info->delalloc_workers =
2472 fs_info->thread_pool_size, NULL); 2535 btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
2473 2536
2474 btrfs_init_workers(&fs_info->submit_workers, "submit", 2537 fs_info->flush_workers =
2475 min_t(u64, fs_devices->num_devices, 2538 btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
2476 fs_info->thread_pool_size), NULL);
2477 2539
2478 btrfs_init_workers(&fs_info->caching_workers, "cache", 2540 fs_info->caching_workers =
2479 fs_info->thread_pool_size, NULL); 2541 btrfs_alloc_workqueue("cache", flags, max_active, 0);
2480 2542
2481 /* a higher idle thresh on the submit workers makes it much more 2543 /*
2544 * a higher idle thresh on the submit workers makes it much more
2482 * likely that bios will be sent down in a sane order to the 2545
2483 * devices 2546 * devices
2484 */ 2547 */
2485 fs_info->submit_workers.idle_thresh = 64; 2548 fs_info->submit_workers =
2486 2549 btrfs_alloc_workqueue("submit", flags,
2487 fs_info->workers.idle_thresh = 16; 2550 min_t(u64, fs_devices->num_devices,
2488 fs_info->workers.ordered = 1; 2551 max_active), 64);
2489 2552
2490 fs_info->delalloc_workers.idle_thresh = 2; 2553 fs_info->fixup_workers =
2491 fs_info->delalloc_workers.ordered = 1; 2554 btrfs_alloc_workqueue("fixup", flags, 1, 0);
2492
2493 btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
2494 &fs_info->generic_worker);
2495 btrfs_init_workers(&fs_info->endio_workers, "endio",
2496 fs_info->thread_pool_size,
2497 &fs_info->generic_worker);
2498 btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
2499 fs_info->thread_pool_size,
2500 &fs_info->generic_worker);
2501 btrfs_init_workers(&fs_info->endio_meta_write_workers,
2502 "endio-meta-write", fs_info->thread_pool_size,
2503 &fs_info->generic_worker);
2504 btrfs_init_workers(&fs_info->endio_raid56_workers,
2505 "endio-raid56", fs_info->thread_pool_size,
2506 &fs_info->generic_worker);
2507 btrfs_init_workers(&fs_info->rmw_workers,
2508 "rmw", fs_info->thread_pool_size,
2509 &fs_info->generic_worker);
2510 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
2511 fs_info->thread_pool_size,
2512 &fs_info->generic_worker);
2513 btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
2514 1, &fs_info->generic_worker);
2515 btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
2516 fs_info->thread_pool_size,
2517 &fs_info->generic_worker);
2518 btrfs_init_workers(&fs_info->readahead_workers, "readahead",
2519 fs_info->thread_pool_size,
2520 &fs_info->generic_worker);
2521 btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1,
2522 &fs_info->generic_worker);
2523 2555
2524 /* 2556 /*
2525 * endios are largely parallel and should have a very 2557 * endios are largely parallel and should have a very
2526 * low idle thresh 2558 * low idle thresh
2527 */ 2559 */
2528 fs_info->endio_workers.idle_thresh = 4; 2560 fs_info->endio_workers =
2529 fs_info->endio_meta_workers.idle_thresh = 4; 2561 btrfs_alloc_workqueue("endio", flags, max_active, 4);
2530 fs_info->endio_raid56_workers.idle_thresh = 4; 2562 fs_info->endio_meta_workers =
2531 fs_info->rmw_workers.idle_thresh = 2; 2563 btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
2532 2564 fs_info->endio_meta_write_workers =
2533 fs_info->endio_write_workers.idle_thresh = 2; 2565 btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
2534 fs_info->endio_meta_write_workers.idle_thresh = 2; 2566 fs_info->endio_raid56_workers =
2535 fs_info->readahead_workers.idle_thresh = 2; 2567 btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
2536 2568 fs_info->rmw_workers =
2537 /* 2569 btrfs_alloc_workqueue("rmw", flags, max_active, 2);
2538 * btrfs_start_workers can really only fail because of ENOMEM so just 2570 fs_info->endio_write_workers =
2539 * return -ENOMEM if any of these fail. 2571 btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
2540 */ 2572 fs_info->endio_freespace_worker =
2541 ret = btrfs_start_workers(&fs_info->workers); 2573 btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
2542 ret |= btrfs_start_workers(&fs_info->generic_worker); 2574 fs_info->delayed_workers =
2543 ret |= btrfs_start_workers(&fs_info->submit_workers); 2575 btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
2544 ret |= btrfs_start_workers(&fs_info->delalloc_workers); 2576 fs_info->readahead_workers =
2545 ret |= btrfs_start_workers(&fs_info->fixup_workers); 2577 btrfs_alloc_workqueue("readahead", flags, max_active, 2);
2546 ret |= btrfs_start_workers(&fs_info->endio_workers); 2578 fs_info->qgroup_rescan_workers =
2547 ret |= btrfs_start_workers(&fs_info->endio_meta_workers); 2579 btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
2548 ret |= btrfs_start_workers(&fs_info->rmw_workers); 2580
2549 ret |= btrfs_start_workers(&fs_info->endio_raid56_workers); 2581 if (!(fs_info->workers && fs_info->delalloc_workers &&
2550 ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); 2582 fs_info->submit_workers && fs_info->flush_workers &&
2551 ret |= btrfs_start_workers(&fs_info->endio_write_workers); 2583 fs_info->endio_workers && fs_info->endio_meta_workers &&
2552 ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); 2584 fs_info->endio_meta_write_workers &&
2553 ret |= btrfs_start_workers(&fs_info->delayed_workers); 2585 fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
2554 ret |= btrfs_start_workers(&fs_info->caching_workers); 2586 fs_info->endio_freespace_worker && fs_info->rmw_workers &&
2555 ret |= btrfs_start_workers(&fs_info->readahead_workers); 2587 fs_info->caching_workers && fs_info->readahead_workers &&
2556 ret |= btrfs_start_workers(&fs_info->flush_workers); 2588 fs_info->fixup_workers && fs_info->delayed_workers &&
2557 ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers); 2589 fs_info->qgroup_rescan_workers)) {
2558 if (ret) {
2559 err = -ENOMEM; 2590 err = -ENOMEM;
2560 goto fail_sb_buffer; 2591 goto fail_sb_buffer;
2561 } 2592 }
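All btrfs_alloc_workqueue() calls above share one shape: name, WQ_* flags, an initial max_active (here thread_pool_size), and a thresh value that feeds the automatic max_active tuning in the new async-thread.c (the exact scaling policy lives there). A sketch of allocating and using one, with example_fn and w as hypothetical stand-ins:

	struct btrfs_workqueue *wq;

	wq = btrfs_alloc_workqueue("example", WQ_MEM_RECLAIM | WQ_FREEZABLE,
				   max_active, 16);
	if (!wq)
		return -ENOMEM;

	btrfs_init_work(&w->work, example_fn, NULL, NULL);
	btrfs_queue_work(wq, &w->work);

	/* at teardown */
	btrfs_destroy_workqueue(wq);

Note that open_ctree() now checks all the allocations in one place and unwinds through btrfs_stop_all_workers(), which is why every queue gets a single NULL test rather than a per-queue error label.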
@@ -2963,6 +2994,8 @@ fail_iput:
2963 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2994 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2964 2995
2965 iput(fs_info->btree_inode); 2996 iput(fs_info->btree_inode);
2997fail_bio_counter:
2998 percpu_counter_destroy(&fs_info->bio_counter);
2966fail_delalloc_bytes: 2999fail_delalloc_bytes:
2967 percpu_counter_destroy(&fs_info->delalloc_bytes); 3000 percpu_counter_destroy(&fs_info->delalloc_bytes);
2968fail_dirty_metadata_bytes: 3001fail_dirty_metadata_bytes:
@@ -3244,6 +3277,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
3244 /* send down all the barriers */ 3277 /* send down all the barriers */
3245 head = &info->fs_devices->devices; 3278 head = &info->fs_devices->devices;
3246 list_for_each_entry_rcu(dev, head, dev_list) { 3279 list_for_each_entry_rcu(dev, head, dev_list) {
3280 if (dev->missing)
3281 continue;
3247 if (!dev->bdev) { 3282 if (!dev->bdev) {
3248 errors_send++; 3283 errors_send++;
3249 continue; 3284 continue;
@@ -3258,6 +3293,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
3258 3293
3259 /* wait for all the barriers */ 3294 /* wait for all the barriers */
3260 list_for_each_entry_rcu(dev, head, dev_list) { 3295 list_for_each_entry_rcu(dev, head, dev_list) {
3296 if (dev->missing)
3297 continue;
3261 if (!dev->bdev) { 3298 if (!dev->bdev) {
3262 errors_wait++; 3299 errors_wait++;
3263 continue; 3300 continue;
@@ -3477,6 +3514,8 @@ static void free_fs_root(struct btrfs_root *root)
3477 root->orphan_block_rsv = NULL; 3514 root->orphan_block_rsv = NULL;
3478 if (root->anon_dev) 3515 if (root->anon_dev)
3479 free_anon_bdev(root->anon_dev); 3516 free_anon_bdev(root->anon_dev);
3517 if (root->subv_writers)
3518 btrfs_free_subvolume_writers(root->subv_writers);
3480 free_extent_buffer(root->node); 3519 free_extent_buffer(root->node);
3481 free_extent_buffer(root->commit_root); 3520 free_extent_buffer(root->commit_root);
3482 kfree(root->free_ino_ctl); 3521 kfree(root->free_ino_ctl);
@@ -3610,6 +3649,7 @@ int close_ctree(struct btrfs_root *root)
3610 3649
3611 percpu_counter_destroy(&fs_info->dirty_metadata_bytes); 3650 percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
3612 percpu_counter_destroy(&fs_info->delalloc_bytes); 3651 percpu_counter_destroy(&fs_info->delalloc_bytes);
3652 percpu_counter_destroy(&fs_info->bio_counter);
3613 bdi_destroy(&fs_info->bdi); 3653 bdi_destroy(&fs_info->bdi);
3614 cleanup_srcu_struct(&fs_info->subvol_srcu); 3654 cleanup_srcu_struct(&fs_info->subvol_srcu);
3615 3655
@@ -3791,9 +3831,11 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
3791 list_move_tail(&root->ordered_root, 3831 list_move_tail(&root->ordered_root,
3792 &fs_info->ordered_roots); 3832 &fs_info->ordered_roots);
3793 3833
3834 spin_unlock(&fs_info->ordered_root_lock);
3794 btrfs_destroy_ordered_extents(root); 3835 btrfs_destroy_ordered_extents(root);
3795 3836
3796 cond_resched_lock(&fs_info->ordered_root_lock); 3837 cond_resched();
3838 spin_lock(&fs_info->ordered_root_lock);
3797 } 3839 }
3798 spin_unlock(&fs_info->ordered_root_lock); 3840 spin_unlock(&fs_info->ordered_root_lock);
3799} 3841}
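The fix above is the stock pattern for calling a sleeping function while walking a spinlock-protected list: park the current entry on a stable list, drop the lock around the call, reschedule, then retake the lock. In sketch form (the splice setup follows the surrounding function):

	spin_lock(&fs_info->ordered_root_lock);
	while (!list_empty(&splice)) {
		root = list_first_entry(&splice, struct btrfs_root,
					ordered_root);
		list_move_tail(&root->ordered_root,
			       &fs_info->ordered_roots);

		spin_unlock(&fs_info->ordered_root_lock);
		btrfs_destroy_ordered_extents(root);	/* may block */

		cond_resched();
		spin_lock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&fs_info->ordered_root_lock);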
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 32312e09f0f5..1306487c82cf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -419,7 +419,7 @@ static noinline void caching_thread(struct btrfs_work *work)
419again: 419again:
420 mutex_lock(&caching_ctl->mutex); 420 mutex_lock(&caching_ctl->mutex);
421 /* need to make sure the commit_root doesn't disappear */ 421 /* need to make sure the commit_root doesn't disappear */
422 down_read(&fs_info->extent_commit_sem); 422 down_read(&fs_info->commit_root_sem);
423 423
424next: 424next:
425 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 425 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
@@ -443,10 +443,10 @@ next:
443 break; 443 break;
444 444
445 if (need_resched() || 445 if (need_resched() ||
446 rwsem_is_contended(&fs_info->extent_commit_sem)) { 446 rwsem_is_contended(&fs_info->commit_root_sem)) {
447 caching_ctl->progress = last; 447 caching_ctl->progress = last;
448 btrfs_release_path(path); 448 btrfs_release_path(path);
449 up_read(&fs_info->extent_commit_sem); 449 up_read(&fs_info->commit_root_sem);
450 mutex_unlock(&caching_ctl->mutex); 450 mutex_unlock(&caching_ctl->mutex);
451 cond_resched(); 451 cond_resched();
452 goto again; 452 goto again;
@@ -513,7 +513,7 @@ next:
513 513
514err: 514err:
515 btrfs_free_path(path); 515 btrfs_free_path(path);
516 up_read(&fs_info->extent_commit_sem); 516 up_read(&fs_info->commit_root_sem);
517 517
518 free_excluded_extents(extent_root, block_group); 518 free_excluded_extents(extent_root, block_group);
519 519
@@ -549,7 +549,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
549 caching_ctl->block_group = cache; 549 caching_ctl->block_group = cache;
550 caching_ctl->progress = cache->key.objectid; 550 caching_ctl->progress = cache->key.objectid;
551 atomic_set(&caching_ctl->count, 1); 551 atomic_set(&caching_ctl->count, 1);
552 caching_ctl->work.func = caching_thread; 552 btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
553 553
554 spin_lock(&cache->lock); 554 spin_lock(&cache->lock);
555 /* 555 /*
@@ -633,14 +633,14 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
633 return 0; 633 return 0;
634 } 634 }
635 635
636 down_write(&fs_info->extent_commit_sem); 636 down_write(&fs_info->commit_root_sem);
637 atomic_inc(&caching_ctl->count); 637 atomic_inc(&caching_ctl->count);
638 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 638 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
639 up_write(&fs_info->extent_commit_sem); 639 up_write(&fs_info->commit_root_sem);
640 640
641 btrfs_get_block_group(cache); 641 btrfs_get_block_group(cache);
642 642
643 btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work); 643 btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
644 644
645 return ret; 645 return ret;
646} 646}
@@ -2444,7 +2444,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2444 spin_unlock(&locked_ref->lock); 2444 spin_unlock(&locked_ref->lock);
2445 spin_lock(&delayed_refs->lock); 2445 spin_lock(&delayed_refs->lock);
2446 spin_lock(&locked_ref->lock); 2446 spin_lock(&locked_ref->lock);
2447 if (rb_first(&locked_ref->ref_root)) { 2447 if (rb_first(&locked_ref->ref_root) ||
2448 locked_ref->extent_op) {
2448 spin_unlock(&locked_ref->lock); 2449 spin_unlock(&locked_ref->lock);
2449 spin_unlock(&delayed_refs->lock); 2450 spin_unlock(&delayed_refs->lock);
2450 continue; 2451 continue;
@@ -3971,7 +3972,7 @@ static int can_overcommit(struct btrfs_root *root,
3971} 3972}
3972 3973
3973static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, 3974static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3974 unsigned long nr_pages) 3975 unsigned long nr_pages, int nr_items)
3975{ 3976{
3976 struct super_block *sb = root->fs_info->sb; 3977 struct super_block *sb = root->fs_info->sb;
3977 3978
@@ -3986,9 +3987,9 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3986 * the filesystem is readonly (all dirty pages are written to 3987
3987 * the disk). 3988 * the disk).
3988 */ 3989 */
3989 btrfs_start_delalloc_roots(root->fs_info, 0); 3990 btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
3990 if (!current->journal_info) 3991 if (!current->journal_info)
3991 btrfs_wait_ordered_roots(root->fs_info, -1); 3992 btrfs_wait_ordered_roots(root->fs_info, nr_items);
3992 } 3993 }
3993} 3994}
3994 3995
@@ -4045,7 +4046,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
4045 while (delalloc_bytes && loops < 3) { 4046 while (delalloc_bytes && loops < 3) {
4046 max_reclaim = min(delalloc_bytes, to_reclaim); 4047 max_reclaim = min(delalloc_bytes, to_reclaim);
4047 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 4048 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
4048 btrfs_writeback_inodes_sb_nr(root, nr_pages); 4049 btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
4049 /* 4050 /*
4050 * We need to wait for the async pages to actually start before 4051 * We need to wait for the async pages to actually start before
4051 * we do anything. 4052 * we do anything.
@@ -4112,13 +4113,9 @@ static int may_commit_transaction(struct btrfs_root *root,
4112 goto commit; 4113 goto commit;
4113 4114
4114 /* See if there is enough pinned space to make this reservation */ 4115 /* See if there is enough pinned space to make this reservation */
4115 spin_lock(&space_info->lock);
4116 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4116 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4117 bytes) >= 0) { 4117 bytes) >= 0)
4118 spin_unlock(&space_info->lock);
4119 goto commit; 4118 goto commit;
4120 }
4121 spin_unlock(&space_info->lock);
4122 4119
4123 /* 4120 /*
4124 * See if there is some space in the delayed insertion reservation for 4121 * See if there is some space in the delayed insertion reservation for
@@ -4127,16 +4124,13 @@ static int may_commit_transaction(struct btrfs_root *root,
4127 if (space_info != delayed_rsv->space_info) 4124 if (space_info != delayed_rsv->space_info)
4128 return -ENOSPC; 4125 return -ENOSPC;
4129 4126
4130 spin_lock(&space_info->lock);
4131 spin_lock(&delayed_rsv->lock); 4127 spin_lock(&delayed_rsv->lock);
4132 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4128 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4133 bytes - delayed_rsv->size) >= 0) { 4129 bytes - delayed_rsv->size) >= 0) {
4134 spin_unlock(&delayed_rsv->lock); 4130 spin_unlock(&delayed_rsv->lock);
4135 spin_unlock(&space_info->lock);
4136 return -ENOSPC; 4131 return -ENOSPC;
4137 } 4132 }
4138 spin_unlock(&delayed_rsv->lock); 4133 spin_unlock(&delayed_rsv->lock);
4139 spin_unlock(&space_info->lock);
4140 4134
4141commit: 4135commit:
4142 trans = btrfs_join_transaction(root); 4136 trans = btrfs_join_transaction(root);
@@ -4181,7 +4175,7 @@ static int flush_space(struct btrfs_root *root,
4181 break; 4175 break;
4182 case FLUSH_DELALLOC: 4176 case FLUSH_DELALLOC:
4183 case FLUSH_DELALLOC_WAIT: 4177 case FLUSH_DELALLOC_WAIT:
4184 shrink_delalloc(root, num_bytes, orig_bytes, 4178 shrink_delalloc(root, num_bytes * 2, orig_bytes,
4185 state == FLUSH_DELALLOC_WAIT); 4179 state == FLUSH_DELALLOC_WAIT);
4186 break; 4180 break;
4187 case ALLOC_CHUNK: 4181 case ALLOC_CHUNK:
@@ -5477,7 +5471,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5477 struct btrfs_block_group_cache *cache; 5471 struct btrfs_block_group_cache *cache;
5478 struct btrfs_space_info *space_info; 5472 struct btrfs_space_info *space_info;
5479 5473
5480 down_write(&fs_info->extent_commit_sem); 5474 down_write(&fs_info->commit_root_sem);
5481 5475
5482 list_for_each_entry_safe(caching_ctl, next, 5476 list_for_each_entry_safe(caching_ctl, next,
5483 &fs_info->caching_block_groups, list) { 5477 &fs_info->caching_block_groups, list) {
@@ -5496,7 +5490,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5496 else 5490 else
5497 fs_info->pinned_extents = &fs_info->freed_extents[0]; 5491 fs_info->pinned_extents = &fs_info->freed_extents[0];
5498 5492
5499 up_write(&fs_info->extent_commit_sem); 5493 up_write(&fs_info->commit_root_sem);
5500 5494
5501 list_for_each_entry_rcu(space_info, &fs_info->space_info, list) 5495 list_for_each_entry_rcu(space_info, &fs_info->space_info, list)
5502 percpu_counter_set(&space_info->total_bytes_pinned, 0); 5496 percpu_counter_set(&space_info->total_bytes_pinned, 0);
@@ -5751,6 +5745,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5751 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", 5745 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
5752 bytenr, parent, root_objectid, owner_objectid, 5746 bytenr, parent, root_objectid, owner_objectid,
5753 owner_offset); 5747 owner_offset);
5748 btrfs_abort_transaction(trans, extent_root, ret);
5749 goto out;
5754 } else { 5750 } else {
5755 btrfs_abort_transaction(trans, extent_root, ret); 5751 btrfs_abort_transaction(trans, extent_root, ret);
5756 goto out; 5752 goto out;
@@ -8262,14 +8258,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
8262 struct btrfs_caching_control *caching_ctl; 8258 struct btrfs_caching_control *caching_ctl;
8263 struct rb_node *n; 8259 struct rb_node *n;
8264 8260
8265 down_write(&info->extent_commit_sem); 8261 down_write(&info->commit_root_sem);
8266 while (!list_empty(&info->caching_block_groups)) { 8262 while (!list_empty(&info->caching_block_groups)) {
8267 caching_ctl = list_entry(info->caching_block_groups.next, 8263 caching_ctl = list_entry(info->caching_block_groups.next,
8268 struct btrfs_caching_control, list); 8264 struct btrfs_caching_control, list);
8269 list_del(&caching_ctl->list); 8265 list_del(&caching_ctl->list);
8270 put_caching_control(caching_ctl); 8266 put_caching_control(caching_ctl);
8271 } 8267 }
8272 up_write(&info->extent_commit_sem); 8268 up_write(&info->commit_root_sem);
8273 8269
8274 spin_lock(&info->block_group_cache_lock); 8270 spin_lock(&info->block_group_cache_lock);
8275 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 8271 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
@@ -8343,9 +8339,15 @@ static void __link_block_group(struct btrfs_space_info *space_info,
8343 struct btrfs_block_group_cache *cache) 8339 struct btrfs_block_group_cache *cache)
8344{ 8340{
8345 int index = get_block_group_index(cache); 8341 int index = get_block_group_index(cache);
8342 bool first = false;
8346 8343
8347 down_write(&space_info->groups_sem); 8344 down_write(&space_info->groups_sem);
8348 if (list_empty(&space_info->block_groups[index])) { 8345 if (list_empty(&space_info->block_groups[index]))
8346 first = true;
8347 list_add_tail(&cache->list, &space_info->block_groups[index]);
8348 up_write(&space_info->groups_sem);
8349
8350 if (first) {
8349 struct kobject *kobj = &space_info->block_group_kobjs[index]; 8351 struct kobject *kobj = &space_info->block_group_kobjs[index];
8350 int ret; 8352 int ret;
8351 8353
@@ -8357,8 +8359,6 @@ static void __link_block_group(struct btrfs_space_info *space_info,
8357 kobject_put(&space_info->kobj); 8359 kobject_put(&space_info->kobj);
8358 } 8360 }
8359 } 8361 }
8360 list_add_tail(&cache->list, &space_info->block_groups[index]);
8361 up_write(&space_info->groups_sem);
8362} 8362}
8363 8363
8364static struct btrfs_block_group_cache * 8364static struct btrfs_block_group_cache *
@@ -8938,3 +8938,38 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8938 range->len = trimmed; 8938 range->len = trimmed;
8939 return ret; 8939 return ret;
8940} 8940}
8941
8942/*
8943 * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(),
 8944 * they are used to prevent some tasks from writing data into the page
 8945 * cache via nocow before the subvolume is snapshotted, while still flushing
 8946 * the data to disk after the snapshot creation.
8947 */
8948void btrfs_end_nocow_write(struct btrfs_root *root)
8949{
8950 percpu_counter_dec(&root->subv_writers->counter);
8951 /*
8952 * Make sure counter is updated before we wake up
8953 * waiters.
8954 */
8955 smp_mb();
8956 if (waitqueue_active(&root->subv_writers->wait))
8957 wake_up(&root->subv_writers->wait);
8958}
8959
8960int btrfs_start_nocow_write(struct btrfs_root *root)
8961{
8962 if (unlikely(atomic_read(&root->will_be_snapshoted)))
8963 return 0;
8964
8965 percpu_counter_inc(&root->subv_writers->counter);
8966 /*
8967 * Make sure counter is updated before we check for snapshot creation.
8968 */
8969 smp_mb();
8970 if (unlikely(atomic_read(&root->will_be_snapshoted))) {
8971 btrfs_end_nocow_write(root);
8972 return 0;
8973 }
8974 return 1;
8975}
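
The btrfs_start_nocow_write()/btrfs_end_nocow_write() pair added above is a
counter-and-flag gate: a writer bumps the counter and then re-checks the
snapshot flag behind a full barrier, so the snapshot side can set the flag
first and then wait for the counter to drain; the second check catches a flag
set between the writer's first check and its increment. A minimal userspace
sketch of the same handshake, assuming C11 atomics (all names here are
illustrative stand-ins, not the btrfs API):

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int will_be_snapshotted; /* analog of root->will_be_snapshoted */
    static atomic_int writers;             /* analog of root->subv_writers->counter */

    /* Returns 1 if the caller may proceed with a nocow write, 0 otherwise. */
    static int start_nocow_write(void)
    {
            if (atomic_load(&will_be_snapshotted))
                    return 0;
            atomic_fetch_add(&writers, 1);
            /* seq_cst ordering here plays the role of the smp_mb() pairing */
            if (atomic_load(&will_be_snapshotted)) {
                    atomic_fetch_sub(&writers, 1);
                    return 0;
            }
            return 1;
    }

    static void end_nocow_write(void)
    {
            /* the real code also wakes waiters; a snapshot waits for writers == 0 */
            atomic_fetch_sub(&writers, 1);
    }

    int main(void)
    {
            if (start_nocow_write()) {
                    puts("nocow write allowed");
                    end_nocow_write();
            }
            atomic_store(&will_be_snapshotted, 1);
            printf("after flag is set: %d\n", start_nocow_write()); /* 0 */
            return 0;
    }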
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 85bbd01f1271..3955e475ceec 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -229,12 +229,14 @@ void free_extent_state(struct extent_state *state)
229 } 229 }
230} 230}
231 231
232static struct rb_node *tree_insert(struct rb_root *root, u64 offset, 232static struct rb_node *tree_insert(struct rb_root *root,
233 struct rb_node *search_start,
234 u64 offset,
233 struct rb_node *node, 235 struct rb_node *node,
234 struct rb_node ***p_in, 236 struct rb_node ***p_in,
235 struct rb_node **parent_in) 237 struct rb_node **parent_in)
236{ 238{
237 struct rb_node **p = &root->rb_node; 239 struct rb_node **p;
238 struct rb_node *parent = NULL; 240 struct rb_node *parent = NULL;
239 struct tree_entry *entry; 241 struct tree_entry *entry;
240 242
@@ -244,6 +246,7 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
244 goto do_insert; 246 goto do_insert;
245 } 247 }
246 248
249 p = search_start ? &search_start : &root->rb_node;
247 while (*p) { 250 while (*p) {
248 parent = *p; 251 parent = *p;
249 entry = rb_entry(parent, struct tree_entry, rb_node); 252 entry = rb_entry(parent, struct tree_entry, rb_node);
@@ -430,7 +433,7 @@ static int insert_state(struct extent_io_tree *tree,
430 433
431 set_state_bits(tree, state, bits); 434 set_state_bits(tree, state, bits);
432 435
433 node = tree_insert(&tree->state, end, &state->rb_node, p, parent); 436 node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
434 if (node) { 437 if (node) {
435 struct extent_state *found; 438 struct extent_state *found;
436 found = rb_entry(node, struct extent_state, rb_node); 439 found = rb_entry(node, struct extent_state, rb_node);
@@ -477,8 +480,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
477 prealloc->state = orig->state; 480 prealloc->state = orig->state;
478 orig->start = split; 481 orig->start = split;
479 482
480 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node, 483 node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
481 NULL, NULL); 484 &prealloc->rb_node, NULL, NULL);
482 if (node) { 485 if (node) {
483 free_extent_state(prealloc); 486 free_extent_state(prealloc);
484 return -EEXIST; 487 return -EEXIST;
@@ -746,6 +749,7 @@ again:
746 * our range starts 749 * our range starts
747 */ 750 */
748 node = tree_search(tree, start); 751 node = tree_search(tree, start);
752process_node:
749 if (!node) 753 if (!node)
750 break; 754 break;
751 755
@@ -766,7 +770,10 @@ again:
766 if (start > end) 770 if (start > end)
767 break; 771 break;
768 772
769 cond_resched_lock(&tree->lock); 773 if (!cond_resched_lock(&tree->lock)) {
774 node = rb_next(node);
775 goto process_node;
776 }
770 } 777 }
771out: 778out:
772 spin_unlock(&tree->lock); 779 spin_unlock(&tree->lock);
@@ -2757,7 +2764,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
2757 2764
2758 if (em_cached && *em_cached) { 2765 if (em_cached && *em_cached) {
2759 em = *em_cached; 2766 em = *em_cached;
2760 if (em->in_tree && start >= em->start && 2767 if (extent_map_in_tree(em) && start >= em->start &&
2761 start < extent_map_end(em)) { 2768 start < extent_map_end(em)) {
2762 atomic_inc(&em->refs); 2769 atomic_inc(&em->refs);
2763 return em; 2770 return em;
@@ -4303,7 +4310,7 @@ static void __free_extent_buffer(struct extent_buffer *eb)
4303 kmem_cache_free(extent_buffer_cache, eb); 4310 kmem_cache_free(extent_buffer_cache, eb);
4304} 4311}
4305 4312
4306static int extent_buffer_under_io(struct extent_buffer *eb) 4313int extent_buffer_under_io(struct extent_buffer *eb)
4307{ 4314{
4308 return (atomic_read(&eb->io_pages) || 4315 return (atomic_read(&eb->io_pages) ||
4309 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || 4316 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
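
Two related micro-optimizations appear in this file: tree_insert() now takes a
search_start hint so split_state() can begin its descent at the neighboring
node instead of the root, and the wait loop above only re-searches when
cond_resched_lock() reports that the lock was actually dropped; if the lock
was held throughout, the rb_node cursor is still valid and the walk resumes
with rb_next(). A toy sketch of that resume-or-re-search pattern over a sorted
list, where maybe_yield() is a purely illustrative stand-in for
cond_resched_lock():

    #include <stdio.h>

    struct item { int key; struct item *next; };

    static struct item c = { 30, NULL }, b = { 20, &c }, a = { 10, &b };
    static struct item *head = &a;

    static struct item *search(int key)      /* analog of tree_search() */
    {
            struct item *it = head;

            while (it && it->key < key)
                    it = it->next;
            return it;
    }

    static int maybe_yield(void)             /* stands in for cond_resched_lock(): */
    {                                        /* nonzero means the lock was dropped */
            static int n;

            return ++n % 2;
    }

    int main(void)
    {
            int start = 10;
            struct item *it = search(start);

    process_node:
            if (!it)
                    return 0;
            printf("visit %d\n", it->key);
            start = it->key + 1;
            if (!maybe_yield()) {            /* lock held: cursor still valid */
                    it = it->next;           /* analog of rb_next(node) */
                    goto process_node;
            }
            it = search(start);              /* lock dropped: re-search by key */
            goto process_node;
    }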
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 58b27e5ab521..c488b45237bf 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -320,6 +320,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb);
320int set_extent_buffer_uptodate(struct extent_buffer *eb); 320int set_extent_buffer_uptodate(struct extent_buffer *eb);
321int clear_extent_buffer_uptodate(struct extent_buffer *eb); 321int clear_extent_buffer_uptodate(struct extent_buffer *eb);
322int extent_buffer_uptodate(struct extent_buffer *eb); 322int extent_buffer_uptodate(struct extent_buffer *eb);
323int extent_buffer_under_io(struct extent_buffer *eb);
323int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, 324int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
324 unsigned long min_len, char **map, 325 unsigned long min_len, char **map,
325 unsigned long *map_start, 326 unsigned long *map_start,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 996ad56b57db..1874aee69c86 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -51,7 +51,7 @@ struct extent_map *alloc_extent_map(void)
51 em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS); 51 em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
52 if (!em) 52 if (!em)
53 return NULL; 53 return NULL;
54 em->in_tree = 0; 54 RB_CLEAR_NODE(&em->rb_node);
55 em->flags = 0; 55 em->flags = 0;
56 em->compress_type = BTRFS_COMPRESS_NONE; 56 em->compress_type = BTRFS_COMPRESS_NONE;
57 em->generation = 0; 57 em->generation = 0;
@@ -73,7 +73,7 @@ void free_extent_map(struct extent_map *em)
73 return; 73 return;
74 WARN_ON(atomic_read(&em->refs) == 0); 74 WARN_ON(atomic_read(&em->refs) == 0);
75 if (atomic_dec_and_test(&em->refs)) { 75 if (atomic_dec_and_test(&em->refs)) {
76 WARN_ON(em->in_tree); 76 WARN_ON(extent_map_in_tree(em));
77 WARN_ON(!list_empty(&em->list)); 77 WARN_ON(!list_empty(&em->list));
78 kmem_cache_free(extent_map_cache, em); 78 kmem_cache_free(extent_map_cache, em);
79 } 79 }
@@ -99,8 +99,6 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
99 parent = *p; 99 parent = *p;
100 entry = rb_entry(parent, struct extent_map, rb_node); 100 entry = rb_entry(parent, struct extent_map, rb_node);
101 101
102 WARN_ON(!entry->in_tree);
103
104 if (em->start < entry->start) 102 if (em->start < entry->start)
105 p = &(*p)->rb_left; 103 p = &(*p)->rb_left;
106 else if (em->start >= extent_map_end(entry)) 104 else if (em->start >= extent_map_end(entry))
@@ -128,7 +126,6 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
128 if (end > entry->start && em->start < extent_map_end(entry)) 126 if (end > entry->start && em->start < extent_map_end(entry))
129 return -EEXIST; 127 return -EEXIST;
130 128
131 em->in_tree = 1;
132 rb_link_node(&em->rb_node, orig_parent, p); 129 rb_link_node(&em->rb_node, orig_parent, p);
133 rb_insert_color(&em->rb_node, root); 130 rb_insert_color(&em->rb_node, root);
134 return 0; 131 return 0;
@@ -153,8 +150,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
153 prev = n; 150 prev = n;
154 prev_entry = entry; 151 prev_entry = entry;
155 152
156 WARN_ON(!entry->in_tree);
157
158 if (offset < entry->start) 153 if (offset < entry->start)
159 n = n->rb_left; 154 n = n->rb_left;
160 else if (offset >= extent_map_end(entry)) 155 else if (offset >= extent_map_end(entry))
@@ -240,12 +235,12 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
240 em->len += merge->len; 235 em->len += merge->len;
241 em->block_len += merge->block_len; 236 em->block_len += merge->block_len;
242 em->block_start = merge->block_start; 237 em->block_start = merge->block_start;
243 merge->in_tree = 0;
244 em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; 238 em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
245 em->mod_start = merge->mod_start; 239 em->mod_start = merge->mod_start;
246 em->generation = max(em->generation, merge->generation); 240 em->generation = max(em->generation, merge->generation);
247 241
248 rb_erase(&merge->rb_node, &tree->map); 242 rb_erase(&merge->rb_node, &tree->map);
243 RB_CLEAR_NODE(&merge->rb_node);
249 free_extent_map(merge); 244 free_extent_map(merge);
250 } 245 }
251 } 246 }
@@ -257,7 +252,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
257 em->len += merge->len; 252 em->len += merge->len;
258 em->block_len += merge->block_len; 253 em->block_len += merge->block_len;
259 rb_erase(&merge->rb_node, &tree->map); 254 rb_erase(&merge->rb_node, &tree->map);
260 merge->in_tree = 0; 255 RB_CLEAR_NODE(&merge->rb_node);
261 em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; 256 em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
262 em->generation = max(em->generation, merge->generation); 257 em->generation = max(em->generation, merge->generation);
263 free_extent_map(merge); 258 free_extent_map(merge);
@@ -319,7 +314,21 @@ out:
319void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) 314void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
320{ 315{
321 clear_bit(EXTENT_FLAG_LOGGING, &em->flags); 316 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
322 if (em->in_tree) 317 if (extent_map_in_tree(em))
318 try_merge_map(tree, em);
319}
320
321static inline void setup_extent_mapping(struct extent_map_tree *tree,
322 struct extent_map *em,
323 int modified)
324{
325 atomic_inc(&em->refs);
326 em->mod_start = em->start;
327 em->mod_len = em->len;
328
329 if (modified)
330 list_move(&em->list, &tree->modified_extents);
331 else
323 try_merge_map(tree, em); 332 try_merge_map(tree, em);
324} 333}
325 334
@@ -342,15 +351,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
342 if (ret) 351 if (ret)
343 goto out; 352 goto out;
344 353
345 atomic_inc(&em->refs); 354 setup_extent_mapping(tree, em, modified);
346
347 em->mod_start = em->start;
348 em->mod_len = em->len;
349
350 if (modified)
351 list_move(&em->list, &tree->modified_extents);
352 else
353 try_merge_map(tree, em);
354out: 355out:
355 return ret; 356 return ret;
356} 357}
@@ -434,6 +435,21 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
434 rb_erase(&em->rb_node, &tree->map); 435 rb_erase(&em->rb_node, &tree->map);
435 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) 436 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
436 list_del_init(&em->list); 437 list_del_init(&em->list);
437 em->in_tree = 0; 438 RB_CLEAR_NODE(&em->rb_node);
438 return ret; 439 return ret;
439} 440}
441
442void replace_extent_mapping(struct extent_map_tree *tree,
443 struct extent_map *cur,
444 struct extent_map *new,
445 int modified)
446{
447 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
448 ASSERT(extent_map_in_tree(cur));
449 if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
450 list_del_init(&cur->list);
451 rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map);
452 RB_CLEAR_NODE(&cur->rb_node);
453
454 setup_extent_mapping(tree, new, modified);
455}
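
The new replace_extent_mapping() leans on rb_replace_node(), which splices the
replacement into the old node's exact position with no rebalancing; that is
only correct because the new extent map covers the same range as the one it
replaces, so the tree ordering is unchanged. A doubly-linked analog of that
in-place splice, purely illustrative and not kernel API:

    #include <stdio.h>

    struct node { int start, len; struct node *prev, *next; };

    /*
     * Splice @new into @cur's position; valid only if both describe the
     * same [start, start + len) range, so ordering is preserved.
     */
    static void replace_node(struct node *cur, struct node *new)
    {
            new->prev = cur->prev;
            new->next = cur->next;
            if (cur->prev)
                    cur->prev->next = new;
            if (cur->next)
                    cur->next->prev = new;
            cur->prev = cur->next = NULL;    /* analog of RB_CLEAR_NODE() */
    }

    int main(void)
    {
            struct node a = { 0, 4096, NULL, NULL };
            struct node b = { 4096, 4096, &a, NULL };
            struct node b2 = { 4096, 4096, NULL, NULL };

            a.next = &b;
            replace_node(&b, &b2);
            printf("a.next->start = %d\n", a.next->start); /* 4096, via b2 now */
            return 0;
    }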
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 93fba716d7f8..e7fd8a56a140 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -33,7 +33,6 @@ struct extent_map {
33 unsigned long flags; 33 unsigned long flags;
34 struct block_device *bdev; 34 struct block_device *bdev;
35 atomic_t refs; 35 atomic_t refs;
36 unsigned int in_tree;
37 unsigned int compress_type; 36 unsigned int compress_type;
38 struct list_head list; 37 struct list_head list;
39}; 38};
@@ -44,6 +43,11 @@ struct extent_map_tree {
44 rwlock_t lock; 43 rwlock_t lock;
45}; 44};
46 45
46static inline int extent_map_in_tree(const struct extent_map *em)
47{
48 return !RB_EMPTY_NODE(&em->rb_node);
49}
50
47static inline u64 extent_map_end(struct extent_map *em) 51static inline u64 extent_map_end(struct extent_map *em)
48{ 52{
49 if (em->start + em->len < em->start) 53 if (em->start + em->len < em->start)
@@ -64,6 +68,10 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
64int add_extent_mapping(struct extent_map_tree *tree, 68int add_extent_mapping(struct extent_map_tree *tree,
65 struct extent_map *em, int modified); 69 struct extent_map *em, int modified);
66int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); 70int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
71void replace_extent_mapping(struct extent_map_tree *tree,
72 struct extent_map *cur,
73 struct extent_map *new,
74 int modified);
67 75
68struct extent_map *alloc_extent_map(void); 76struct extent_map *alloc_extent_map(void);
69void free_extent_map(struct extent_map *em); 77void free_extent_map(struct extent_map *em);
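
Dropping the in_tree field works because the rbtree node can encode
detachment itself: RB_CLEAR_NODE() points the node's parent slot back at the
node and RB_EMPTY_NODE() tests for that self-pointer, which is exactly what
the new extent_map_in_tree() helper wraps. A compilable sketch of the
self-pointer sentinel idiom (the *ish names are hypothetical):

    #include <stdio.h>

    struct rb_nodeish { struct rb_nodeish *parent; };

    #define NODE_CLEAR(n) ((n)->parent = (n))   /* mark as detached */
    #define NODE_EMPTY(n) ((n)->parent == (n))  /* nonzero if detached */

    struct extent_mapish { struct rb_nodeish rb; int start; };

    static int in_tree(struct extent_mapish *em) /* like extent_map_in_tree() */
    {
            return !NODE_EMPTY(&em->rb);
    }

    int main(void)
    {
            struct extent_mapish em = { .start = 0 };

            NODE_CLEAR(&em.rb);                    /* freshly allocated state */
            printf("in tree: %d\n", in_tree(&em)); /* 0 */
            em.rb.parent = NULL;                   /* pretend it was linked */
            printf("in tree: %d\n", in_tree(&em)); /* 1 */
            return 0;
    }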
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0165b8672f09..eb742c07e7a4 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -425,13 +425,8 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
425 struct page *page = prepared_pages[pg]; 425 struct page *page = prepared_pages[pg];
426 /* 426 /*
427 * Copy data from userspace to the current page 427 * Copy data from userspace to the current page
428 *
429 * Disable pagefault to avoid recursive lock since
430 * the pages are already locked
431 */ 428 */
432 pagefault_disable();
433 copied = iov_iter_copy_from_user_atomic(page, i, offset, count); 429 copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
434 pagefault_enable();
435 430
436 /* Flush processor's dcache for this page */ 431 /* Flush processor's dcache for this page */
437 flush_dcache_page(page); 432 flush_dcache_page(page);
@@ -591,7 +586,6 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
591 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 586 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
592 clear_bit(EXTENT_FLAG_LOGGING, &flags); 587 clear_bit(EXTENT_FLAG_LOGGING, &flags);
593 modified = !list_empty(&em->list); 588 modified = !list_empty(&em->list);
594 remove_extent_mapping(em_tree, em);
595 if (no_splits) 589 if (no_splits)
596 goto next; 590 goto next;
597 591
@@ -622,8 +616,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
622 split->bdev = em->bdev; 616 split->bdev = em->bdev;
623 split->flags = flags; 617 split->flags = flags;
624 split->compress_type = em->compress_type; 618 split->compress_type = em->compress_type;
625 ret = add_extent_mapping(em_tree, split, modified); 619 replace_extent_mapping(em_tree, em, split, modified);
626 BUG_ON(ret); /* Logic error */
627 free_extent_map(split); 620 free_extent_map(split);
628 split = split2; 621 split = split2;
629 split2 = NULL; 622 split2 = NULL;
@@ -661,12 +654,20 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
661 split->orig_block_len = 0; 654 split->orig_block_len = 0;
662 } 655 }
663 656
664 ret = add_extent_mapping(em_tree, split, modified); 657 if (extent_map_in_tree(em)) {
665 BUG_ON(ret); /* Logic error */ 658 replace_extent_mapping(em_tree, em, split,
659 modified);
660 } else {
661 ret = add_extent_mapping(em_tree, split,
662 modified);
663 ASSERT(ret == 0); /* Logic error */
664 }
666 free_extent_map(split); 665 free_extent_map(split);
667 split = NULL; 666 split = NULL;
668 } 667 }
669next: 668next:
669 if (extent_map_in_tree(em))
670 remove_extent_mapping(em_tree, em);
670 write_unlock(&em_tree->lock); 671 write_unlock(&em_tree->lock);
671 672
672 /* once for us */ 673 /* once for us */
@@ -720,7 +721,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
720 if (drop_cache) 721 if (drop_cache)
721 btrfs_drop_extent_cache(inode, start, end - 1, 0); 722 btrfs_drop_extent_cache(inode, start, end - 1, 0);
722 723
723 if (start >= BTRFS_I(inode)->disk_i_size) 724 if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
724 modify_tree = 0; 725 modify_tree = 0;
725 726
726 while (1) { 727 while (1) {
@@ -798,7 +799,10 @@ next_slot:
798 */ 799 */
799 if (start > key.offset && end < extent_end) { 800 if (start > key.offset && end < extent_end) {
800 BUG_ON(del_nr > 0); 801 BUG_ON(del_nr > 0);
801 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 802 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
803 ret = -EINVAL;
804 break;
805 }
802 806
803 memcpy(&new_key, &key, sizeof(new_key)); 807 memcpy(&new_key, &key, sizeof(new_key));
804 new_key.offset = start; 808 new_key.offset = start;
@@ -841,7 +845,10 @@ next_slot:
841 * | -------- extent -------- | 845 * | -------- extent -------- |
842 */ 846 */
843 if (start <= key.offset && end < extent_end) { 847 if (start <= key.offset && end < extent_end) {
844 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 848 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
849 ret = -EINVAL;
850 break;
851 }
845 852
846 memcpy(&new_key, &key, sizeof(new_key)); 853 memcpy(&new_key, &key, sizeof(new_key));
847 new_key.offset = end; 854 new_key.offset = end;
@@ -864,7 +871,10 @@ next_slot:
864 */ 871 */
865 if (start > key.offset && end >= extent_end) { 872 if (start > key.offset && end >= extent_end) {
866 BUG_ON(del_nr > 0); 873 BUG_ON(del_nr > 0);
867 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 874 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
875 ret = -EINVAL;
876 break;
877 }
868 878
869 btrfs_set_file_extent_num_bytes(leaf, fi, 879 btrfs_set_file_extent_num_bytes(leaf, fi,
870 start - key.offset); 880 start - key.offset);
@@ -938,34 +948,42 @@ next_slot:
938 * Set path->slots[0] to first slot, so that after the delete 948 * Set path->slots[0] to first slot, so that after the delete
 939 * if items are moved off from our leaf to its immediate left or 949 * if items are moved off from our leaf to its immediate left or
940 * right neighbor leafs, we end up with a correct and adjusted 950 * right neighbor leafs, we end up with a correct and adjusted
941 * path->slots[0] for our insertion. 951 * path->slots[0] for our insertion (if replace_extent != 0).
942 */ 952 */
943 path->slots[0] = del_slot; 953 path->slots[0] = del_slot;
944 ret = btrfs_del_items(trans, root, path, del_slot, del_nr); 954 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
945 if (ret) 955 if (ret)
946 btrfs_abort_transaction(trans, root, ret); 956 btrfs_abort_transaction(trans, root, ret);
957 }
947 958
948 leaf = path->nodes[0]; 959 leaf = path->nodes[0];
949 /* 960 /*
950 * leaf eb has flag EXTENT_BUFFER_STALE if it was deleted (that 961 * If btrfs_del_items() was called, it might have deleted a leaf, in
951 * is, its contents got pushed to its neighbors), in which case 962 * which case it unlocked our path, so check path->locks[0] matches a
952 * it means path->locks[0] == 0 963 * write lock.
953 */ 964 */
954 if (!ret && replace_extent && leafs_visited == 1 && 965 if (!ret && replace_extent && leafs_visited == 1 &&
955 path->locks[0] && 966 (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
956 btrfs_leaf_free_space(root, leaf) >= 967 path->locks[0] == BTRFS_WRITE_LOCK) &&
957 sizeof(struct btrfs_item) + extent_item_size) { 968 btrfs_leaf_free_space(root, leaf) >=
958 969 sizeof(struct btrfs_item) + extent_item_size) {
959 key.objectid = ino; 970
960 key.type = BTRFS_EXTENT_DATA_KEY; 971 key.objectid = ino;
961 key.offset = start; 972 key.type = BTRFS_EXTENT_DATA_KEY;
962 setup_items_for_insert(root, path, &key, 973 key.offset = start;
963 &extent_item_size, 974 if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
964 extent_item_size, 975 struct btrfs_key slot_key;
965 sizeof(struct btrfs_item) + 976
966 extent_item_size, 1); 977 btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
967 *key_inserted = 1; 978 if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
979 path->slots[0]++;
968 } 980 }
981 setup_items_for_insert(root, path, &key,
982 &extent_item_size,
983 extent_item_size,
984 sizeof(struct btrfs_item) +
985 extent_item_size, 1);
986 *key_inserted = 1;
969 } 987 }
970 988
971 if (!replace_extent || !(*key_inserted)) 989 if (!replace_extent || !(*key_inserted))
@@ -1346,11 +1364,11 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
1346 struct btrfs_ordered_extent *ordered; 1364 struct btrfs_ordered_extent *ordered;
1347 lock_extent_bits(&BTRFS_I(inode)->io_tree, 1365 lock_extent_bits(&BTRFS_I(inode)->io_tree,
1348 start_pos, last_pos, 0, cached_state); 1366 start_pos, last_pos, 0, cached_state);
1349 ordered = btrfs_lookup_first_ordered_extent(inode, last_pos); 1367 ordered = btrfs_lookup_ordered_range(inode, start_pos,
1368 last_pos - start_pos + 1);
1350 if (ordered && 1369 if (ordered &&
1351 ordered->file_offset + ordered->len > start_pos && 1370 ordered->file_offset + ordered->len > start_pos &&
1352 ordered->file_offset <= last_pos) { 1371 ordered->file_offset <= last_pos) {
1353 btrfs_put_ordered_extent(ordered);
1354 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 1372 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1355 start_pos, last_pos, 1373 start_pos, last_pos,
1356 cached_state, GFP_NOFS); 1374 cached_state, GFP_NOFS);
@@ -1358,12 +1376,9 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
1358 unlock_page(pages[i]); 1376 unlock_page(pages[i]);
1359 page_cache_release(pages[i]); 1377 page_cache_release(pages[i]);
1360 } 1378 }
1361 ret = btrfs_wait_ordered_range(inode, start_pos, 1379 btrfs_start_ordered_extent(inode, ordered, 1);
1362 last_pos - start_pos + 1); 1380 btrfs_put_ordered_extent(ordered);
1363 if (ret) 1381 return -EAGAIN;
1364 return ret;
1365 else
1366 return -EAGAIN;
1367 } 1382 }
1368 if (ordered) 1383 if (ordered)
1369 btrfs_put_ordered_extent(ordered); 1384 btrfs_put_ordered_extent(ordered);
@@ -1396,8 +1411,12 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1396 u64 num_bytes; 1411 u64 num_bytes;
1397 int ret; 1412 int ret;
1398 1413
1414 ret = btrfs_start_nocow_write(root);
1415 if (!ret)
1416 return -ENOSPC;
1417
1399 lockstart = round_down(pos, root->sectorsize); 1418 lockstart = round_down(pos, root->sectorsize);
1400 lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1; 1419 lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;
1401 1420
1402 while (1) { 1421 while (1) {
1403 lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); 1422 lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
@@ -1415,12 +1434,10 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1415 ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); 1434 ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
1416 if (ret <= 0) { 1435 if (ret <= 0) {
1417 ret = 0; 1436 ret = 0;
1437 btrfs_end_nocow_write(root);
1418 } else { 1438 } else {
 1419 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, 1439 *write_bytes = min_t(size_t, *write_bytes,
1420 EXTENT_DIRTY | EXTENT_DELALLOC | 1440 num_bytes - pos + lockstart);
1421 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
1422 NULL, GFP_NOFS);
1423 *write_bytes = min_t(size_t, *write_bytes, num_bytes);
1424 } 1441 }
1425 1442
1426 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); 1443 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
@@ -1510,6 +1527,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1510 if (!only_release_metadata) 1527 if (!only_release_metadata)
1511 btrfs_free_reserved_data_space(inode, 1528 btrfs_free_reserved_data_space(inode,
1512 reserve_bytes); 1529 reserve_bytes);
1530 else
1531 btrfs_end_nocow_write(root);
1513 break; 1532 break;
1514 } 1533 }
1515 1534
@@ -1598,6 +1617,9 @@ again:
1598 } 1617 }
1599 1618
1600 release_bytes = 0; 1619 release_bytes = 0;
1620 if (only_release_metadata)
1621 btrfs_end_nocow_write(root);
1622
1601 if (only_release_metadata && copied > 0) { 1623 if (only_release_metadata && copied > 0) {
1602 u64 lockstart = round_down(pos, root->sectorsize); 1624 u64 lockstart = round_down(pos, root->sectorsize);
1603 u64 lockend = lockstart + 1625 u64 lockend = lockstart +
@@ -1624,10 +1646,12 @@ again:
1624 kfree(pages); 1646 kfree(pages);
1625 1647
1626 if (release_bytes) { 1648 if (release_bytes) {
1627 if (only_release_metadata) 1649 if (only_release_metadata) {
1650 btrfs_end_nocow_write(root);
1628 btrfs_delalloc_release_metadata(inode, release_bytes); 1651 btrfs_delalloc_release_metadata(inode, release_bytes);
1629 else 1652 } else {
1630 btrfs_delalloc_release_space(inode, release_bytes); 1653 btrfs_delalloc_release_space(inode, release_bytes);
1654 }
1631 } 1655 }
1632 1656
1633 return num_written ? num_written : ret; 1657 return num_written ? num_written : ret;
@@ -1636,7 +1660,7 @@ again:
1636static ssize_t __btrfs_direct_write(struct kiocb *iocb, 1660static ssize_t __btrfs_direct_write(struct kiocb *iocb,
1637 const struct iovec *iov, 1661 const struct iovec *iov,
1638 unsigned long nr_segs, loff_t pos, 1662 unsigned long nr_segs, loff_t pos,
1639 loff_t *ppos, size_t count, size_t ocount) 1663 size_t count, size_t ocount)
1640{ 1664{
1641 struct file *file = iocb->ki_filp; 1665 struct file *file = iocb->ki_filp;
1642 struct iov_iter i; 1666 struct iov_iter i;
@@ -1645,7 +1669,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
1645 loff_t endbyte; 1669 loff_t endbyte;
1646 int err; 1670 int err;
1647 1671
1648 written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos, 1672 written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
1649 count, ocount); 1673 count, ocount);
1650 1674
1651 if (written < 0 || written == count) 1675 if (written < 0 || written == count)
@@ -1664,7 +1688,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
1664 if (err) 1688 if (err)
1665 goto out; 1689 goto out;
1666 written += written_buffered; 1690 written += written_buffered;
1667 *ppos = pos + written_buffered; 1691 iocb->ki_pos = pos + written_buffered;
1668 invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT, 1692 invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
1669 endbyte >> PAGE_CACHE_SHIFT); 1693 endbyte >> PAGE_CACHE_SHIFT);
1670out: 1694out:
@@ -1696,8 +1720,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1696 struct file *file = iocb->ki_filp; 1720 struct file *file = iocb->ki_filp;
1697 struct inode *inode = file_inode(file); 1721 struct inode *inode = file_inode(file);
1698 struct btrfs_root *root = BTRFS_I(inode)->root; 1722 struct btrfs_root *root = BTRFS_I(inode)->root;
1699 loff_t *ppos = &iocb->ki_pos;
1700 u64 start_pos; 1723 u64 start_pos;
1724 u64 end_pos;
1701 ssize_t num_written = 0; 1725 ssize_t num_written = 0;
1702 ssize_t err = 0; 1726 ssize_t err = 0;
1703 size_t count, ocount; 1727 size_t count, ocount;
@@ -1752,7 +1776,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1752 1776
1753 start_pos = round_down(pos, root->sectorsize); 1777 start_pos = round_down(pos, root->sectorsize);
1754 if (start_pos > i_size_read(inode)) { 1778 if (start_pos > i_size_read(inode)) {
 1755 err = btrfs_cont_expand(inode, i_size_read(inode), start_pos); 1779 /* Expand the hole to cover the write data, preventing an empty gap */
1780 end_pos = round_up(pos + iov->iov_len, root->sectorsize);
1781 err = btrfs_cont_expand(inode, i_size_read(inode), end_pos);
1756 if (err) { 1782 if (err) {
1757 mutex_unlock(&inode->i_mutex); 1783 mutex_unlock(&inode->i_mutex);
1758 goto out; 1784 goto out;
@@ -1764,7 +1790,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1764 1790
1765 if (unlikely(file->f_flags & O_DIRECT)) { 1791 if (unlikely(file->f_flags & O_DIRECT)) {
1766 num_written = __btrfs_direct_write(iocb, iov, nr_segs, 1792 num_written = __btrfs_direct_write(iocb, iov, nr_segs,
1767 pos, ppos, count, ocount); 1793 pos, count, ocount);
1768 } else { 1794 } else {
1769 struct iov_iter i; 1795 struct iov_iter i;
1770 1796
@@ -1772,7 +1798,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1772 1798
1773 num_written = __btrfs_buffered_write(file, &i, pos); 1799 num_written = __btrfs_buffered_write(file, &i, pos);
1774 if (num_written > 0) 1800 if (num_written > 0)
1775 *ppos = pos + num_written; 1801 iocb->ki_pos = pos + num_written;
1776 } 1802 }
1777 1803
1778 mutex_unlock(&inode->i_mutex); 1804 mutex_unlock(&inode->i_mutex);
@@ -1797,7 +1823,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1797 BTRFS_I(inode)->last_sub_trans = root->log_transid; 1823 BTRFS_I(inode)->last_sub_trans = root->log_transid;
1798 if (num_written > 0) { 1824 if (num_written > 0) {
1799 err = generic_write_sync(file, pos, num_written); 1825 err = generic_write_sync(file, pos, num_written);
1800 if (err < 0 && num_written > 0) 1826 if (err < 0)
1801 num_written = err; 1827 num_written = err;
1802 } 1828 }
1803 1829
@@ -1856,8 +1882,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1856 struct dentry *dentry = file->f_path.dentry; 1882 struct dentry *dentry = file->f_path.dentry;
1857 struct inode *inode = dentry->d_inode; 1883 struct inode *inode = dentry->d_inode;
1858 struct btrfs_root *root = BTRFS_I(inode)->root; 1884 struct btrfs_root *root = BTRFS_I(inode)->root;
1859 int ret = 0;
1860 struct btrfs_trans_handle *trans; 1885 struct btrfs_trans_handle *trans;
1886 struct btrfs_log_ctx ctx;
1887 int ret = 0;
1861 bool full_sync = 0; 1888 bool full_sync = 0;
1862 1889
1863 trace_btrfs_sync_file(file, datasync); 1890 trace_btrfs_sync_file(file, datasync);
@@ -1951,7 +1978,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1951 } 1978 }
1952 trans->sync = true; 1979 trans->sync = true;
1953 1980
1954 ret = btrfs_log_dentry_safe(trans, root, dentry); 1981 btrfs_init_log_ctx(&ctx);
1982
1983 ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
1955 if (ret < 0) { 1984 if (ret < 0) {
1956 /* Fallthrough and commit/free transaction. */ 1985 /* Fallthrough and commit/free transaction. */
1957 ret = 1; 1986 ret = 1;
@@ -1971,7 +2000,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1971 2000
1972 if (ret != BTRFS_NO_LOG_SYNC) { 2001 if (ret != BTRFS_NO_LOG_SYNC) {
1973 if (!ret) { 2002 if (!ret) {
1974 ret = btrfs_sync_log(trans, root); 2003 ret = btrfs_sync_log(trans, root, &ctx);
1975 if (!ret) { 2004 if (!ret) {
1976 ret = btrfs_end_transaction(trans, root); 2005 ret = btrfs_end_transaction(trans, root);
1977 goto out; 2006 goto out;
@@ -1993,6 +2022,7 @@ out:
1993 2022
1994static const struct vm_operations_struct btrfs_file_vm_ops = { 2023static const struct vm_operations_struct btrfs_file_vm_ops = {
1995 .fault = filemap_fault, 2024 .fault = filemap_fault,
2025 .map_pages = filemap_map_pages,
1996 .page_mkwrite = btrfs_page_mkwrite, 2026 .page_mkwrite = btrfs_page_mkwrite,
1997 .remap_pages = generic_file_remap_pages, 2027 .remap_pages = generic_file_remap_pages,
1998}; 2028};
@@ -2157,6 +2187,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2157 bool same_page = ((offset >> PAGE_CACHE_SHIFT) == 2187 bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
2158 ((offset + len - 1) >> PAGE_CACHE_SHIFT)); 2188 ((offset + len - 1) >> PAGE_CACHE_SHIFT));
2159 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); 2189 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
2190 u64 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
2160 2191
2161 ret = btrfs_wait_ordered_range(inode, offset, len); 2192 ret = btrfs_wait_ordered_range(inode, offset, len);
2162 if (ret) 2193 if (ret)
@@ -2172,14 +2203,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2172 * entire page. 2203 * entire page.
2173 */ 2204 */
2174 if (same_page && len < PAGE_CACHE_SIZE) { 2205 if (same_page && len < PAGE_CACHE_SIZE) {
2175 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) 2206 if (offset < ino_size)
2176 ret = btrfs_truncate_page(inode, offset, len, 0); 2207 ret = btrfs_truncate_page(inode, offset, len, 0);
2177 mutex_unlock(&inode->i_mutex); 2208 mutex_unlock(&inode->i_mutex);
2178 return ret; 2209 return ret;
2179 } 2210 }
2180 2211
2181 /* zero back part of the first page */ 2212 /* zero back part of the first page */
2182 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) { 2213 if (offset < ino_size) {
2183 ret = btrfs_truncate_page(inode, offset, 0, 0); 2214 ret = btrfs_truncate_page(inode, offset, 0, 0);
2184 if (ret) { 2215 if (ret) {
2185 mutex_unlock(&inode->i_mutex); 2216 mutex_unlock(&inode->i_mutex);
@@ -2188,7 +2219,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2188 } 2219 }
2189 2220
2190 /* zero the front end of the last page */ 2221 /* zero the front end of the last page */
2191 if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) { 2222 if (offset + len < ino_size) {
2192 ret = btrfs_truncate_page(inode, offset + len, 0, 1); 2223 ret = btrfs_truncate_page(inode, offset + len, 0, 1);
2193 if (ret) { 2224 if (ret) {
2194 mutex_unlock(&inode->i_mutex); 2225 mutex_unlock(&inode->i_mutex);
@@ -2277,10 +2308,13 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2277 2308
2278 trans->block_rsv = &root->fs_info->trans_block_rsv; 2309 trans->block_rsv = &root->fs_info->trans_block_rsv;
2279 2310
2280 ret = fill_holes(trans, inode, path, cur_offset, drop_end); 2311 if (cur_offset < ino_size) {
2281 if (ret) { 2312 ret = fill_holes(trans, inode, path, cur_offset,
2282 err = ret; 2313 drop_end);
2283 break; 2314 if (ret) {
2315 err = ret;
2316 break;
2317 }
2284 } 2318 }
2285 2319
2286 cur_offset = drop_end; 2320 cur_offset = drop_end;
@@ -2313,10 +2347,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2313 } 2347 }
2314 2348
2315 trans->block_rsv = &root->fs_info->trans_block_rsv; 2349 trans->block_rsv = &root->fs_info->trans_block_rsv;
2316 ret = fill_holes(trans, inode, path, cur_offset, drop_end); 2350 if (cur_offset < ino_size) {
2317 if (ret) { 2351 ret = fill_holes(trans, inode, path, cur_offset, drop_end);
2318 err = ret; 2352 if (ret) {
2319 goto out_trans; 2353 err = ret;
2354 goto out_trans;
2355 }
2320 } 2356 }
2321 2357
2322out_trans: 2358out_trans:
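
One pattern worth calling out in the file.c changes:
lock_and_cleanup_extent_if_need() no longer waits on the whole range up
front; when it finds a conflicting ordered extent it unwinds the extent lock
and the page locks, starts just that one ordered extent, and returns -EAGAIN
so the caller repeats the setup from scratch. A sketch of that
unwind-and-retry contract, where prepare() is a hypothetical stand-in for the
btrfs helper:

    #include <errno.h>
    #include <stdio.h>

    /* Stand-in: "succeeds" once the simulated conflict has drained. */
    static int prepare(int attempt)
    {
            if (attempt < 2) {
                    /*
                     * Found a conflicting "ordered extent": drop all state,
                     * kick just that work, and ask the caller to retry.
                     */
                    return -EAGAIN;
            }
            return 0;
    }

    int main(void)
    {
            int ret, attempt = 0;

            do {
                    ret = prepare(attempt++);
            } while (ret == -EAGAIN);

            printf("prepared after %d attempt(s), ret = %d\n", attempt, ret);
            return ret;
    }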
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index ab485e57b6fe..cc8ca193d830 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -55,7 +55,7 @@ static int caching_kthread(void *data)
55 key.type = BTRFS_INODE_ITEM_KEY; 55 key.type = BTRFS_INODE_ITEM_KEY;
56again: 56again:
57 /* need to make sure the commit_root doesn't disappear */ 57 /* need to make sure the commit_root doesn't disappear */
58 mutex_lock(&root->fs_commit_mutex); 58 down_read(&fs_info->commit_root_sem);
59 59
60 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 60 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
61 if (ret < 0) 61 if (ret < 0)
@@ -88,7 +88,7 @@ again:
88 btrfs_item_key_to_cpu(leaf, &key, 0); 88 btrfs_item_key_to_cpu(leaf, &key, 0);
89 btrfs_release_path(path); 89 btrfs_release_path(path);
90 root->cache_progress = last; 90 root->cache_progress = last;
91 mutex_unlock(&root->fs_commit_mutex); 91 up_read(&fs_info->commit_root_sem);
92 schedule_timeout(1); 92 schedule_timeout(1);
93 goto again; 93 goto again;
94 } else 94 } else
@@ -127,7 +127,7 @@ next:
127 btrfs_unpin_free_ino(root); 127 btrfs_unpin_free_ino(root);
128out: 128out:
129 wake_up(&root->cache_wait); 129 wake_up(&root->cache_wait);
130 mutex_unlock(&root->fs_commit_mutex); 130 up_read(&fs_info->commit_root_sem);
131 131
132 btrfs_free_path(path); 132 btrfs_free_path(path);
133 133
@@ -223,11 +223,11 @@ again:
223 * or the caching work is done. 223 * or the caching work is done.
224 */ 224 */
225 225
226 mutex_lock(&root->fs_commit_mutex); 226 down_write(&root->fs_info->commit_root_sem);
227 spin_lock(&root->cache_lock); 227 spin_lock(&root->cache_lock);
228 if (root->cached == BTRFS_CACHE_FINISHED) { 228 if (root->cached == BTRFS_CACHE_FINISHED) {
229 spin_unlock(&root->cache_lock); 229 spin_unlock(&root->cache_lock);
230 mutex_unlock(&root->fs_commit_mutex); 230 up_write(&root->fs_info->commit_root_sem);
231 goto again; 231 goto again;
232 } 232 }
233 spin_unlock(&root->cache_lock); 233 spin_unlock(&root->cache_lock);
@@ -240,7 +240,7 @@ again:
240 else 240 else
241 __btrfs_add_free_space(pinned, objectid, 1); 241 __btrfs_add_free_space(pinned, objectid, 1);
242 242
243 mutex_unlock(&root->fs_commit_mutex); 243 up_write(&root->fs_info->commit_root_sem);
244 } 244 }
245} 245}
246 246
@@ -250,7 +250,7 @@ again:
250 * and others will just be dropped, because the commit root we were 250 * and others will just be dropped, because the commit root we were
251 * searching has changed. 251 * searching has changed.
252 * 252 *
253 * Must be called with root->fs_commit_mutex held 253 * Must be called with root->fs_info->commit_root_sem held
254 */ 254 */
255void btrfs_unpin_free_ino(struct btrfs_root *root) 255void btrfs_unpin_free_ino(struct btrfs_root *root)
256{ 256{
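
Replacing the per-root fs_commit_mutex with the fs-wide commit_root_sem turns
commit-root protection into a reader/writer discipline: the caching threads
only need to keep the commit roots from disappearing, so they take the
semaphore shared, while the paths that swap or mutate the commit roots take
it exclusive. A userspace analog with POSIX rwlocks, with illustrative thread
bodies:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t commit_root_sem = PTHREAD_RWLOCK_INITIALIZER;
    static int commit_generation;

    static void *cacher(void *arg)           /* analog of caching_kthread() */
    {
            pthread_rwlock_rdlock(&commit_root_sem); /* many cachers at once */
            printf("cacher %ld sees generation %d\n",
                   (long)arg, commit_generation);
            pthread_rwlock_unlock(&commit_root_sem);
            return NULL;
    }

    int main(void)
    {
            pthread_t t[2];

            for (long i = 0; i < 2; i++)
                    pthread_create(&t[i], NULL, cacher, (void *)i);
            for (int i = 0; i < 2; i++)
                    pthread_join(t[i], NULL);

            pthread_rwlock_wrlock(&commit_root_sem); /* committer is exclusive */
            commit_generation++;
            pthread_rwlock_unlock(&commit_root_sem);
            return 0;
    }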
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d3d44486290b..5f805bc944fa 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -394,6 +394,14 @@ static noinline int compress_file_range(struct inode *inode,
394 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) 394 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
395 btrfs_add_inode_defrag(NULL, inode); 395 btrfs_add_inode_defrag(NULL, inode);
396 396
397 /*
 398 * skip compression for a small file range (<= blocksize) that
 399 * isn't an inline extent, since it doesn't save disk space at all.
400 */
401 if ((end - start + 1) <= blocksize &&
402 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
403 goto cleanup_and_bail_uncompressed;
404
397 actual_end = min_t(u64, isize, end + 1); 405 actual_end = min_t(u64, isize, end + 1);
398again: 406again:
399 will_compress = 0; 407 will_compress = 0;
@@ -864,7 +872,8 @@ static noinline int cow_file_range(struct inode *inode,
864 872
865 if (btrfs_is_free_space_inode(inode)) { 873 if (btrfs_is_free_space_inode(inode)) {
866 WARN_ON_ONCE(1); 874 WARN_ON_ONCE(1);
867 return -EINVAL; 875 ret = -EINVAL;
876 goto out_unlock;
868 } 877 }
869 878
870 num_bytes = ALIGN(end - start + 1, blocksize); 879 num_bytes = ALIGN(end - start + 1, blocksize);
@@ -1075,17 +1084,15 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1075 async_cow->end = cur_end; 1084 async_cow->end = cur_end;
1076 INIT_LIST_HEAD(&async_cow->extents); 1085 INIT_LIST_HEAD(&async_cow->extents);
1077 1086
1078 async_cow->work.func = async_cow_start; 1087 btrfs_init_work(&async_cow->work, async_cow_start,
1079 async_cow->work.ordered_func = async_cow_submit; 1088 async_cow_submit, async_cow_free);
1080 async_cow->work.ordered_free = async_cow_free;
1081 async_cow->work.flags = 0;
1082 1089
1083 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> 1090 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
1084 PAGE_CACHE_SHIFT; 1091 PAGE_CACHE_SHIFT;
1085 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); 1092 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
1086 1093
1087 btrfs_queue_worker(&root->fs_info->delalloc_workers, 1094 btrfs_queue_work(root->fs_info->delalloc_workers,
1088 &async_cow->work); 1095 &async_cow->work);
1089 1096
1090 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { 1097 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
1091 wait_event(root->fs_info->async_submit_wait, 1098 wait_event(root->fs_info->async_submit_wait,
@@ -1272,6 +1279,15 @@ next_slot:
1272 disk_bytenr += cur_offset - found_key.offset; 1279 disk_bytenr += cur_offset - found_key.offset;
1273 num_bytes = min(end + 1, extent_end) - cur_offset; 1280 num_bytes = min(end + 1, extent_end) - cur_offset;
1274 /* 1281 /*
1282 * if there are pending snapshots for this root,
1283 * we fall into common COW way.
1284 */
1285 if (!nolock) {
1286 err = btrfs_start_nocow_write(root);
1287 if (!err)
1288 goto out_check;
1289 }
1290 /*
1275 * force cow if csum exists in the range. 1291 * force cow if csum exists in the range.
1276 * this ensure that csum for a given extent are 1292 * this ensure that csum for a given extent are
1277 * either valid or do not exist. 1293 * either valid or do not exist.
@@ -1290,6 +1306,8 @@ next_slot:
1290out_check: 1306out_check:
1291 if (extent_end <= start) { 1307 if (extent_end <= start) {
1292 path->slots[0]++; 1308 path->slots[0]++;
1309 if (!nolock && nocow)
1310 btrfs_end_nocow_write(root);
1293 goto next_slot; 1311 goto next_slot;
1294 } 1312 }
1295 if (!nocow) { 1313 if (!nocow) {
@@ -1307,8 +1325,11 @@ out_check:
1307 ret = cow_file_range(inode, locked_page, 1325 ret = cow_file_range(inode, locked_page,
1308 cow_start, found_key.offset - 1, 1326 cow_start, found_key.offset - 1,
1309 page_started, nr_written, 1); 1327 page_started, nr_written, 1);
1310 if (ret) 1328 if (ret) {
1329 if (!nolock && nocow)
1330 btrfs_end_nocow_write(root);
1311 goto error; 1331 goto error;
1332 }
1312 cow_start = (u64)-1; 1333 cow_start = (u64)-1;
1313 } 1334 }
1314 1335
@@ -1355,8 +1376,11 @@ out_check:
1355 BTRFS_DATA_RELOC_TREE_OBJECTID) { 1376 BTRFS_DATA_RELOC_TREE_OBJECTID) {
1356 ret = btrfs_reloc_clone_csums(inode, cur_offset, 1377 ret = btrfs_reloc_clone_csums(inode, cur_offset,
1357 num_bytes); 1378 num_bytes);
1358 if (ret) 1379 if (ret) {
1380 if (!nolock && nocow)
1381 btrfs_end_nocow_write(root);
1359 goto error; 1382 goto error;
1383 }
1360 } 1384 }
1361 1385
1362 extent_clear_unlock_delalloc(inode, cur_offset, 1386 extent_clear_unlock_delalloc(inode, cur_offset,
@@ -1364,6 +1388,8 @@ out_check:
1364 locked_page, EXTENT_LOCKED | 1388 locked_page, EXTENT_LOCKED |
1365 EXTENT_DELALLOC, PAGE_UNLOCK | 1389 EXTENT_DELALLOC, PAGE_UNLOCK |
1366 PAGE_SET_PRIVATE2); 1390 PAGE_SET_PRIVATE2);
1391 if (!nolock && nocow)
1392 btrfs_end_nocow_write(root);
1367 cur_offset = extent_end; 1393 cur_offset = extent_end;
1368 if (cur_offset > end) 1394 if (cur_offset > end)
1369 break; 1395 break;
@@ -1843,9 +1869,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1843 1869
1844 SetPageChecked(page); 1870 SetPageChecked(page);
1845 page_cache_get(page); 1871 page_cache_get(page);
1846 fixup->work.func = btrfs_writepage_fixup_worker; 1872 btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
1847 fixup->page = page; 1873 fixup->page = page;
1848 btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); 1874 btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
1849 return -EBUSY; 1875 return -EBUSY;
1850} 1876}
1851 1877
@@ -2239,6 +2265,11 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
2239 return PTR_ERR(root); 2265 return PTR_ERR(root);
2240 } 2266 }
2241 2267
2268 if (btrfs_root_readonly(root)) {
2269 srcu_read_unlock(&fs_info->subvol_srcu, index);
2270 return 0;
2271 }
2272
2242 /* step 2: get inode */ 2273 /* step 2: get inode */
2243 key.objectid = backref->inum; 2274 key.objectid = backref->inum;
2244 key.type = BTRFS_INODE_ITEM_KEY; 2275 key.type = BTRFS_INODE_ITEM_KEY;
@@ -2759,7 +2790,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2759 struct inode *inode = page->mapping->host; 2790 struct inode *inode = page->mapping->host;
2760 struct btrfs_root *root = BTRFS_I(inode)->root; 2791 struct btrfs_root *root = BTRFS_I(inode)->root;
2761 struct btrfs_ordered_extent *ordered_extent = NULL; 2792 struct btrfs_ordered_extent *ordered_extent = NULL;
2762 struct btrfs_workers *workers; 2793 struct btrfs_workqueue *workers;
2763 2794
2764 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); 2795 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
2765 2796
@@ -2768,14 +2799,13 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2768 end - start + 1, uptodate)) 2799 end - start + 1, uptodate))
2769 return 0; 2800 return 0;
2770 2801
2771 ordered_extent->work.func = finish_ordered_fn; 2802 btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
2772 ordered_extent->work.flags = 0;
2773 2803
2774 if (btrfs_is_free_space_inode(inode)) 2804 if (btrfs_is_free_space_inode(inode))
2775 workers = &root->fs_info->endio_freespace_worker; 2805 workers = root->fs_info->endio_freespace_worker;
2776 else 2806 else
2777 workers = &root->fs_info->endio_write_workers; 2807 workers = root->fs_info->endio_write_workers;
2778 btrfs_queue_worker(workers, &ordered_extent->work); 2808 btrfs_queue_work(workers, &ordered_extent->work);
2779 2809
2780 return 0; 2810 return 0;
2781} 2811}
@@ -4593,7 +4623,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
4593 struct rb_node *node; 4623 struct rb_node *node;
4594 4624
4595 ASSERT(inode->i_state & I_FREEING); 4625 ASSERT(inode->i_state & I_FREEING);
4596 truncate_inode_pages(&inode->i_data, 0); 4626 truncate_inode_pages_final(&inode->i_data);
4597 4627
4598 write_lock(&map_tree->lock); 4628 write_lock(&map_tree->lock);
4599 while (!RB_EMPTY_ROOT(&map_tree->map)) { 4629 while (!RB_EMPTY_ROOT(&map_tree->map)) {
@@ -4924,7 +4954,8 @@ void btrfs_invalidate_inodes(struct btrfs_root *root)
4924 struct inode *inode; 4954 struct inode *inode;
4925 u64 objectid = 0; 4955 u64 objectid = 0;
4926 4956
4927 WARN_ON(btrfs_root_refs(&root->root_item) != 0); 4957 if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
4958 WARN_ON(btrfs_root_refs(&root->root_item) != 0);
4928 4959
4929 spin_lock(&root->inode_lock); 4960 spin_lock(&root->inode_lock);
4930again: 4961again:
@@ -5799,6 +5830,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
5799 } 5830 }
5800out_unlock: 5831out_unlock:
5801 btrfs_end_transaction(trans, root); 5832 btrfs_end_transaction(trans, root);
5833 btrfs_balance_delayed_items(root);
5802 btrfs_btree_balance_dirty(root); 5834 btrfs_btree_balance_dirty(root);
5803 if (drop_inode) { 5835 if (drop_inode) {
5804 inode_dec_link_count(inode); 5836 inode_dec_link_count(inode);
@@ -5872,6 +5904,7 @@ out_unlock:
5872 inode_dec_link_count(inode); 5904 inode_dec_link_count(inode);
5873 iput(inode); 5905 iput(inode);
5874 } 5906 }
5907 btrfs_balance_delayed_items(root);
5875 btrfs_btree_balance_dirty(root); 5908 btrfs_btree_balance_dirty(root);
5876 return err; 5909 return err;
5877} 5910}
@@ -5930,6 +5963,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5930 } 5963 }
5931 5964
5932 btrfs_end_transaction(trans, root); 5965 btrfs_end_transaction(trans, root);
5966 btrfs_balance_delayed_items(root);
5933fail: 5967fail:
5934 if (drop_inode) { 5968 if (drop_inode) {
5935 inode_dec_link_count(inode); 5969 inode_dec_link_count(inode);
@@ -5996,6 +6030,7 @@ out_fail:
5996 btrfs_end_transaction(trans, root); 6030 btrfs_end_transaction(trans, root);
5997 if (drop_on_err) 6031 if (drop_on_err)
5998 iput(inode); 6032 iput(inode);
6033 btrfs_balance_delayed_items(root);
5999 btrfs_btree_balance_dirty(root); 6034 btrfs_btree_balance_dirty(root);
6000 return err; 6035 return err;
6001} 6036}
@@ -6550,6 +6585,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6550 int ret; 6585 int ret;
6551 struct extent_buffer *leaf; 6586 struct extent_buffer *leaf;
6552 struct btrfs_root *root = BTRFS_I(inode)->root; 6587 struct btrfs_root *root = BTRFS_I(inode)->root;
6588 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6553 struct btrfs_file_extent_item *fi; 6589 struct btrfs_file_extent_item *fi;
6554 struct btrfs_key key; 6590 struct btrfs_key key;
6555 u64 disk_bytenr; 6591 u64 disk_bytenr;
@@ -6626,6 +6662,20 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6626 6662
6627 if (btrfs_extent_readonly(root, disk_bytenr)) 6663 if (btrfs_extent_readonly(root, disk_bytenr))
6628 goto out; 6664 goto out;
6665
6666 num_bytes = min(offset + *len, extent_end) - offset;
6667 if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6668 u64 range_end;
6669
6670 range_end = round_up(offset + num_bytes, root->sectorsize) - 1;
6671 ret = test_range_bit(io_tree, offset, range_end,
6672 EXTENT_DELALLOC, 0, NULL);
6673 if (ret) {
6674 ret = -EAGAIN;
6675 goto out;
6676 }
6677 }
6678
6629 btrfs_release_path(path); 6679 btrfs_release_path(path);
6630 6680
6631 /* 6681 /*
@@ -6654,7 +6704,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6654 */ 6704 */
6655 disk_bytenr += backref_offset; 6705 disk_bytenr += backref_offset;
6656 disk_bytenr += offset - key.offset; 6706 disk_bytenr += offset - key.offset;
6657 num_bytes = min(offset + *len, extent_end) - offset;
6658 if (csum_exist_in_range(root, disk_bytenr, num_bytes)) 6707 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
6659 goto out; 6708 goto out;
6660 /* 6709 /*
@@ -7024,10 +7073,9 @@ again:
7024 if (!ret) 7073 if (!ret)
7025 goto out_test; 7074 goto out_test;
7026 7075
7027 ordered->work.func = finish_ordered_fn; 7076 btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
7028 ordered->work.flags = 0; 7077 btrfs_queue_work(root->fs_info->endio_write_workers,
7029 btrfs_queue_worker(&root->fs_info->endio_write_workers, 7078 &ordered->work);
7030 &ordered->work);
7031out_test: 7079out_test:
7032 /* 7080 /*
7033 * our bio might span multiple ordered extents. If we haven't 7081 * our bio might span multiple ordered extents. If we haven't
@@ -7404,15 +7452,15 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7404 smp_mb__after_atomic_inc(); 7452 smp_mb__after_atomic_inc();
7405 7453
7406 /* 7454 /*
7407 * The generic stuff only does filemap_write_and_wait_range, which isn't 7455 * The generic stuff only does filemap_write_and_wait_range, which
7408 * enough if we've written compressed pages to this area, so we need to 7456 * isn't enough if we've written compressed pages to this area, so
7409 * call btrfs_wait_ordered_range to make absolutely sure that any 7457 * we need to flush the dirty pages again to make absolutely sure
7410 * outstanding dirty pages are on disk. 7458 * that any outstanding dirty pages are on disk.
7411 */ 7459 */
7412 count = iov_length(iov, nr_segs); 7460 count = iov_length(iov, nr_segs);
7413 ret = btrfs_wait_ordered_range(inode, offset, count); 7461 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
7414 if (ret) 7462 &BTRFS_I(inode)->runtime_flags))
7415 return ret; 7463 filemap_fdatawrite_range(inode->i_mapping, offset, count);
7416 7464
7417 if (rw & WRITE) { 7465 if (rw & WRITE) {
7418 /* 7466 /*
@@ -8404,7 +8452,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
8404 work->inode = inode; 8452 work->inode = inode;
8405 work->wait = wait; 8453 work->wait = wait;
8406 work->delay_iput = delay_iput; 8454 work->delay_iput = delay_iput;
8407 work->work.func = btrfs_run_delalloc_work; 8455 btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
8408 8456
8409 return work; 8457 return work;
8410} 8458}
@@ -8419,7 +8467,8 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
8419 * some fairly slow code that needs optimization. This walks the list 8467 * some fairly slow code that needs optimization. This walks the list
8420 * of all the inodes with pending delalloc and forces them to disk. 8468 * of all the inodes with pending delalloc and forces them to disk.
8421 */ 8469 */
8422static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput) 8470static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
8471 int nr)
8423{ 8472{
8424 struct btrfs_inode *binode; 8473 struct btrfs_inode *binode;
8425 struct inode *inode; 8474 struct inode *inode;
@@ -8431,6 +8480,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8431 INIT_LIST_HEAD(&works); 8480 INIT_LIST_HEAD(&works);
8432 INIT_LIST_HEAD(&splice); 8481 INIT_LIST_HEAD(&splice);
8433 8482
8483 mutex_lock(&root->delalloc_mutex);
8434 spin_lock(&root->delalloc_lock); 8484 spin_lock(&root->delalloc_lock);
8435 list_splice_init(&root->delalloc_inodes, &splice); 8485 list_splice_init(&root->delalloc_inodes, &splice);
8436 while (!list_empty(&splice)) { 8486 while (!list_empty(&splice)) {
@@ -8456,19 +8506,16 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8456 goto out; 8506 goto out;
8457 } 8507 }
8458 list_add_tail(&work->list, &works); 8508 list_add_tail(&work->list, &works);
8459 btrfs_queue_worker(&root->fs_info->flush_workers, 8509 btrfs_queue_work(root->fs_info->flush_workers,
8460 &work->work); 8510 &work->work);
8461 8511 ret++;
8512 if (nr != -1 && ret >= nr)
8513 goto out;
8462 cond_resched(); 8514 cond_resched();
8463 spin_lock(&root->delalloc_lock); 8515 spin_lock(&root->delalloc_lock);
8464 } 8516 }
8465 spin_unlock(&root->delalloc_lock); 8517 spin_unlock(&root->delalloc_lock);
8466 8518
8467 list_for_each_entry_safe(work, next, &works, list) {
8468 list_del_init(&work->list);
8469 btrfs_wait_and_free_delalloc_work(work);
8470 }
8471 return 0;
8472out: 8519out:
8473 list_for_each_entry_safe(work, next, &works, list) { 8520 list_for_each_entry_safe(work, next, &works, list) {
8474 list_del_init(&work->list); 8521 list_del_init(&work->list);
@@ -8480,6 +8527,7 @@ out:
8480 list_splice_tail(&splice, &root->delalloc_inodes); 8527 list_splice_tail(&splice, &root->delalloc_inodes);
8481 spin_unlock(&root->delalloc_lock); 8528 spin_unlock(&root->delalloc_lock);
8482 } 8529 }
8530 mutex_unlock(&root->delalloc_mutex);
8483 return ret; 8531 return ret;
8484} 8532}
8485 8533
@@ -8490,7 +8538,9 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8490 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) 8538 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
8491 return -EROFS; 8539 return -EROFS;
8492 8540
8493 ret = __start_delalloc_inodes(root, delay_iput); 8541 ret = __start_delalloc_inodes(root, delay_iput, -1);
8542 if (ret > 0)
8543 ret = 0;
8494 /* 8544 /*
8495 * the filemap_flush will queue IO into the worker threads, but 8545 * the filemap_flush will queue IO into the worker threads, but
8496 * we have to make sure the IO is actually started and that 8546 * we have to make sure the IO is actually started and that
@@ -8507,7 +8557,8 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8507 return ret; 8557 return ret;
8508} 8558}
8509 8559
8510int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput) 8560int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
8561 int nr)
8511{ 8562{
8512 struct btrfs_root *root; 8563 struct btrfs_root *root;
8513 struct list_head splice; 8564 struct list_head splice;
@@ -8518,9 +8569,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
8518 8569
8519 INIT_LIST_HEAD(&splice); 8570 INIT_LIST_HEAD(&splice);
8520 8571
8572 mutex_lock(&fs_info->delalloc_root_mutex);
8521 spin_lock(&fs_info->delalloc_root_lock); 8573 spin_lock(&fs_info->delalloc_root_lock);
8522 list_splice_init(&fs_info->delalloc_roots, &splice); 8574 list_splice_init(&fs_info->delalloc_roots, &splice);
8523 while (!list_empty(&splice)) { 8575 while (!list_empty(&splice) && nr) {
8524 root = list_first_entry(&splice, struct btrfs_root, 8576 root = list_first_entry(&splice, struct btrfs_root,
8525 delalloc_root); 8577 delalloc_root);
8526 root = btrfs_grab_fs_root(root); 8578 root = btrfs_grab_fs_root(root);
@@ -8529,15 +8581,20 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
8529 &fs_info->delalloc_roots); 8581 &fs_info->delalloc_roots);
8530 spin_unlock(&fs_info->delalloc_root_lock); 8582 spin_unlock(&fs_info->delalloc_root_lock);
8531 8583
8532 ret = __start_delalloc_inodes(root, delay_iput); 8584 ret = __start_delalloc_inodes(root, delay_iput, nr);
8533 btrfs_put_fs_root(root); 8585 btrfs_put_fs_root(root);
8534 if (ret) 8586 if (ret < 0)
8535 goto out; 8587 goto out;
8536 8588
8589 if (nr != -1) {
8590 nr -= ret;
8591 WARN_ON(nr < 0);
8592 }
8537 spin_lock(&fs_info->delalloc_root_lock); 8593 spin_lock(&fs_info->delalloc_root_lock);
8538 } 8594 }
8539 spin_unlock(&fs_info->delalloc_root_lock); 8595 spin_unlock(&fs_info->delalloc_root_lock);
8540 8596
8597 ret = 0;
8541 atomic_inc(&fs_info->async_submit_draining); 8598 atomic_inc(&fs_info->async_submit_draining);
8542 while (atomic_read(&fs_info->nr_async_submits) || 8599 while (atomic_read(&fs_info->nr_async_submits) ||
8543 atomic_read(&fs_info->async_delalloc_pages)) { 8600 atomic_read(&fs_info->async_delalloc_pages)) {
@@ -8546,13 +8603,13 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
8546 atomic_read(&fs_info->async_delalloc_pages) == 0)); 8603 atomic_read(&fs_info->async_delalloc_pages) == 0));
8547 } 8604 }
8548 atomic_dec(&fs_info->async_submit_draining); 8605 atomic_dec(&fs_info->async_submit_draining);
8549 return 0;
8550out: 8606out:
8551 if (!list_empty_careful(&splice)) { 8607 if (!list_empty_careful(&splice)) {
8552 spin_lock(&fs_info->delalloc_root_lock); 8608 spin_lock(&fs_info->delalloc_root_lock);
8553 list_splice_tail(&splice, &fs_info->delalloc_roots); 8609 list_splice_tail(&splice, &fs_info->delalloc_roots);
8554 spin_unlock(&fs_info->delalloc_root_lock); 8610 spin_unlock(&fs_info->delalloc_root_lock);
8555 } 8611 }
8612 mutex_unlock(&fs_info->delalloc_root_mutex);
8556 return ret; 8613 return ret;
8557} 8614}
8558 8615
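
Both delalloc starters above now share one convention: nr == -1 means "flush everything", any other value is a budget, and the return value is the number of work items queued (btrfs_start_delalloc_inodes clamps a positive count back to 0 for its callers). A small user-space sketch of that throttling loop, with invented names:

#include <stdio.h>

static int start_up_to(int pending, int nr)
{
    int started = 0;

    while (pending-- > 0) {
        /* a real implementation would queue a flush work item here */
        started++;
        if (nr != -1 && started >= nr)
            break;
    }
    return started;
}

int main(void)
{
    printf("unlimited: %d, capped at 3: %d\n",
           start_up_to(10, -1), start_up_to(10, 3));
    return 0;
}
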
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a6d8efa46bfe..e79ff6b90cb7 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -59,6 +59,32 @@
59#include "props.h" 59#include "props.h"
60#include "sysfs.h" 60#include "sysfs.h"
61 61
62#ifdef CONFIG_64BIT
63/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
64 * structures are incorrect, as the timespec structure from userspace
65 * is 4 bytes too small. We define these alternatives here to teach
66 * the kernel about the 32-bit struct packing.
67 */
68struct btrfs_ioctl_timespec_32 {
69 __u64 sec;
70 __u32 nsec;
71} __attribute__ ((__packed__));
72
73struct btrfs_ioctl_received_subvol_args_32 {
74 char uuid[BTRFS_UUID_SIZE]; /* in */
75 __u64 stransid; /* in */
76 __u64 rtransid; /* out */
77 struct btrfs_ioctl_timespec_32 stime; /* in */
78 struct btrfs_ioctl_timespec_32 rtime; /* out */
79 __u64 flags; /* in */
80 __u64 reserved[16]; /* in */
81} __attribute__ ((__packed__));
82
83#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
84 struct btrfs_ioctl_received_subvol_args_32)
85#endif
86
87
62static int btrfs_clone(struct inode *src, struct inode *inode, 88static int btrfs_clone(struct inode *src, struct inode *inode,
63 u64 off, u64 olen, u64 olen_aligned, u64 destoff); 89 u64 off, u64 olen, u64 olen_aligned, u64 destoff);
64 90
@@ -585,6 +611,23 @@ fail:
585 return ret; 611 return ret;
586} 612}
587 613
614static void btrfs_wait_nocow_write(struct btrfs_root *root)
615{
616 s64 writers;
617 DEFINE_WAIT(wait);
618
619 do {
620 prepare_to_wait(&root->subv_writers->wait, &wait,
621 TASK_UNINTERRUPTIBLE);
622
623 writers = percpu_counter_sum(&root->subv_writers->counter);
624 if (writers)
625 schedule();
626
627 finish_wait(&root->subv_writers->wait, &wait);
628 } while (writers);
629}
630
588static int create_snapshot(struct btrfs_root *root, struct inode *dir, 631static int create_snapshot(struct btrfs_root *root, struct inode *dir,
589 struct dentry *dentry, char *name, int namelen, 632 struct dentry *dentry, char *name, int namelen,
590 u64 *async_transid, bool readonly, 633 u64 *async_transid, bool readonly,
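
btrfs_wait_nocow_write() above is a classic drain loop: sum a counter of in-flight writers and sleep until it reaches zero, re-checking after every wakeup. A user-space analogue of the same structure using a pthread condition variable (illustrative only; the kernel version uses prepare_to_wait()/finish_wait() and a per-cpu counter instead):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static long writers = 1;

static void wait_nocow_write(void)
{
    pthread_mutex_lock(&lock);
    while (writers)         /* re-check the condition after each wakeup */
        pthread_cond_wait(&cond, &lock);
    pthread_mutex_unlock(&lock);
}

static void *writer_done(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&lock);
    if (--writers == 0)
        pthread_cond_broadcast(&cond);
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, writer_done, NULL);
    wait_nocow_write();
    pthread_join(t, NULL);
    puts("all nocow writers drained");
    return 0;
}
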
@@ -598,15 +641,21 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
598 if (!root->ref_cows) 641 if (!root->ref_cows)
599 return -EINVAL; 642 return -EINVAL;
600 643
644 atomic_inc(&root->will_be_snapshoted);
645 smp_mb__after_atomic_inc();
646 btrfs_wait_nocow_write(root);
647
601 ret = btrfs_start_delalloc_inodes(root, 0); 648 ret = btrfs_start_delalloc_inodes(root, 0);
602 if (ret) 649 if (ret)
603 return ret; 650 goto out;
604 651
605 btrfs_wait_ordered_extents(root, -1); 652 btrfs_wait_ordered_extents(root, -1);
606 653
607 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 654 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
608 if (!pending_snapshot) 655 if (!pending_snapshot) {
609 return -ENOMEM; 656 ret = -ENOMEM;
657 goto out;
658 }
610 659
611 btrfs_init_block_rsv(&pending_snapshot->block_rsv, 660 btrfs_init_block_rsv(&pending_snapshot->block_rsv,
612 BTRFS_BLOCK_RSV_TEMP); 661 BTRFS_BLOCK_RSV_TEMP);
@@ -623,7 +672,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
623 &pending_snapshot->qgroup_reserved, 672 &pending_snapshot->qgroup_reserved,
624 false); 673 false);
625 if (ret) 674 if (ret)
626 goto out; 675 goto free;
627 676
628 pending_snapshot->dentry = dentry; 677 pending_snapshot->dentry = dentry;
629 pending_snapshot->root = root; 678 pending_snapshot->root = root;
@@ -674,8 +723,10 @@ fail:
674 btrfs_subvolume_release_metadata(BTRFS_I(dir)->root, 723 btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
675 &pending_snapshot->block_rsv, 724 &pending_snapshot->block_rsv,
676 pending_snapshot->qgroup_reserved); 725 pending_snapshot->qgroup_reserved);
677out: 726free:
678 kfree(pending_snapshot); 727 kfree(pending_snapshot);
728out:
729 atomic_dec(&root->will_be_snapshoted);
679 return ret; 730 return ret;
680} 731}
681 732
@@ -884,12 +935,14 @@ static int find_new_extents(struct btrfs_root *root,
884 min_key.type = BTRFS_EXTENT_DATA_KEY; 935 min_key.type = BTRFS_EXTENT_DATA_KEY;
885 min_key.offset = *off; 936 min_key.offset = *off;
886 937
887 path->keep_locks = 1;
888
889 while (1) { 938 while (1) {
939 path->keep_locks = 1;
890 ret = btrfs_search_forward(root, &min_key, path, newer_than); 940 ret = btrfs_search_forward(root, &min_key, path, newer_than);
891 if (ret != 0) 941 if (ret != 0)
892 goto none; 942 goto none;
943 path->keep_locks = 0;
944 btrfs_unlock_up_safe(path, 1);
945process_slot:
893 if (min_key.objectid != ino) 946 if (min_key.objectid != ino)
894 goto none; 947 goto none;
895 if (min_key.type != BTRFS_EXTENT_DATA_KEY) 948 if (min_key.type != BTRFS_EXTENT_DATA_KEY)
@@ -908,6 +961,12 @@ static int find_new_extents(struct btrfs_root *root,
908 return 0; 961 return 0;
909 } 962 }
910 963
964 path->slots[0]++;
965 if (path->slots[0] < btrfs_header_nritems(leaf)) {
966 btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
967 goto process_slot;
968 }
969
911 if (min_key.offset == (u64)-1) 970 if (min_key.offset == (u64)-1)
912 goto none; 971 goto none;
913 972
@@ -935,10 +994,13 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
935 read_unlock(&em_tree->lock); 994 read_unlock(&em_tree->lock);
936 995
937 if (!em) { 996 if (!em) {
997 struct extent_state *cached = NULL;
998 u64 end = start + len - 1;
999
938 /* get the big lock and read metadata off disk */ 1000 /* get the big lock and read metadata off disk */
939 lock_extent(io_tree, start, start + len - 1); 1001 lock_extent_bits(io_tree, start, end, 0, &cached);
940 em = btrfs_get_extent(inode, NULL, 0, start, len, 0); 1002 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
941 unlock_extent(io_tree, start, start + len - 1); 1003 unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
942 1004
943 if (IS_ERR(em)) 1005 if (IS_ERR(em))
944 return NULL; 1006 return NULL;
@@ -957,7 +1019,8 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
957 return false; 1019 return false;
958 1020
959 next = defrag_lookup_extent(inode, em->start + em->len); 1021 next = defrag_lookup_extent(inode, em->start + em->len);
960 if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) 1022 if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE ||
1023 (em->block_start + em->block_len == next->block_start))
961 ret = false; 1024 ret = false;
962 1025
963 free_extent_map(next); 1026 free_extent_map(next);
@@ -1076,10 +1139,12 @@ again:
1076 page_start = page_offset(page); 1139 page_start = page_offset(page);
1077 page_end = page_start + PAGE_CACHE_SIZE - 1; 1140 page_end = page_start + PAGE_CACHE_SIZE - 1;
1078 while (1) { 1141 while (1) {
1079 lock_extent(tree, page_start, page_end); 1142 lock_extent_bits(tree, page_start, page_end,
1143 0, &cached_state);
1080 ordered = btrfs_lookup_ordered_extent(inode, 1144 ordered = btrfs_lookup_ordered_extent(inode,
1081 page_start); 1145 page_start);
1082 unlock_extent(tree, page_start, page_end); 1146 unlock_extent_cached(tree, page_start, page_end,
1147 &cached_state, GFP_NOFS);
1083 if (!ordered) 1148 if (!ordered)
1084 break; 1149 break;
1085 1150
@@ -1356,8 +1421,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1356 } 1421 }
1357 } 1422 }
1358 1423
1359 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) 1424 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
1360 filemap_flush(inode->i_mapping); 1425 filemap_flush(inode->i_mapping);
1426 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1427 &BTRFS_I(inode)->runtime_flags))
1428 filemap_flush(inode->i_mapping);
1429 }
1361 1430
1362 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { 1431 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
1363 /* the filemap_flush will queue IO into the worker threads, but 1432 /* the filemap_flush will queue IO into the worker threads, but
@@ -1403,6 +1472,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1403 struct btrfs_trans_handle *trans; 1472 struct btrfs_trans_handle *trans;
1404 struct btrfs_device *device = NULL; 1473 struct btrfs_device *device = NULL;
1405 char *sizestr; 1474 char *sizestr;
1475 char *retptr;
1406 char *devstr = NULL; 1476 char *devstr = NULL;
1407 int ret = 0; 1477 int ret = 0;
1408 int mod = 0; 1478 int mod = 0;
@@ -1470,8 +1540,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1470 mod = 1; 1540 mod = 1;
1471 sizestr++; 1541 sizestr++;
1472 } 1542 }
1473 new_size = memparse(sizestr, NULL); 1543 new_size = memparse(sizestr, &retptr);
1474 if (new_size == 0) { 1544 if (*retptr != '\0' || new_size == 0) {
1475 ret = -EINVAL; 1545 ret = -EINVAL;
1476 goto out_free; 1546 goto out_free;
1477 } 1547 }
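
The resize fix above passes memparse() a return pointer and rejects the string unless parsing consumed all of it, so input like "10Gb" no longer silently slips through as 10G. A user-space approximation of the same validation using strtoull() plus a suffix switch (the helper name is invented):

#include <stdio.h>
#include <stdlib.h>

static int parse_size(const char *s, unsigned long long *out)
{
    char *end;
    unsigned long long v = strtoull(s, &end, 10);

    switch (*end) {
    case 'k': case 'K': v <<= 10; end++; break;
    case 'm': case 'M': v <<= 20; end++; break;
    case 'g': case 'G': v <<= 30; end++; break;
    }
    if (*end != '\0' || v == 0)   /* the same two checks as the patch */
        return -1;
    *out = v;
    return 0;
}

int main(void)
{
    unsigned long long v;

    printf("\"10G\"  -> %d\n", parse_size("10G", &v));  /* accepted */
    printf("\"10Gb\" -> %d\n", parse_size("10Gb", &v)); /* rejected */
    return 0;
}
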
@@ -1573,7 +1643,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
1573 if (src_inode->i_sb != file_inode(file)->i_sb) { 1643 if (src_inode->i_sb != file_inode(file)->i_sb) {
1574 btrfs_info(BTRFS_I(src_inode)->root->fs_info, 1644 btrfs_info(BTRFS_I(src_inode)->root->fs_info,
1575 "Snapshot src from another FS"); 1645 "Snapshot src from another FS");
1576 ret = -EINVAL; 1646 ret = -EXDEV;
1577 } else if (!inode_owner_or_capable(src_inode)) { 1647 } else if (!inode_owner_or_capable(src_inode)) {
1578 /* 1648 /*
1579 * Subvolume creation is not restricted, but snapshots 1649 * Subvolume creation is not restricted, but snapshots
@@ -1797,7 +1867,9 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
1797 if (di && !IS_ERR(di)) { 1867 if (di && !IS_ERR(di)) {
1798 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); 1868 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
1799 if (key.objectid == root->root_key.objectid) { 1869 if (key.objectid == root->root_key.objectid) {
1800 ret = -ENOTEMPTY; 1870 ret = -EPERM;
1871 btrfs_err(root->fs_info, "deleting default subvolume "
1872 "%llu is not allowed", key.objectid);
1801 goto out; 1873 goto out;
1802 } 1874 }
1803 btrfs_release_path(path); 1875 btrfs_release_path(path);
@@ -2994,8 +3066,9 @@ process_slot:
2994 new_key.offset + datal, 3066 new_key.offset + datal,
2995 1); 3067 1);
2996 if (ret) { 3068 if (ret) {
2997 btrfs_abort_transaction(trans, root, 3069 if (ret != -EINVAL)
2998 ret); 3070 btrfs_abort_transaction(trans,
3071 root, ret);
2999 btrfs_end_transaction(trans, root); 3072 btrfs_end_transaction(trans, root);
3000 goto out; 3073 goto out;
3001 } 3074 }
@@ -3068,8 +3141,9 @@ process_slot:
3068 new_key.offset + datal, 3141 new_key.offset + datal,
3069 1); 3142 1);
3070 if (ret) { 3143 if (ret) {
3071 btrfs_abort_transaction(trans, root, 3144 if (ret != -EINVAL)
3072 ret); 3145 btrfs_abort_transaction(trans,
3146 root, ret);
3073 btrfs_end_transaction(trans, root); 3147 btrfs_end_transaction(trans, root);
3074 goto out; 3148 goto out;
3075 } 3149 }
@@ -3153,8 +3227,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
3153 * decompress into destination's address_space (the file offset 3227 * decompress into destination's address_space (the file offset
3154 * may change, so source mapping won't do), then recompress (or 3228 * may change, so source mapping won't do), then recompress (or
3155 * otherwise reinsert) a subrange. 3229 * otherwise reinsert) a subrange.
3156 * - allow ranges within the same file to be cloned (provided 3230 *
3157 * they don't overlap)? 3231 * - split destination inode's inline extents. The inline extents can
3232 * be either compressed or non-compressed.
3158 */ 3233 */
3159 3234
3160 /* the destination must be opened for writing */ 3235 /* the destination must be opened for writing */
@@ -3465,6 +3540,11 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
3465 up_read(&info->groups_sem); 3540 up_read(&info->groups_sem);
3466 } 3541 }
3467 3542
3543 /*
3544 * Global block reserve, exported as a space_info
3545 */
3546 slot_count++;
3547
3468 /* space_slots == 0 means they are asking for a count */ 3548 /* space_slots == 0 means they are asking for a count */
3469 if (space_args.space_slots == 0) { 3549 if (space_args.space_slots == 0) {
3470 space_args.total_spaces = slot_count; 3550 space_args.total_spaces = slot_count;
@@ -3523,6 +3603,21 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
3523 up_read(&info->groups_sem); 3603 up_read(&info->groups_sem);
3524 } 3604 }
3525 3605
3606 /*
3607 * Add global block reserve
3608 */
3609 if (slot_count) {
3610 struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv;
3611
3612 spin_lock(&block_rsv->lock);
3613 space.total_bytes = block_rsv->size;
3614 space.used_bytes = block_rsv->size - block_rsv->reserved;
3615 spin_unlock(&block_rsv->lock);
3616 space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV;
3617 memcpy(dest, &space, sizeof(space));
3618 space_args.total_spaces++;
3619 }
3620
3526 user_dest = (struct btrfs_ioctl_space_info __user *) 3621 user_dest = (struct btrfs_ioctl_space_info __user *)
3527 (arg + sizeof(struct btrfs_ioctl_space_args)); 3622 (arg + sizeof(struct btrfs_ioctl_space_args));
3528 3623
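
The global block reserve is reported by snapshotting its size and reserved bytes under the reserve's spinlock, with used = size - reserved. A compilable sketch of that snapshot pattern with simplified stand-in types (a pthread mutex fills in for the spinlock):

#include <pthread.h>
#include <stdio.h>

struct block_rsv {
    pthread_mutex_t lock;
    unsigned long long size;
    unsigned long long reserved;
};

struct space_info {
    unsigned long long total_bytes;
    unsigned long long used_bytes;
};

static struct space_info snapshot(struct block_rsv *rsv)
{
    struct space_info s;

    pthread_mutex_lock(&rsv->lock);
    s.total_bytes = rsv->size;
    s.used_bytes = rsv->size - rsv->reserved; /* same computation as the patch */
    pthread_mutex_unlock(&rsv->lock);
    return s;
}

int main(void)
{
    struct block_rsv rsv = { PTHREAD_MUTEX_INITIALIZER, 1 << 25, 1 << 24 };
    struct space_info s = snapshot(&rsv);

    printf("total=%llu used=%llu\n", s.total_bytes, s.used_bytes);
    return 0;
}
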
@@ -4353,10 +4448,9 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
4353 return btrfs_qgroup_wait_for_completion(root->fs_info); 4448 return btrfs_qgroup_wait_for_completion(root->fs_info);
4354} 4449}
4355 4450
4356static long btrfs_ioctl_set_received_subvol(struct file *file, 4451static long _btrfs_ioctl_set_received_subvol(struct file *file,
4357 void __user *arg) 4452 struct btrfs_ioctl_received_subvol_args *sa)
4358{ 4453{
4359 struct btrfs_ioctl_received_subvol_args *sa = NULL;
4360 struct inode *inode = file_inode(file); 4454 struct inode *inode = file_inode(file);
4361 struct btrfs_root *root = BTRFS_I(inode)->root; 4455 struct btrfs_root *root = BTRFS_I(inode)->root;
4362 struct btrfs_root_item *root_item = &root->root_item; 4456 struct btrfs_root_item *root_item = &root->root_item;
@@ -4384,13 +4478,6 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
4384 goto out; 4478 goto out;
4385 } 4479 }
4386 4480
4387 sa = memdup_user(arg, sizeof(*sa));
4388 if (IS_ERR(sa)) {
4389 ret = PTR_ERR(sa);
4390 sa = NULL;
4391 goto out;
4392 }
4393
4394 /* 4481 /*
4395 * 1 - root item 4482 * 1 - root item
4396 * 2 - uuid items (received uuid + subvol uuid) 4483 * 2 - uuid items (received uuid + subvol uuid)
@@ -4444,14 +4531,90 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
4444 goto out; 4531 goto out;
4445 } 4532 }
4446 4533
4534out:
4535 up_write(&root->fs_info->subvol_sem);
4536 mnt_drop_write_file(file);
4537 return ret;
4538}
4539
4540#ifdef CONFIG_64BIT
4541static long btrfs_ioctl_set_received_subvol_32(struct file *file,
4542 void __user *arg)
4543{
4544 struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL;
4545 struct btrfs_ioctl_received_subvol_args *args64 = NULL;
4546 int ret = 0;
4547
4548 args32 = memdup_user(arg, sizeof(*args32));
4549 if (IS_ERR(args32)) {
4550 ret = PTR_ERR(args32);
4551 args32 = NULL;
4552 goto out;
4553 }
4554
4555 args64 = kmalloc(sizeof(*args64), GFP_NOFS);
4556 if (!args64) {
4557 ret = -ENOMEM;
4558 goto out;
4559 }
4560
4561 memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE);
4562 args64->stransid = args32->stransid;
4563 args64->rtransid = args32->rtransid;
4564 args64->stime.sec = args32->stime.sec;
4565 args64->stime.nsec = args32->stime.nsec;
4566 args64->rtime.sec = args32->rtime.sec;
4567 args64->rtime.nsec = args32->rtime.nsec;
4568 args64->flags = args32->flags;
4569
4570 ret = _btrfs_ioctl_set_received_subvol(file, args64);
4571 if (ret)
4572 goto out;
4573
4574 memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE);
4575 args32->stransid = args64->stransid;
4576 args32->rtransid = args64->rtransid;
4577 args32->stime.sec = args64->stime.sec;
4578 args32->stime.nsec = args64->stime.nsec;
4579 args32->rtime.sec = args64->rtime.sec;
4580 args32->rtime.nsec = args64->rtime.nsec;
4581 args32->flags = args64->flags;
4582
4583 ret = copy_to_user(arg, args32, sizeof(*args32));
4584 if (ret)
4585 ret = -EFAULT;
4586
4587out:
4588 kfree(args32);
4589 kfree(args64);
4590 return ret;
4591}
4592#endif
4593
4594static long btrfs_ioctl_set_received_subvol(struct file *file,
4595 void __user *arg)
4596{
4597 struct btrfs_ioctl_received_subvol_args *sa = NULL;
4598 int ret = 0;
4599
4600 sa = memdup_user(arg, sizeof(*sa));
4601 if (IS_ERR(sa)) {
4602 ret = PTR_ERR(sa);
4603 sa = NULL;
4604 goto out;
4605 }
4606
4607 ret = _btrfs_ioctl_set_received_subvol(file, sa);
4608
4609 if (ret)
4610 goto out;
4611
4447 ret = copy_to_user(arg, sa, sizeof(*sa)); 4612 ret = copy_to_user(arg, sa, sizeof(*sa));
4448 if (ret) 4613 if (ret)
4449 ret = -EFAULT; 4614 ret = -EFAULT;
4450 4615
4451out: 4616out:
4452 kfree(sa); 4617 kfree(sa);
4453 up_write(&root->fs_info->subvol_sem);
4454 mnt_drop_write_file(file);
4455 return ret; 4618 return ret;
4456} 4619}
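
The compat path above never copies the packed 32-bit layout straight into the native struct; it widens every field by hand, calls the common helper, and narrows the results back. A user-space sketch of why: a packed 64+32-bit timespec is 12 bytes while the naturally aligned one is 16, so a memcpy() would shear every later field. The types below are simplified stand-ins:

#include <stdint.h>
#include <stdio.h>

struct ts32 { uint64_t sec; uint32_t nsec; } __attribute__((packed));
struct ts64 { uint64_t sec; uint32_t nsec; uint32_t pad; };

int main(void)
{
    struct ts32 in = { 1234567890ULL, 42 };
    struct ts64 native;

    /* widen field by field, never memcpy() across the two layouts */
    native.sec = in.sec;
    native.nsec = in.nsec;
    native.pad = 0;

    printf("sizeof ts32=%zu ts64=%zu sec=%llu nsec=%u\n",
           sizeof(in), sizeof(native),
           (unsigned long long)native.sec, native.nsec);
    return 0;
}
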
4457 4620
@@ -4746,7 +4909,7 @@ long btrfs_ioctl(struct file *file, unsigned int
4746 case BTRFS_IOC_SYNC: { 4909 case BTRFS_IOC_SYNC: {
4747 int ret; 4910 int ret;
4748 4911
4749 ret = btrfs_start_delalloc_roots(root->fs_info, 0); 4912 ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
4750 if (ret) 4913 if (ret)
4751 return ret; 4914 return ret;
4752 ret = btrfs_sync_fs(file->f_dentry->d_sb, 1); 4915 ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
@@ -4770,6 +4933,10 @@ long btrfs_ioctl(struct file *file, unsigned int
4770 return btrfs_ioctl_balance_progress(root, argp); 4933 return btrfs_ioctl_balance_progress(root, argp);
4771 case BTRFS_IOC_SET_RECEIVED_SUBVOL: 4934 case BTRFS_IOC_SET_RECEIVED_SUBVOL:
4772 return btrfs_ioctl_set_received_subvol(file, argp); 4935 return btrfs_ioctl_set_received_subvol(file, argp);
4936#ifdef CONFIG_64BIT
4937 case BTRFS_IOC_SET_RECEIVED_SUBVOL_32:
4938 return btrfs_ioctl_set_received_subvol_32(file, argp);
4939#endif
4773 case BTRFS_IOC_SEND: 4940 case BTRFS_IOC_SEND:
4774 return btrfs_ioctl_send(file, argp); 4941 return btrfs_ioctl_send(file, argp);
4775 case BTRFS_IOC_GET_DEV_STATS: 4942 case BTRFS_IOC_GET_DEV_STATS:
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b16450b840e7..a94b05f72869 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -349,10 +349,13 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
349 if (!uptodate) 349 if (!uptodate)
350 set_bit(BTRFS_ORDERED_IOERR, &entry->flags); 350 set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
351 351
352 if (entry->bytes_left == 0) 352 if (entry->bytes_left == 0) {
353 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 353 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
354 else 354 if (waitqueue_active(&entry->wait))
355 wake_up(&entry->wait);
356 } else {
355 ret = 1; 357 ret = 1;
358 }
356out: 359out:
357 if (!ret && cached && entry) { 360 if (!ret && cached && entry) {
358 *cached = entry; 361 *cached = entry;
@@ -410,10 +413,13 @@ have_entry:
410 if (!uptodate) 413 if (!uptodate)
411 set_bit(BTRFS_ORDERED_IOERR, &entry->flags); 414 set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
412 415
413 if (entry->bytes_left == 0) 416 if (entry->bytes_left == 0) {
414 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 417 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
415 else 418 if (waitqueue_active(&entry->wait))
419 wake_up(&entry->wait);
420 } else {
416 ret = 1; 421 ret = 1;
422 }
417out: 423out:
418 if (!ret && cached && entry) { 424 if (!ret && cached && entry) {
419 *cached = entry; 425 *cached = entry;
@@ -424,27 +430,48 @@ out:
424} 430}
425 431
426/* Needs to either be called under a log transaction or the log_mutex */ 432/* Needs to either be called under a log transaction or the log_mutex */
427void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode) 433void btrfs_get_logged_extents(struct inode *inode,
434 struct list_head *logged_list)
428{ 435{
429 struct btrfs_ordered_inode_tree *tree; 436 struct btrfs_ordered_inode_tree *tree;
430 struct btrfs_ordered_extent *ordered; 437 struct btrfs_ordered_extent *ordered;
431 struct rb_node *n; 438 struct rb_node *n;
432 int index = log->log_transid % 2;
433 439
434 tree = &BTRFS_I(inode)->ordered_tree; 440 tree = &BTRFS_I(inode)->ordered_tree;
435 spin_lock_irq(&tree->lock); 441 spin_lock_irq(&tree->lock);
436 for (n = rb_first(&tree->tree); n; n = rb_next(n)) { 442 for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
437 ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); 443 ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
438 spin_lock(&log->log_extents_lock[index]); 444 if (!list_empty(&ordered->log_list))
439 if (list_empty(&ordered->log_list)) { 445 continue;
440 list_add_tail(&ordered->log_list, &log->logged_list[index]); 446 list_add_tail(&ordered->log_list, logged_list);
441 atomic_inc(&ordered->refs); 447 atomic_inc(&ordered->refs);
442 }
443 spin_unlock(&log->log_extents_lock[index]);
444 } 448 }
445 spin_unlock_irq(&tree->lock); 449 spin_unlock_irq(&tree->lock);
446} 450}
447 451
452void btrfs_put_logged_extents(struct list_head *logged_list)
453{
454 struct btrfs_ordered_extent *ordered;
455
456 while (!list_empty(logged_list)) {
457 ordered = list_first_entry(logged_list,
458 struct btrfs_ordered_extent,
459 log_list);
460 list_del_init(&ordered->log_list);
461 btrfs_put_ordered_extent(ordered);
462 }
463}
464
465void btrfs_submit_logged_extents(struct list_head *logged_list,
466 struct btrfs_root *log)
467{
468 int index = log->log_transid % 2;
469
470 spin_lock_irq(&log->log_extents_lock[index]);
471 list_splice_tail(logged_list, &log->logged_list[index]);
472 spin_unlock_irq(&log->log_extents_lock[index]);
473}
474
448void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) 475void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
449{ 476{
450 struct btrfs_ordered_extent *ordered; 477 struct btrfs_ordered_extent *ordered;
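
The reworked logging above collects ordered extents on a caller-local list with only the tree lock held, then splices the whole list into the log's per-transaction list in one step under the log lock. A toy user-space model of that build-privately-then-splice pattern:

#include <stdio.h>

struct node { struct node *next; int val; };

static void splice_tail(struct node **dst, struct node *src)
{
    while (*dst)
        dst = &(*dst)->next;   /* walk to the tail of the destination */
    *dst = src;
}

int main(void)
{
    struct node c = { NULL, 3 }, b = { &c, 2 }, a = { &b, 1 };
    struct node *log_list = NULL;

    /* phase 1 built the private list; phase 2 is a single splice */
    splice_tail(&log_list, &a);
    for (struct node *n = log_list; n; n = n->next)
        printf("%d ", n->val);
    putchar('\n');
    return 0;
}
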
@@ -577,7 +604,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
577 INIT_LIST_HEAD(&splice); 604 INIT_LIST_HEAD(&splice);
578 INIT_LIST_HEAD(&works); 605 INIT_LIST_HEAD(&works);
579 606
580 mutex_lock(&root->fs_info->ordered_operations_mutex); 607 mutex_lock(&root->ordered_extent_mutex);
581 spin_lock(&root->ordered_extent_lock); 608 spin_lock(&root->ordered_extent_lock);
582 list_splice_init(&root->ordered_extents, &splice); 609 list_splice_init(&root->ordered_extents, &splice);
583 while (!list_empty(&splice) && nr) { 610 while (!list_empty(&splice) && nr) {
@@ -588,10 +615,11 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
588 atomic_inc(&ordered->refs); 615 atomic_inc(&ordered->refs);
589 spin_unlock(&root->ordered_extent_lock); 616 spin_unlock(&root->ordered_extent_lock);
590 617
591 ordered->flush_work.func = btrfs_run_ordered_extent_work; 618 btrfs_init_work(&ordered->flush_work,
619 btrfs_run_ordered_extent_work, NULL, NULL);
592 list_add_tail(&ordered->work_list, &works); 620 list_add_tail(&ordered->work_list, &works);
593 btrfs_queue_worker(&root->fs_info->flush_workers, 621 btrfs_queue_work(root->fs_info->flush_workers,
594 &ordered->flush_work); 622 &ordered->flush_work);
595 623
596 cond_resched(); 624 cond_resched();
597 spin_lock(&root->ordered_extent_lock); 625 spin_lock(&root->ordered_extent_lock);
@@ -608,7 +636,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
608 btrfs_put_ordered_extent(ordered); 636 btrfs_put_ordered_extent(ordered);
609 cond_resched(); 637 cond_resched();
610 } 638 }
611 mutex_unlock(&root->fs_info->ordered_operations_mutex); 639 mutex_unlock(&root->ordered_extent_mutex);
612 640
613 return count; 641 return count;
614} 642}
@@ -621,6 +649,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
621 649
622 INIT_LIST_HEAD(&splice); 650 INIT_LIST_HEAD(&splice);
623 651
652 mutex_lock(&fs_info->ordered_operations_mutex);
624 spin_lock(&fs_info->ordered_root_lock); 653 spin_lock(&fs_info->ordered_root_lock);
625 list_splice_init(&fs_info->ordered_roots, &splice); 654 list_splice_init(&fs_info->ordered_roots, &splice);
626 while (!list_empty(&splice) && nr) { 655 while (!list_empty(&splice) && nr) {
@@ -643,6 +672,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
643 } 672 }
644 list_splice_tail(&splice, &fs_info->ordered_roots); 673 list_splice_tail(&splice, &fs_info->ordered_roots);
645 spin_unlock(&fs_info->ordered_root_lock); 674 spin_unlock(&fs_info->ordered_root_lock);
675 mutex_unlock(&fs_info->ordered_operations_mutex);
646} 676}
647 677
648/* 678/*
@@ -704,8 +734,8 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
704 goto out; 734 goto out;
705 } 735 }
706 list_add_tail(&work->list, &works); 736 list_add_tail(&work->list, &works);
707 btrfs_queue_worker(&root->fs_info->flush_workers, 737 btrfs_queue_work(root->fs_info->flush_workers,
708 &work->work); 738 &work->work);
709 739
710 cond_resched(); 740 cond_resched();
711 spin_lock(&root->fs_info->ordered_root_lock); 741 spin_lock(&root->fs_info->ordered_root_lock);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 9b0450f7ac20..246897058efb 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -197,7 +197,11 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
197 struct inode *inode); 197 struct inode *inode);
198int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); 198int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
199void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); 199void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
200void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode); 200void btrfs_get_logged_extents(struct inode *inode,
201 struct list_head *logged_list);
202void btrfs_put_logged_extents(struct list_head *logged_list);
203void btrfs_submit_logged_extents(struct list_head *logged_list,
204 struct btrfs_root *log);
201void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); 205void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
202void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); 206void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
203int __init ordered_data_init(void); 207int __init ordered_data_init(void);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 472302a2d745..2cf905877aaf 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1509,8 +1509,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
1509 ret = qgroup_rescan_init(fs_info, 0, 1); 1509 ret = qgroup_rescan_init(fs_info, 0, 1);
1510 if (!ret) { 1510 if (!ret) {
1511 qgroup_rescan_zero_tracking(fs_info); 1511 qgroup_rescan_zero_tracking(fs_info);
1512 btrfs_queue_worker(&fs_info->qgroup_rescan_workers, 1512 btrfs_queue_work(fs_info->qgroup_rescan_workers,
1513 &fs_info->qgroup_rescan_work); 1513 &fs_info->qgroup_rescan_work);
1514 } 1514 }
1515 ret = 0; 1515 ret = 0;
1516 } 1516 }
@@ -2095,7 +2095,8 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
2095 2095
2096 memset(&fs_info->qgroup_rescan_work, 0, 2096 memset(&fs_info->qgroup_rescan_work, 0,
2097 sizeof(fs_info->qgroup_rescan_work)); 2097 sizeof(fs_info->qgroup_rescan_work));
2098 fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker; 2098 btrfs_init_work(&fs_info->qgroup_rescan_work,
2099 btrfs_qgroup_rescan_worker, NULL, NULL);
2099 2100
2100 if (ret) { 2101 if (ret) {
2101err: 2102err:
@@ -2158,8 +2159,8 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
2158 2159
2159 qgroup_rescan_zero_tracking(fs_info); 2160 qgroup_rescan_zero_tracking(fs_info);
2160 2161
2161 btrfs_queue_worker(&fs_info->qgroup_rescan_workers, 2162 btrfs_queue_work(fs_info->qgroup_rescan_workers,
2162 &fs_info->qgroup_rescan_work); 2163 &fs_info->qgroup_rescan_work);
2163 2164
2164 return 0; 2165 return 0;
2165} 2166}
@@ -2190,6 +2191,6 @@ void
2190btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) 2191btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
2191{ 2192{
2192 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) 2193 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
2193 btrfs_queue_worker(&fs_info->qgroup_rescan_workers, 2194 btrfs_queue_work(fs_info->qgroup_rescan_workers,
2194 &fs_info->qgroup_rescan_work); 2195 &fs_info->qgroup_rescan_work);
2195} 2196}
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 9af0b25d991a..4055291a523e 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1416,20 +1416,18 @@ cleanup:
1416 1416
1417static void async_rmw_stripe(struct btrfs_raid_bio *rbio) 1417static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1418{ 1418{
1419 rbio->work.flags = 0; 1419 btrfs_init_work(&rbio->work, rmw_work, NULL, NULL);
1420 rbio->work.func = rmw_work;
1421 1420
1422 btrfs_queue_worker(&rbio->fs_info->rmw_workers, 1421 btrfs_queue_work(rbio->fs_info->rmw_workers,
1423 &rbio->work); 1422 &rbio->work);
1424} 1423}
1425 1424
1426static void async_read_rebuild(struct btrfs_raid_bio *rbio) 1425static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1427{ 1426{
1428 rbio->work.flags = 0; 1427 btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL);
1429 rbio->work.func = read_rebuild_work;
1430 1428
1431 btrfs_queue_worker(&rbio->fs_info->rmw_workers, 1429 btrfs_queue_work(rbio->fs_info->rmw_workers,
1432 &rbio->work); 1430 &rbio->work);
1433} 1431}
1434 1432
1435/* 1433/*
@@ -1667,10 +1665,9 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1667 plug = container_of(cb, struct btrfs_plug_cb, cb); 1665 plug = container_of(cb, struct btrfs_plug_cb, cb);
1668 1666
1669 if (from_schedule) { 1667 if (from_schedule) {
1670 plug->work.flags = 0; 1668 btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
1671 plug->work.func = unplug_work; 1669 btrfs_queue_work(plug->info->rmw_workers,
1672 btrfs_queue_worker(&plug->info->rmw_workers, 1670 &plug->work);
1673 &plug->work);
1674 return; 1671 return;
1675 } 1672 }
1676 run_plug(plug); 1673 run_plug(plug);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 31c797c48c3e..30947f923620 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -793,10 +793,10 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info)
793 /* FIXME we cannot handle this properly right now */ 793 /* FIXME we cannot handle this properly right now */
794 BUG(); 794 BUG();
795 } 795 }
796 rmw->work.func = reada_start_machine_worker; 796 btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
797 rmw->fs_info = fs_info; 797 rmw->fs_info = fs_info;
798 798
799 btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work); 799 btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
800} 800}
801 801
802#ifdef DEBUG 802#ifdef DEBUG
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 07b3b36f40ee..7f92ab1daa87 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2317,7 +2317,6 @@ void free_reloc_roots(struct list_head *list)
2317static noinline_for_stack 2317static noinline_for_stack
2318int merge_reloc_roots(struct reloc_control *rc) 2318int merge_reloc_roots(struct reloc_control *rc)
2319{ 2319{
2320 struct btrfs_trans_handle *trans;
2321 struct btrfs_root *root; 2320 struct btrfs_root *root;
2322 struct btrfs_root *reloc_root; 2321 struct btrfs_root *reloc_root;
2323 u64 last_snap; 2322 u64 last_snap;
@@ -2375,26 +2374,6 @@ again:
2375 list_add_tail(&reloc_root->root_list, 2374 list_add_tail(&reloc_root->root_list,
2376 &reloc_roots); 2375 &reloc_roots);
2377 goto out; 2376 goto out;
2378 } else if (!ret) {
2379 /*
2380 * recover the last snapshot tranid to avoid
2381 * the space balance break NOCOW.
2382 */
2383 root = read_fs_root(rc->extent_root->fs_info,
2384 objectid);
2385 if (IS_ERR(root))
2386 continue;
2387
2388 trans = btrfs_join_transaction(root);
2389 BUG_ON(IS_ERR(trans));
2390
2391 /* Check if the fs/file tree was snapshoted or not. */
2392 if (btrfs_root_last_snapshot(&root->root_item) ==
2393 otransid - 1)
2394 btrfs_set_root_last_snapshot(&root->root_item,
2395 last_snap);
2396
2397 btrfs_end_transaction(trans, root);
2398 } 2377 }
2399 } 2378 }
2400 2379
@@ -4248,7 +4227,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4248 btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu", 4227 btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu",
4249 rc->block_group->key.objectid, rc->block_group->flags); 4228 rc->block_group->key.objectid, rc->block_group->flags);
4250 4229
4251 ret = btrfs_start_delalloc_roots(fs_info, 0); 4230 ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
4252 if (ret < 0) { 4231 if (ret < 0) {
4253 err = ret; 4232 err = ret;
4254 goto out; 4233 goto out;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 1389b69059de..38bb47e7d6b1 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -16,6 +16,7 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/err.h>
19#include <linux/uuid.h> 20#include <linux/uuid.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "transaction.h" 22#include "transaction.h"
@@ -271,7 +272,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
271 key.offset++; 272 key.offset++;
272 273
273 root = btrfs_read_fs_root(tree_root, &root_key); 274 root = btrfs_read_fs_root(tree_root, &root_key);
274 err = PTR_RET(root); 275 err = PTR_ERR_OR_ZERO(root);
275 if (err && err != -ENOENT) { 276 if (err && err != -ENOENT) {
276 break; 277 break;
277 } else if (err == -ENOENT) { 278 } else if (err == -ENOENT) {
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index efba5d1282ee..0be77993378e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -315,6 +315,16 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
315 atomic_inc(&fs_info->scrubs_running); 315 atomic_inc(&fs_info->scrubs_running);
316 atomic_inc(&fs_info->scrubs_paused); 316 atomic_inc(&fs_info->scrubs_paused);
317 mutex_unlock(&fs_info->scrub_lock); 317 mutex_unlock(&fs_info->scrub_lock);
318
319 /*
 320 * Checking the @scrubs_running == @scrubs_paused condition
 321 * inside wait_event() is not an atomic operation, which
 322 * means we may inc/dec @scrubs_running/paused at any time.
 323 * Wake up @scrub_pause_wait as often as we can so that
 324 * transaction commit is blocked as little as possible.

325 */
326 wake_up(&fs_info->scrub_pause_wait);
327
318 atomic_inc(&sctx->workers_pending); 328 atomic_inc(&sctx->workers_pending);
319} 329}
320 330
@@ -418,7 +428,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
418 sbio->index = i; 428 sbio->index = i;
419 sbio->sctx = sctx; 429 sbio->sctx = sctx;
420 sbio->page_count = 0; 430 sbio->page_count = 0;
421 sbio->work.func = scrub_bio_end_io_worker; 431 btrfs_init_work(&sbio->work, scrub_bio_end_io_worker,
432 NULL, NULL);
422 433
423 if (i != SCRUB_BIOS_PER_SCTX - 1) 434 if (i != SCRUB_BIOS_PER_SCTX - 1)
424 sctx->bios[i]->next_free = i + 1; 435 sctx->bios[i]->next_free = i + 1;
@@ -987,9 +998,10 @@ nodatasum_case:
987 fixup_nodatasum->root = fs_info->extent_root; 998 fixup_nodatasum->root = fs_info->extent_root;
988 fixup_nodatasum->mirror_num = failed_mirror_index + 1; 999 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
989 scrub_pending_trans_workers_inc(sctx); 1000 scrub_pending_trans_workers_inc(sctx);
990 fixup_nodatasum->work.func = scrub_fixup_nodatasum; 1001 btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum,
991 btrfs_queue_worker(&fs_info->scrub_workers, 1002 NULL, NULL);
992 &fixup_nodatasum->work); 1003 btrfs_queue_work(fs_info->scrub_workers,
1004 &fixup_nodatasum->work);
993 goto out; 1005 goto out;
994 } 1006 }
995 1007
@@ -1603,8 +1615,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err)
1603 sbio->err = err; 1615 sbio->err = err;
1604 sbio->bio = bio; 1616 sbio->bio = bio;
1605 1617
1606 sbio->work.func = scrub_wr_bio_end_io_worker; 1618 btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
1607 btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work); 1619 btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1608} 1620}
1609 1621
1610static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) 1622static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
@@ -2072,7 +2084,7 @@ static void scrub_bio_end_io(struct bio *bio, int err)
2072 sbio->err = err; 2084 sbio->err = err;
2073 sbio->bio = bio; 2085 sbio->bio = bio;
2074 2086
2075 btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); 2087 btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2076} 2088}
2077 2089
2078static void scrub_bio_end_io_worker(struct btrfs_work *work) 2090static void scrub_bio_end_io_worker(struct btrfs_work *work)
@@ -2223,6 +2235,47 @@ behind_scrub_pages:
2223 return 0; 2235 return 0;
2224} 2236}
2225 2237
2238/*
 2239 * Given a physical address, this will calculate its
 2240 * logical offset. If this is a parity stripe, it will return
 2241 * the leftmost data stripe's logical offset.
 2242 *
 2243 * Returns 0 if it is a data stripe, 1 if it is a parity stripe.
2244 */
2245static int get_raid56_logic_offset(u64 physical, int num,
2246 struct map_lookup *map, u64 *offset)
2247{
2248 int i;
2249 int j = 0;
2250 u64 stripe_nr;
2251 u64 last_offset;
2252 int stripe_index;
2253 int rot;
2254
2255 last_offset = (physical - map->stripes[num].physical) *
2256 nr_data_stripes(map);
2257 *offset = last_offset;
2258 for (i = 0; i < nr_data_stripes(map); i++) {
2259 *offset = last_offset + i * map->stripe_len;
2260
2261 stripe_nr = *offset;
2262 do_div(stripe_nr, map->stripe_len);
2263 do_div(stripe_nr, nr_data_stripes(map));
2264
2265 /* Work out the disk rotation on this stripe-set */
2266 rot = do_div(stripe_nr, map->num_stripes);
 2267 /* calculate which stripe this data is located on */
2268 rot += i;
2269 stripe_index = rot % map->num_stripes;
2270 if (stripe_index == num)
2271 return 0;
2272 if (stripe_index < num)
2273 j++;
2274 }
2275 *offset = last_offset + j * map->stripe_len;
2276 return 1;
2277}
2278
2226static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, 2279static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2227 struct map_lookup *map, 2280 struct map_lookup *map,
2228 struct btrfs_device *scrub_dev, 2281 struct btrfs_device *scrub_dev,
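
get_raid56_logic_offset() walks the data stripes of one stripe-set, reproduces the rotation btrfs uses to place parity, and returns early when the requested device holds data. A stand-alone replica of that math, with do_div() replaced by plain 64-bit division so it runs in user space; the sample values assume a hypothetical 4-device RAID5 map with 3 data stripes:

#include <stdint.h>
#include <stdio.h>

static int raid56_logic_offset(uint64_t physical, int num, int num_stripes,
                               int nr_data, uint64_t stripe_len,
                               uint64_t stripe_phys_start, uint64_t *offset)
{
    uint64_t last_offset = (physical - stripe_phys_start) * nr_data;
    int j = 0;

    for (int i = 0; i < nr_data; i++) {
        uint64_t off = last_offset + (uint64_t)i * stripe_len;
        uint64_t stripe_nr = off / stripe_len / nr_data;
        int rot = (int)(stripe_nr % num_stripes) + i;
        int stripe_index = rot % num_stripes;

        if (stripe_index == num) {
            *offset = off;
            return 0;              /* this device holds data here */
        }
        if (stripe_index < num)
            j++;
    }
    *offset = last_offset + (uint64_t)j * stripe_len;
    return 1;                      /* this device holds parity here */
}

int main(void)
{
    uint64_t off;
    int ret = raid56_logic_offset(4 * 65536, 2, 4, 3, 65536, 0, &off);

    printf("parity=%d logical_offset=%llu\n", ret, (unsigned long long)off);
    return 0;
}
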
@@ -2244,6 +2297,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2244 u64 physical; 2297 u64 physical;
2245 u64 logical; 2298 u64 logical;
2246 u64 logic_end; 2299 u64 logic_end;
2300 u64 physical_end;
2247 u64 generation; 2301 u64 generation;
2248 int mirror_num; 2302 int mirror_num;
2249 struct reada_control *reada1; 2303 struct reada_control *reada1;
@@ -2257,16 +2311,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2257 u64 extent_len; 2311 u64 extent_len;
2258 struct btrfs_device *extent_dev; 2312 struct btrfs_device *extent_dev;
2259 int extent_mirror_num; 2313 int extent_mirror_num;
2260 int stop_loop; 2314 int stop_loop = 0;
2261
2262 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2263 BTRFS_BLOCK_GROUP_RAID6)) {
2264 if (num >= nr_data_stripes(map)) {
2265 return 0;
2266 }
2267 }
2268 2315
2269 nstripes = length; 2316 nstripes = length;
2317 physical = map->stripes[num].physical;
2270 offset = 0; 2318 offset = 0;
2271 do_div(nstripes, map->stripe_len); 2319 do_div(nstripes, map->stripe_len);
2272 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 2320 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
@@ -2284,6 +2332,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2284 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 2332 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2285 increment = map->stripe_len; 2333 increment = map->stripe_len;
2286 mirror_num = num % map->num_stripes + 1; 2334 mirror_num = num % map->num_stripes + 1;
2335 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2336 BTRFS_BLOCK_GROUP_RAID6)) {
2337 get_raid56_logic_offset(physical, num, map, &offset);
2338 increment = map->stripe_len * nr_data_stripes(map);
2339 mirror_num = 1;
2287 } else { 2340 } else {
2288 increment = map->stripe_len; 2341 increment = map->stripe_len;
2289 mirror_num = 1; 2342 mirror_num = 1;
@@ -2307,7 +2360,15 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2307 * to not hold off transaction commits 2360 * to not hold off transaction commits
2308 */ 2361 */
2309 logical = base + offset; 2362 logical = base + offset;
2310 2363 physical_end = physical + nstripes * map->stripe_len;
2364 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2365 BTRFS_BLOCK_GROUP_RAID6)) {
2366 get_raid56_logic_offset(physical_end, num,
2367 map, &logic_end);
2368 logic_end += base;
2369 } else {
2370 logic_end = logical + increment * nstripes;
2371 }
2311 wait_event(sctx->list_wait, 2372 wait_event(sctx->list_wait,
2312 atomic_read(&sctx->bios_in_flight) == 0); 2373 atomic_read(&sctx->bios_in_flight) == 0);
2313 scrub_blocked_if_needed(fs_info); 2374 scrub_blocked_if_needed(fs_info);
@@ -2316,7 +2377,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2316 key_start.objectid = logical; 2377 key_start.objectid = logical;
2317 key_start.type = BTRFS_EXTENT_ITEM_KEY; 2378 key_start.type = BTRFS_EXTENT_ITEM_KEY;
2318 key_start.offset = (u64)0; 2379 key_start.offset = (u64)0;
2319 key_end.objectid = base + offset + nstripes * increment; 2380 key_end.objectid = logic_end;
2320 key_end.type = BTRFS_METADATA_ITEM_KEY; 2381 key_end.type = BTRFS_METADATA_ITEM_KEY;
2321 key_end.offset = (u64)-1; 2382 key_end.offset = (u64)-1;
2322 reada1 = btrfs_reada_add(root, &key_start, &key_end); 2383 reada1 = btrfs_reada_add(root, &key_start, &key_end);
@@ -2326,7 +2387,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2326 key_start.offset = logical; 2387 key_start.offset = logical;
2327 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 2388 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
2328 key_end.type = BTRFS_EXTENT_CSUM_KEY; 2389 key_end.type = BTRFS_EXTENT_CSUM_KEY;
2329 key_end.offset = base + offset + nstripes * increment; 2390 key_end.offset = logic_end;
2330 reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); 2391 reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
2331 2392
2332 if (!IS_ERR(reada1)) 2393 if (!IS_ERR(reada1))
@@ -2344,11 +2405,17 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2344 /* 2405 /*
2345 * now find all extents for each stripe and scrub them 2406 * now find all extents for each stripe and scrub them
2346 */ 2407 */
2347 logical = base + offset;
2348 physical = map->stripes[num].physical;
2349 logic_end = logical + increment * nstripes;
2350 ret = 0; 2408 ret = 0;
2351 while (logical < logic_end) { 2409 while (physical < physical_end) {
2410 /* for raid56, we skip parity stripe */
2411 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2412 BTRFS_BLOCK_GROUP_RAID6)) {
2413 ret = get_raid56_logic_offset(physical, num,
2414 map, &logical);
2415 logical += base;
2416 if (ret)
2417 goto skip;
2418 }
2352 /* 2419 /*
2353 * canceled? 2420 * canceled?
2354 */ 2421 */
@@ -2492,15 +2559,29 @@ again:
2492 scrub_free_csums(sctx); 2559 scrub_free_csums(sctx);
2493 if (extent_logical + extent_len < 2560 if (extent_logical + extent_len <
2494 key.objectid + bytes) { 2561 key.objectid + bytes) {
2495 logical += increment; 2562 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2496 physical += map->stripe_len; 2563 BTRFS_BLOCK_GROUP_RAID6)) {
2497 2564 /*
 2565 * loop until we find the next data stripe
 2566 * or we have finished all stripes.
2567 */
2568 do {
2569 physical += map->stripe_len;
2570 ret = get_raid56_logic_offset(
2571 physical, num,
2572 map, &logical);
2573 logical += base;
2574 } while (physical < physical_end && ret);
2575 } else {
2576 physical += map->stripe_len;
2577 logical += increment;
2578 }
2498 if (logical < key.objectid + bytes) { 2579 if (logical < key.objectid + bytes) {
2499 cond_resched(); 2580 cond_resched();
2500 goto again; 2581 goto again;
2501 } 2582 }
2502 2583
2503 if (logical >= logic_end) { 2584 if (physical >= physical_end) {
2504 stop_loop = 1; 2585 stop_loop = 1;
2505 break; 2586 break;
2506 } 2587 }
@@ -2509,6 +2590,7 @@ next:
2509 path->slots[0]++; 2590 path->slots[0]++;
2510 } 2591 }
2511 btrfs_release_path(path); 2592 btrfs_release_path(path);
2593skip:
2512 logical += increment; 2594 logical += increment;
2513 physical += map->stripe_len; 2595 physical += map->stripe_len;
2514 spin_lock(&sctx->stat_lock); 2596 spin_lock(&sctx->stat_lock);
@@ -2686,10 +2768,23 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2686 2768
2687 wait_event(sctx->list_wait, 2769 wait_event(sctx->list_wait,
2688 atomic_read(&sctx->bios_in_flight) == 0); 2770 atomic_read(&sctx->bios_in_flight) == 0);
2689 atomic_set(&sctx->wr_ctx.flush_all_writes, 0); 2771 atomic_inc(&fs_info->scrubs_paused);
2772 wake_up(&fs_info->scrub_pause_wait);
2773
2774 /*
 2775 * This must be done before we decrease @scrub_paused;
 2776 * it makes sure we don't block transaction commit while
 2777 * we are waiting for pending workers to finish.
2778 */
2690 wait_event(sctx->list_wait, 2779 wait_event(sctx->list_wait,
2691 atomic_read(&sctx->workers_pending) == 0); 2780 atomic_read(&sctx->workers_pending) == 0);
2692 scrub_blocked_if_needed(fs_info); 2781 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2782
2783 mutex_lock(&fs_info->scrub_lock);
2784 __scrub_blocked_if_needed(fs_info);
2785 atomic_dec(&fs_info->scrubs_paused);
2786 mutex_unlock(&fs_info->scrub_lock);
2787 wake_up(&fs_info->scrub_pause_wait);
2693 2788
2694 btrfs_put_block_group(cache); 2789 btrfs_put_block_group(cache);
2695 if (ret) 2790 if (ret)
@@ -2757,33 +2852,35 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
2757 int is_dev_replace) 2852 int is_dev_replace)
2758{ 2853{
2759 int ret = 0; 2854 int ret = 0;
2855 int flags = WQ_FREEZABLE | WQ_UNBOUND;
2856 int max_active = fs_info->thread_pool_size;
2760 2857
2761 if (fs_info->scrub_workers_refcnt == 0) { 2858 if (fs_info->scrub_workers_refcnt == 0) {
2762 if (is_dev_replace) 2859 if (is_dev_replace)
2763 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1, 2860 fs_info->scrub_workers =
2764 &fs_info->generic_worker); 2861 btrfs_alloc_workqueue("btrfs-scrub", flags,
2862 1, 4);
2765 else 2863 else
2766 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 2864 fs_info->scrub_workers =
2767 fs_info->thread_pool_size, 2865 btrfs_alloc_workqueue("btrfs-scrub", flags,
2768 &fs_info->generic_worker); 2866 max_active, 4);
2769 fs_info->scrub_workers.idle_thresh = 4; 2867 if (!fs_info->scrub_workers) {
2770 ret = btrfs_start_workers(&fs_info->scrub_workers); 2868 ret = -ENOMEM;
2771 if (ret)
2772 goto out; 2869 goto out;
2773 btrfs_init_workers(&fs_info->scrub_wr_completion_workers, 2870 }
2774 "scrubwrc", 2871 fs_info->scrub_wr_completion_workers =
2775 fs_info->thread_pool_size, 2872 btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
2776 &fs_info->generic_worker); 2873 max_active, 2);
2777 fs_info->scrub_wr_completion_workers.idle_thresh = 2; 2874 if (!fs_info->scrub_wr_completion_workers) {
2778 ret = btrfs_start_workers( 2875 ret = -ENOMEM;
2779 &fs_info->scrub_wr_completion_workers);
2780 if (ret)
2781 goto out; 2876 goto out;
2782 btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1, 2877 }
2783 &fs_info->generic_worker); 2878 fs_info->scrub_nocow_workers =
2784 ret = btrfs_start_workers(&fs_info->scrub_nocow_workers); 2879 btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
2785 if (ret) 2880 if (!fs_info->scrub_nocow_workers) {
2881 ret = -ENOMEM;
2786 goto out; 2882 goto out;
2883 }
2787 } 2884 }
2788 ++fs_info->scrub_workers_refcnt; 2885 ++fs_info->scrub_workers_refcnt;
2789out: 2886out:
@@ -2793,9 +2890,9 @@ out:
2793static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) 2890static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
2794{ 2891{
2795 if (--fs_info->scrub_workers_refcnt == 0) { 2892 if (--fs_info->scrub_workers_refcnt == 0) {
2796 btrfs_stop_workers(&fs_info->scrub_workers); 2893 btrfs_destroy_workqueue(fs_info->scrub_workers);
2797 btrfs_stop_workers(&fs_info->scrub_wr_completion_workers); 2894 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
2798 btrfs_stop_workers(&fs_info->scrub_nocow_workers); 2895 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
2799 } 2896 }
2800 WARN_ON(fs_info->scrub_workers_refcnt < 0); 2897 WARN_ON(fs_info->scrub_workers_refcnt < 0);
2801} 2898}
@@ -3106,10 +3203,10 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3106 nocow_ctx->len = len; 3203 nocow_ctx->len = len;
3107 nocow_ctx->mirror_num = mirror_num; 3204 nocow_ctx->mirror_num = mirror_num;
3108 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; 3205 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3109 nocow_ctx->work.func = copy_nocow_pages_worker; 3206 btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL);
3110 INIT_LIST_HEAD(&nocow_ctx->inodes); 3207 INIT_LIST_HEAD(&nocow_ctx->inodes);
3111 btrfs_queue_worker(&fs_info->scrub_nocow_workers, 3208 btrfs_queue_work(fs_info->scrub_nocow_workers,
3112 &nocow_ctx->work); 3209 &nocow_ctx->work);
3113 3210
3114 return 0; 3211 return 0;
3115} 3212}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 9dde9717c1b9..1ac3ca98c429 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -51,15 +51,18 @@ struct fs_path {
51 struct { 51 struct {
52 char *start; 52 char *start;
53 char *end; 53 char *end;
54 char *prepared;
55 54
56 char *buf; 55 char *buf;
57 int buf_len; 56 unsigned short buf_len:15;
58 unsigned int reversed:1; 57 unsigned short reversed:1;
59 unsigned int virtual_mem:1;
60 char inline_buf[]; 58 char inline_buf[];
61 }; 59 };
62 char pad[PAGE_SIZE]; 60 /*
 61 * The average path length does not exceed 200 bytes, so we'll get
 62 * better packing in the slab and a higher chance of satisfying
 63 * an allocation later during send.
64 */
65 char pad[256];
63 }; 66 };
64}; 67};
65#define FS_PATH_INLINE_SIZE \ 68#define FS_PATH_INLINE_SIZE \
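
The reshaped struct keeps buf aliasing inline_buf until a path outgrows the 256-byte union, so short paths never touch the allocator. A hedged userspace analog of the layout and its matching free path (small_path and its sizes are hypothetical):

        #include <stdlib.h>

        struct small_path {
                char *buf;               /* aliases inline_buf until we outgrow it */
                unsigned short buf_len;
                char inline_buf[256 - sizeof(char *) - sizeof(unsigned short)];
        };

        static void small_path_init(struct small_path *p)
        {
                p->buf = p->inline_buf;
                p->buf_len = sizeof(p->inline_buf);
        }

        static void small_path_free(struct small_path *p)
        {
                if (p->buf != p->inline_buf)
                        free(p->buf);    /* only paths that grew ever hit the heap */
        }
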
@@ -109,6 +112,7 @@ struct send_ctx {
109 int cur_inode_deleted; 112 int cur_inode_deleted;
110 u64 cur_inode_size; 113 u64 cur_inode_size;
111 u64 cur_inode_mode; 114 u64 cur_inode_mode;
115 u64 cur_inode_rdev;
112 u64 cur_inode_last_extent; 116 u64 cur_inode_last_extent;
113 117
114 u64 send_progress; 118 u64 send_progress;
@@ -120,6 +124,8 @@ struct send_ctx {
120 struct list_head name_cache_list; 124 struct list_head name_cache_list;
121 int name_cache_size; 125 int name_cache_size;
122 126
127 struct file_ra_state ra;
128
123 char *read_buf; 129 char *read_buf;
124 130
125 /* 131 /*
@@ -175,6 +181,47 @@ struct send_ctx {
175 * own move/rename can be performed. 181 * own move/rename can be performed.
176 */ 182 */
177 struct rb_root waiting_dir_moves; 183 struct rb_root waiting_dir_moves;
184
185 /*
186 * A directory that is going to be rm'ed might have a child directory
187 * which is in the pending directory moves index above. In this case,
188 * the directory can only be removed after the move/rename of its child
189 * is performed. Example:
190 *
191 * Parent snapshot:
192 *
193 * . (ino 256)
194 * |-- a/ (ino 257)
195 * |-- b/ (ino 258)
196 * |-- c/ (ino 259)
197 * | |-- x/ (ino 260)
198 * |
199 * |-- y/ (ino 261)
200 *
201 * Send snapshot:
202 *
203 * . (ino 256)
204 * |-- a/ (ino 257)
205 * |-- b/ (ino 258)
206 * |-- YY/ (ino 261)
207 * |-- x/ (ino 260)
208 *
209 * Sequence of steps that lead to the send snapshot:
210 * rm -f /a/b/c/foo.txt
211 * mv /a/b/y /a/b/YY
212 * mv /a/b/c/x /a/b/YY
213 * rmdir /a/b/c
214 *
215 * When the child is processed, its move/rename is delayed until its
216 * parent is processed (as explained above), but all other operations
 217 * like updating utimes, chown, chgrp, etc., are performed and the paths
218 * that it uses for those operations must use the orphanized name of
219 * its parent (the directory we're going to rm later), so we need to
220 * memorize that name.
221 *
222 * Indexed by the inode number of the directory to be deleted.
223 */
224 struct rb_root orphan_dirs;
178}; 225};
179 226
180struct pending_dir_move { 227struct pending_dir_move {
@@ -189,6 +236,18 @@ struct pending_dir_move {
189struct waiting_dir_move { 236struct waiting_dir_move {
190 struct rb_node node; 237 struct rb_node node;
191 u64 ino; 238 u64 ino;
239 /*
 240 * Some directory might not have been removable because it was
 241 * waiting for this directory inode to be moved first. Therefore,
 242 * after this directory is moved, we can try to rmdir the inode rmdir_ino.
243 */
244 u64 rmdir_ino;
245};
246
247struct orphan_dir_info {
248 struct rb_node node;
249 u64 ino;
250 u64 gen;
192}; 251};
193 252
194struct name_cache_entry { 253struct name_cache_entry {
@@ -214,6 +273,11 @@ struct name_cache_entry {
214 273
215static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); 274static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
216 275
276static struct waiting_dir_move *
277get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
278
279static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino);
280
217static int need_send_hole(struct send_ctx *sctx) 281static int need_send_hole(struct send_ctx *sctx)
218{ 282{
219 return (sctx->parent_root && !sctx->cur_inode_new && 283 return (sctx->parent_root && !sctx->cur_inode_new &&
@@ -242,7 +306,6 @@ static struct fs_path *fs_path_alloc(void)
242 if (!p) 306 if (!p)
243 return NULL; 307 return NULL;
244 p->reversed = 0; 308 p->reversed = 0;
245 p->virtual_mem = 0;
246 p->buf = p->inline_buf; 309 p->buf = p->inline_buf;
247 p->buf_len = FS_PATH_INLINE_SIZE; 310 p->buf_len = FS_PATH_INLINE_SIZE;
248 fs_path_reset(p); 311 fs_path_reset(p);
@@ -265,12 +328,8 @@ static void fs_path_free(struct fs_path *p)
265{ 328{
266 if (!p) 329 if (!p)
267 return; 330 return;
268 if (p->buf != p->inline_buf) { 331 if (p->buf != p->inline_buf)
269 if (p->virtual_mem) 332 kfree(p->buf);
270 vfree(p->buf);
271 else
272 kfree(p->buf);
273 }
274 kfree(p); 333 kfree(p);
275} 334}
276 335
@@ -292,40 +351,23 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
292 351
293 path_len = p->end - p->start; 352 path_len = p->end - p->start;
294 old_buf_len = p->buf_len; 353 old_buf_len = p->buf_len;
295 len = PAGE_ALIGN(len); 354
296 355 /*
297 if (p->buf == p->inline_buf) { 356 * First time the inline_buf does not suffice
298 tmp_buf = kmalloc(len, GFP_NOFS | __GFP_NOWARN); 357 */
299 if (!tmp_buf) { 358 if (p->buf == p->inline_buf)
300 tmp_buf = vmalloc(len); 359 tmp_buf = kmalloc(len, GFP_NOFS);
301 if (!tmp_buf) 360 else
302 return -ENOMEM; 361 tmp_buf = krealloc(p->buf, len, GFP_NOFS);
303 p->virtual_mem = 1; 362 if (!tmp_buf)
304 } 363 return -ENOMEM;
305 memcpy(tmp_buf, p->buf, p->buf_len); 364 p->buf = tmp_buf;
306 p->buf = tmp_buf; 365 /*
 307 p->buf_len = len; 366 * The real size of the buffer is bigger; this lets the fast path
 308 } else { 367 * happen most of the time.
309 if (p->virtual_mem) { 368 */
310 tmp_buf = vmalloc(len); 369 p->buf_len = ksize(p->buf);
311 if (!tmp_buf) 370
312 return -ENOMEM;
313 memcpy(tmp_buf, p->buf, p->buf_len);
314 vfree(p->buf);
315 } else {
316 tmp_buf = krealloc(p->buf, len, GFP_NOFS);
317 if (!tmp_buf) {
318 tmp_buf = vmalloc(len);
319 if (!tmp_buf)
320 return -ENOMEM;
321 memcpy(tmp_buf, p->buf, p->buf_len);
322 kfree(p->buf);
323 p->virtual_mem = 1;
324 }
325 }
326 p->buf = tmp_buf;
327 p->buf_len = len;
328 }
329 if (p->reversed) { 371 if (p->reversed) {
330 tmp_buf = p->buf + old_buf_len - path_len - 1; 372 tmp_buf = p->buf + old_buf_len - path_len - 1;
331 p->end = p->buf + p->buf_len - 1; 373 p->end = p->buf + p->buf_len - 1;
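
The krealloc()+ksize() pairing is the core of the simplification: after growing, the code records the allocator's real chunk size so subsequent small appends stay on the fast path. A rough userspace equivalent, assuming glibc's malloc_usable_size() and a heap-only buffer (the kernel code special-cases the first growth out of inline_buf with kmalloc()):

        #include <malloc.h>     /* malloc_usable_size(), glibc */
        #include <stdlib.h>

        static int grow_buf(char **buf, size_t *buf_len, size_t want)
        {
                char *tmp = realloc(*buf, want);

                if (!tmp)
                        return -1;
                *buf = tmp;
                /* The allocator rounds up; claim the whole usable chunk so
                 * later small appends need no further realloc. */
                *buf_len = malloc_usable_size(tmp);
                return 0;
        }
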
@@ -338,7 +380,8 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
338 return 0; 380 return 0;
339} 381}
340 382
341static int fs_path_prepare_for_add(struct fs_path *p, int name_len) 383static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
384 char **prepared)
342{ 385{
343 int ret; 386 int ret;
344 int new_len; 387 int new_len;
@@ -354,11 +397,11 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len)
354 if (p->start != p->end) 397 if (p->start != p->end)
355 *--p->start = '/'; 398 *--p->start = '/';
356 p->start -= name_len; 399 p->start -= name_len;
357 p->prepared = p->start; 400 *prepared = p->start;
358 } else { 401 } else {
359 if (p->start != p->end) 402 if (p->start != p->end)
360 *p->end++ = '/'; 403 *p->end++ = '/';
361 p->prepared = p->end; 404 *prepared = p->end;
362 p->end += name_len; 405 p->end += name_len;
363 *p->end = 0; 406 *p->end = 0;
364 } 407 }
@@ -370,12 +413,12 @@ out:
370static int fs_path_add(struct fs_path *p, const char *name, int name_len) 413static int fs_path_add(struct fs_path *p, const char *name, int name_len)
371{ 414{
372 int ret; 415 int ret;
416 char *prepared;
373 417
374 ret = fs_path_prepare_for_add(p, name_len); 418 ret = fs_path_prepare_for_add(p, name_len, &prepared);
375 if (ret < 0) 419 if (ret < 0)
376 goto out; 420 goto out;
377 memcpy(p->prepared, name, name_len); 421 memcpy(prepared, name, name_len);
378 p->prepared = NULL;
379 422
380out: 423out:
381 return ret; 424 return ret;
@@ -384,12 +427,12 @@ out:
384static int fs_path_add_path(struct fs_path *p, struct fs_path *p2) 427static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
385{ 428{
386 int ret; 429 int ret;
430 char *prepared;
387 431
388 ret = fs_path_prepare_for_add(p, p2->end - p2->start); 432 ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
389 if (ret < 0) 433 if (ret < 0)
390 goto out; 434 goto out;
391 memcpy(p->prepared, p2->start, p2->end - p2->start); 435 memcpy(prepared, p2->start, p2->end - p2->start);
392 p->prepared = NULL;
393 436
394out: 437out:
395 return ret; 438 return ret;
@@ -400,13 +443,13 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p,
400 unsigned long off, int len) 443 unsigned long off, int len)
401{ 444{
402 int ret; 445 int ret;
446 char *prepared;
403 447
404 ret = fs_path_prepare_for_add(p, len); 448 ret = fs_path_prepare_for_add(p, len, &prepared);
405 if (ret < 0) 449 if (ret < 0)
406 goto out; 450 goto out;
407 451
408 read_extent_buffer(eb, p->prepared, off, len); 452 read_extent_buffer(eb, prepared, off, len);
409 p->prepared = NULL;
410 453
411out: 454out:
412 return ret; 455 return ret;
@@ -450,6 +493,7 @@ static struct btrfs_path *alloc_path_for_send(void)
450 return NULL; 493 return NULL;
451 path->search_commit_root = 1; 494 path->search_commit_root = 1;
452 path->skip_locking = 1; 495 path->skip_locking = 1;
496 path->need_commit_sem = 1;
453 return path; 497 return path;
454} 498}
455 499
@@ -728,29 +772,22 @@ out:
728/* 772/*
729 * Helper function to retrieve some fields from an inode item. 773 * Helper function to retrieve some fields from an inode item.
730 */ 774 */
731static int get_inode_info(struct btrfs_root *root, 775static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
732 u64 ino, u64 *size, u64 *gen, 776 u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid,
733 u64 *mode, u64 *uid, u64 *gid, 777 u64 *gid, u64 *rdev)
734 u64 *rdev)
735{ 778{
736 int ret; 779 int ret;
737 struct btrfs_inode_item *ii; 780 struct btrfs_inode_item *ii;
738 struct btrfs_key key; 781 struct btrfs_key key;
739 struct btrfs_path *path;
740
741 path = alloc_path_for_send();
742 if (!path)
743 return -ENOMEM;
744 782
745 key.objectid = ino; 783 key.objectid = ino;
746 key.type = BTRFS_INODE_ITEM_KEY; 784 key.type = BTRFS_INODE_ITEM_KEY;
747 key.offset = 0; 785 key.offset = 0;
748 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 786 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
749 if (ret < 0)
750 goto out;
751 if (ret) { 787 if (ret) {
752 ret = -ENOENT; 788 if (ret > 0)
753 goto out; 789 ret = -ENOENT;
790 return ret;
754 } 791 }
755 792
756 ii = btrfs_item_ptr(path->nodes[0], path->slots[0], 793 ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -768,7 +805,22 @@ static int get_inode_info(struct btrfs_root *root,
768 if (rdev) 805 if (rdev)
769 *rdev = btrfs_inode_rdev(path->nodes[0], ii); 806 *rdev = btrfs_inode_rdev(path->nodes[0], ii);
770 807
771out: 808 return ret;
809}
810
811static int get_inode_info(struct btrfs_root *root,
812 u64 ino, u64 *size, u64 *gen,
813 u64 *mode, u64 *uid, u64 *gid,
814 u64 *rdev)
815{
816 struct btrfs_path *path;
817 int ret;
818
819 path = alloc_path_for_send();
820 if (!path)
821 return -ENOMEM;
822 ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid,
823 rdev);
772 btrfs_free_path(path); 824 btrfs_free_path(path);
773 return ret; 825 return ret;
774} 826}
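
Splitting get_inode_info() into a path-taking core plus an allocating wrapper is what lets __iterate_backrefs() below reuse one btrfs_path across many callbacks instead of allocating per call. The shape of that refactor, as a hedged userspace sketch (ctx stands in for struct btrfs_path; all names hypothetical):

        #include <stdlib.h>

        struct ctx { int reserved; };   /* stand-in for struct btrfs_path */

        static int lookup_core(struct ctx *c, unsigned long long ino, int *out)
        {
                /* search with the caller's ctx and leave it reusable */
                (void)c;
                *out = (int)(ino & 1);
                return 0;
        }

        static int lookup(unsigned long long ino, int *out)
        {
                struct ctx *c = calloc(1, sizeof(*c));
                int ret;

                if (!c)
                        return -1;
                ret = lookup_core(c, ino, out); /* wrapper pays the allocation */
                free(c);
                return ret;
        }
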
@@ -915,9 +967,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
915 struct btrfs_dir_item *di; 967 struct btrfs_dir_item *di;
916 struct btrfs_key di_key; 968 struct btrfs_key di_key;
917 char *buf = NULL; 969 char *buf = NULL;
918 char *buf2 = NULL; 970 const int buf_len = PATH_MAX;
919 int buf_len;
920 int buf_virtual = 0;
921 u32 name_len; 971 u32 name_len;
922 u32 data_len; 972 u32 data_len;
923 u32 cur; 973 u32 cur;
@@ -927,7 +977,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
927 int num; 977 int num;
928 u8 type; 978 u8 type;
929 979
930 buf_len = PAGE_SIZE;
931 buf = kmalloc(buf_len, GFP_NOFS); 980 buf = kmalloc(buf_len, GFP_NOFS);
932 if (!buf) { 981 if (!buf) {
933 ret = -ENOMEM; 982 ret = -ENOMEM;
@@ -949,30 +998,12 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
949 type = btrfs_dir_type(eb, di); 998 type = btrfs_dir_type(eb, di);
950 btrfs_dir_item_key_to_cpu(eb, di, &di_key); 999 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
951 1000
1001 /*
1002 * Path too long
1003 */
952 if (name_len + data_len > buf_len) { 1004 if (name_len + data_len > buf_len) {
953 buf_len = PAGE_ALIGN(name_len + data_len); 1005 ret = -ENAMETOOLONG;
954 if (buf_virtual) { 1006 goto out;
955 buf2 = vmalloc(buf_len);
956 if (!buf2) {
957 ret = -ENOMEM;
958 goto out;
959 }
960 vfree(buf);
961 } else {
962 buf2 = krealloc(buf, buf_len, GFP_NOFS);
963 if (!buf2) {
964 buf2 = vmalloc(buf_len);
965 if (!buf2) {
966 ret = -ENOMEM;
967 goto out;
968 }
969 kfree(buf);
970 buf_virtual = 1;
971 }
972 }
973
974 buf = buf2;
975 buf2 = NULL;
976 } 1007 }
977 1008
978 read_extent_buffer(eb, buf, (unsigned long)(di + 1), 1009 read_extent_buffer(eb, buf, (unsigned long)(di + 1),
@@ -995,10 +1026,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
995 } 1026 }
996 1027
997out: 1028out:
998 if (buf_virtual) 1029 kfree(buf);
999 vfree(buf);
1000 else
1001 kfree(buf);
1002 return ret; 1030 return ret;
1003} 1031}
1004 1032
@@ -1066,6 +1094,7 @@ out:
1066struct backref_ctx { 1094struct backref_ctx {
1067 struct send_ctx *sctx; 1095 struct send_ctx *sctx;
1068 1096
1097 struct btrfs_path *path;
1069 /* number of total found references */ 1098 /* number of total found references */
1070 u64 found; 1099 u64 found;
1071 1100
@@ -1136,8 +1165,9 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1136 * There are inodes that have extents that lie behind its i_size. Don't 1165 * There are inodes that have extents that lie behind its i_size. Don't
1137 * accept clones from these extents. 1166 * accept clones from these extents.
1138 */ 1167 */
1139 ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL, 1168 ret = __get_inode_info(found->root, bctx->path, ino, &i_size, NULL, NULL,
1140 NULL); 1169 NULL, NULL, NULL);
1170 btrfs_release_path(bctx->path);
1141 if (ret < 0) 1171 if (ret < 0)
1142 return ret; 1172 return ret;
1143 1173
@@ -1216,12 +1246,17 @@ static int find_extent_clone(struct send_ctx *sctx,
1216 if (!tmp_path) 1246 if (!tmp_path)
1217 return -ENOMEM; 1247 return -ENOMEM;
1218 1248
1249 /* We only use this path under the commit sem */
1250 tmp_path->need_commit_sem = 0;
1251
1219 backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS); 1252 backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS);
1220 if (!backref_ctx) { 1253 if (!backref_ctx) {
1221 ret = -ENOMEM; 1254 ret = -ENOMEM;
1222 goto out; 1255 goto out;
1223 } 1256 }
1224 1257
1258 backref_ctx->path = tmp_path;
1259
1225 if (data_offset >= ino_size) { 1260 if (data_offset >= ino_size) {
1226 /* 1261 /*
1227 * There may be extents that lie behind the file's size. 1262 * There may be extents that lie behind the file's size.
@@ -1249,8 +1284,10 @@ static int find_extent_clone(struct send_ctx *sctx,
1249 } 1284 }
1250 logical = disk_byte + btrfs_file_extent_offset(eb, fi); 1285 logical = disk_byte + btrfs_file_extent_offset(eb, fi);
1251 1286
1287 down_read(&sctx->send_root->fs_info->commit_root_sem);
1252 ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path, 1288 ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path,
1253 &found_key, &flags); 1289 &found_key, &flags);
1290 up_read(&sctx->send_root->fs_info->commit_root_sem);
1254 btrfs_release_path(tmp_path); 1291 btrfs_release_path(tmp_path);
1255 1292
1256 if (ret < 0) 1293 if (ret < 0)
@@ -1292,8 +1329,6 @@ static int find_extent_clone(struct send_ctx *sctx,
1292 extent_item_pos = logical - found_key.objectid; 1329 extent_item_pos = logical - found_key.objectid;
1293 else 1330 else
1294 extent_item_pos = 0; 1331 extent_item_pos = 0;
1295
1296 extent_item_pos = logical - found_key.objectid;
1297 ret = iterate_extent_inodes(sctx->send_root->fs_info, 1332 ret = iterate_extent_inodes(sctx->send_root->fs_info,
1298 found_key.objectid, extent_item_pos, 1, 1333 found_key.objectid, extent_item_pos, 1,
1299 __iterate_backrefs, backref_ctx); 1334 __iterate_backrefs, backref_ctx);
@@ -1418,11 +1453,7 @@ static int gen_unique_name(struct send_ctx *sctx,
1418 while (1) { 1453 while (1) {
1419 len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", 1454 len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
1420 ino, gen, idx); 1455 ino, gen, idx);
1421 if (len >= sizeof(tmp)) { 1456 ASSERT(len < sizeof(tmp));
1422 /* should really not happen */
1423 ret = -EOVERFLOW;
1424 goto out;
1425 }
1426 1457
1427 di = btrfs_lookup_dir_item(NULL, sctx->send_root, 1458 di = btrfs_lookup_dir_item(NULL, sctx->send_root,
1428 path, BTRFS_FIRST_FREE_OBJECTID, 1459 path, BTRFS_FIRST_FREE_OBJECTID,
@@ -1898,13 +1929,20 @@ static void name_cache_delete(struct send_ctx *sctx,
1898 1929
1899 nce_head = radix_tree_lookup(&sctx->name_cache, 1930 nce_head = radix_tree_lookup(&sctx->name_cache,
1900 (unsigned long)nce->ino); 1931 (unsigned long)nce->ino);
1901 BUG_ON(!nce_head); 1932 if (!nce_head) {
1933 btrfs_err(sctx->send_root->fs_info,
1934 "name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
1935 nce->ino, sctx->name_cache_size);
1936 }
1902 1937
1903 list_del(&nce->radix_list); 1938 list_del(&nce->radix_list);
1904 list_del(&nce->list); 1939 list_del(&nce->list);
1905 sctx->name_cache_size--; 1940 sctx->name_cache_size--;
1906 1941
1907 if (list_empty(nce_head)) { 1942 /*
1943 * We may not get to the final release of nce_head if the lookup fails
1944 */
1945 if (nce_head && list_empty(nce_head)) {
1908 radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); 1946 radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
1909 kfree(nce_head); 1947 kfree(nce_head);
1910 } 1948 }
@@ -1977,7 +2015,6 @@ static void name_cache_free(struct send_ctx *sctx)
1977 */ 2015 */
1978static int __get_cur_name_and_parent(struct send_ctx *sctx, 2016static int __get_cur_name_and_parent(struct send_ctx *sctx,
1979 u64 ino, u64 gen, 2017 u64 ino, u64 gen,
1980 int skip_name_cache,
1981 u64 *parent_ino, 2018 u64 *parent_ino,
1982 u64 *parent_gen, 2019 u64 *parent_gen,
1983 struct fs_path *dest) 2020 struct fs_path *dest)
@@ -1987,8 +2024,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1987 struct btrfs_path *path = NULL; 2024 struct btrfs_path *path = NULL;
1988 struct name_cache_entry *nce = NULL; 2025 struct name_cache_entry *nce = NULL;
1989 2026
1990 if (skip_name_cache)
1991 goto get_ref;
1992 /* 2027 /*
1993 * First check if we already did a call to this function with the same 2028 * First check if we already did a call to this function with the same
1994 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes 2029 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
@@ -2033,12 +2068,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
2033 goto out_cache; 2068 goto out_cache;
2034 } 2069 }
2035 2070
2036get_ref:
2037 /* 2071 /*
2038 * Depending on whether the inode was already processed or not, use 2072 * Depending on whether the inode was already processed or not, use
2039 * send_root or parent_root for ref lookup. 2073 * send_root or parent_root for ref lookup.
2040 */ 2074 */
2041 if (ino < sctx->send_progress && !skip_name_cache) 2075 if (ino < sctx->send_progress)
2042 ret = get_first_ref(sctx->send_root, ino, 2076 ret = get_first_ref(sctx->send_root, ino,
2043 parent_ino, parent_gen, dest); 2077 parent_ino, parent_gen, dest);
2044 else 2078 else
@@ -2062,8 +2096,6 @@ get_ref:
2062 goto out; 2096 goto out;
2063 ret = 1; 2097 ret = 1;
2064 } 2098 }
2065 if (skip_name_cache)
2066 goto out;
2067 2099
2068out_cache: 2100out_cache:
2069 /* 2101 /*
@@ -2131,9 +2163,6 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2131 u64 parent_inode = 0; 2163 u64 parent_inode = 0;
2132 u64 parent_gen = 0; 2164 u64 parent_gen = 0;
2133 int stop = 0; 2165 int stop = 0;
2134 u64 start_ino = ino;
2135 u64 start_gen = gen;
2136 int skip_name_cache = 0;
2137 2166
2138 name = fs_path_alloc(); 2167 name = fs_path_alloc();
2139 if (!name) { 2168 if (!name) {
@@ -2141,31 +2170,33 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2141 goto out; 2170 goto out;
2142 } 2171 }
2143 2172
2144 if (is_waiting_for_move(sctx, ino))
2145 skip_name_cache = 1;
2146
2147again:
2148 dest->reversed = 1; 2173 dest->reversed = 1;
2149 fs_path_reset(dest); 2174 fs_path_reset(dest);
2150 2175
2151 while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { 2176 while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
2152 fs_path_reset(name); 2177 fs_path_reset(name);
2153 2178
2154 ret = __get_cur_name_and_parent(sctx, ino, gen, skip_name_cache, 2179 if (is_waiting_for_rm(sctx, ino)) {
2155 &parent_inode, &parent_gen, name); 2180 ret = gen_unique_name(sctx, ino, gen, name);
2181 if (ret < 0)
2182 goto out;
2183 ret = fs_path_add_path(dest, name);
2184 break;
2185 }
2186
2187 if (is_waiting_for_move(sctx, ino)) {
2188 ret = get_first_ref(sctx->parent_root, ino,
2189 &parent_inode, &parent_gen, name);
2190 } else {
2191 ret = __get_cur_name_and_parent(sctx, ino, gen,
2192 &parent_inode,
2193 &parent_gen, name);
2194 if (ret)
2195 stop = 1;
2196 }
2197
2156 if (ret < 0) 2198 if (ret < 0)
2157 goto out; 2199 goto out;
2158 if (ret)
2159 stop = 1;
2160
2161 if (!skip_name_cache &&
2162 is_waiting_for_move(sctx, parent_inode)) {
2163 ino = start_ino;
2164 gen = start_gen;
2165 stop = 0;
2166 skip_name_cache = 1;
2167 goto again;
2168 }
2169 2200
2170 ret = fs_path_add_path(dest, name); 2201 ret = fs_path_add_path(dest, name);
2171 if (ret < 0) 2202 if (ret < 0)
@@ -2429,10 +2460,16 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
2429 if (!p) 2460 if (!p)
2430 return -ENOMEM; 2461 return -ENOMEM;
2431 2462
2432 ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL, 2463 if (ino != sctx->cur_ino) {
2433 NULL, &rdev); 2464 ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
2434 if (ret < 0) 2465 NULL, NULL, &rdev);
2435 goto out; 2466 if (ret < 0)
2467 goto out;
2468 } else {
2469 gen = sctx->cur_inode_gen;
2470 mode = sctx->cur_inode_mode;
2471 rdev = sctx->cur_inode_rdev;
2472 }
2436 2473
2437 if (S_ISREG(mode)) { 2474 if (S_ISREG(mode)) {
2438 cmd = BTRFS_SEND_C_MKFILE; 2475 cmd = BTRFS_SEND_C_MKFILE;
@@ -2512,17 +2549,26 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
2512 key.objectid = dir; 2549 key.objectid = dir;
2513 key.type = BTRFS_DIR_INDEX_KEY; 2550 key.type = BTRFS_DIR_INDEX_KEY;
2514 key.offset = 0; 2551 key.offset = 0;
2552 ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
2553 if (ret < 0)
2554 goto out;
2555
2515 while (1) { 2556 while (1) {
2516 ret = btrfs_search_slot_for_read(sctx->send_root, &key, path, 2557 eb = path->nodes[0];
2517 1, 0); 2558 slot = path->slots[0];
2518 if (ret < 0) 2559 if (slot >= btrfs_header_nritems(eb)) {
2519 goto out; 2560 ret = btrfs_next_leaf(sctx->send_root, path);
2520 if (!ret) { 2561 if (ret < 0) {
2521 eb = path->nodes[0]; 2562 goto out;
2522 slot = path->slots[0]; 2563 } else if (ret > 0) {
2523 btrfs_item_key_to_cpu(eb, &found_key, slot); 2564 ret = 0;
2565 break;
2566 }
2567 continue;
2524 } 2568 }
2525 if (ret || found_key.objectid != key.objectid || 2569
2570 btrfs_item_key_to_cpu(eb, &found_key, slot);
2571 if (found_key.objectid != key.objectid ||
2526 found_key.type != key.type) { 2572 found_key.type != key.type) {
2527 ret = 0; 2573 ret = 0;
2528 goto out; 2574 goto out;
@@ -2537,8 +2583,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
2537 goto out; 2583 goto out;
2538 } 2584 }
2539 2585
2540 key.offset = found_key.offset + 1; 2586 path->slots[0]++;
2541 btrfs_release_path(path);
2542 } 2587 }
2543 2588
2544out: 2589out:
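
This hunk, and the matching rewrites of can_rmdir(), process_all_refs() and process_all_new_xattrs() further down, all converge on the same iteration idiom: one btrfs_search_slot(), then slot increments with btrfs_next_leaf() hops, instead of a fresh tree search per item. Condensed (a sketch of the pattern assembled from the calls shown in these hunks, not a standalone function):

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        while (1) {
                eb = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(eb)) {
                        ret = btrfs_next_leaf(root, path); /* hop to next leaf */
                        if (ret < 0)
                                goto out;
                        if (ret > 0)
                                break;             /* walked off the tree */
                        continue;
                }
                btrfs_item_key_to_cpu(eb, &found_key, slot);
                if (found_key.objectid != key.objectid ||
                    found_key.type != key.type)
                        break;                     /* past our key range */
                /* ... process the item at (eb, slot) ... */
                path->slots[0]++;
        }
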
@@ -2590,7 +2635,7 @@ struct recorded_ref {
2590 * everything mixed. So we first record all refs and later process them. 2635 * everything mixed. So we first record all refs and later process them.
2591 * This function is a helper to record one ref. 2636 * This function is a helper to record one ref.
2592 */ 2637 */
2593static int record_ref(struct list_head *head, u64 dir, 2638static int __record_ref(struct list_head *head, u64 dir,
2594 u64 dir_gen, struct fs_path *path) 2639 u64 dir_gen, struct fs_path *path)
2595{ 2640{
2596 struct recorded_ref *ref; 2641 struct recorded_ref *ref;
@@ -2676,12 +2721,78 @@ out:
2676 return ret; 2721 return ret;
2677} 2722}
2678 2723
2724static struct orphan_dir_info *
2725add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
2726{
2727 struct rb_node **p = &sctx->orphan_dirs.rb_node;
2728 struct rb_node *parent = NULL;
2729 struct orphan_dir_info *entry, *odi;
2730
2731 odi = kmalloc(sizeof(*odi), GFP_NOFS);
2732 if (!odi)
2733 return ERR_PTR(-ENOMEM);
2734 odi->ino = dir_ino;
2735 odi->gen = 0;
2736
2737 while (*p) {
2738 parent = *p;
2739 entry = rb_entry(parent, struct orphan_dir_info, node);
2740 if (dir_ino < entry->ino) {
2741 p = &(*p)->rb_left;
2742 } else if (dir_ino > entry->ino) {
2743 p = &(*p)->rb_right;
2744 } else {
2745 kfree(odi);
2746 return entry;
2747 }
2748 }
2749
2750 rb_link_node(&odi->node, parent, p);
2751 rb_insert_color(&odi->node, &sctx->orphan_dirs);
2752 return odi;
2753}
2754
2755static struct orphan_dir_info *
2756get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
2757{
2758 struct rb_node *n = sctx->orphan_dirs.rb_node;
2759 struct orphan_dir_info *entry;
2760
2761 while (n) {
2762 entry = rb_entry(n, struct orphan_dir_info, node);
2763 if (dir_ino < entry->ino)
2764 n = n->rb_left;
2765 else if (dir_ino > entry->ino)
2766 n = n->rb_right;
2767 else
2768 return entry;
2769 }
2770 return NULL;
2771}
2772
2773static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino)
2774{
2775 struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino);
2776
2777 return odi != NULL;
2778}
2779
2780static void free_orphan_dir_info(struct send_ctx *sctx,
2781 struct orphan_dir_info *odi)
2782{
2783 if (!odi)
2784 return;
2785 rb_erase(&odi->node, &sctx->orphan_dirs);
2786 kfree(odi);
2787}
2788
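Note the collision branch in add_orphan_dir_info(): on an existing key it frees the freshly allocated node and returns the one already in the tree, so the helper doubles as get-or-create. A usage sketch under that assumption (hypothetical caller):

        odi = add_orphan_dir_info(sctx, dir_ino);
        if (IS_ERR(odi))
                return PTR_ERR(odi);
        /* Either a fresh node or the existing one; recording a second
         * blocked child of the same directory is therefore safe. */
        odi->gen = dir_gen;
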
2679/* 2789/*
2680 * Returns 1 if a directory can be removed at this point in time. 2790 * Returns 1 if a directory can be removed at this point in time.
2681 * We check this by iterating all dir items and checking if the inode behind 2791 * We check this by iterating all dir items and checking if the inode behind
2682 * the dir item was already processed. 2792 * the dir item was already processed.
2683 */ 2793 */
2684static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress) 2794static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
2795 u64 send_progress)
2685{ 2796{
2686 int ret = 0; 2797 int ret = 0;
2687 struct btrfs_root *root = sctx->parent_root; 2798 struct btrfs_root *root = sctx->parent_root;
@@ -2704,31 +2815,52 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
2704 key.objectid = dir; 2815 key.objectid = dir;
2705 key.type = BTRFS_DIR_INDEX_KEY; 2816 key.type = BTRFS_DIR_INDEX_KEY;
2706 key.offset = 0; 2817 key.offset = 0;
2818 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2819 if (ret < 0)
2820 goto out;
2707 2821
2708 while (1) { 2822 while (1) {
2709 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 2823 struct waiting_dir_move *dm;
2710 if (ret < 0) 2824
2711 goto out; 2825 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2712 if (!ret) { 2826 ret = btrfs_next_leaf(root, path);
2713 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2827 if (ret < 0)
2714 path->slots[0]); 2828 goto out;
2829 else if (ret > 0)
2830 break;
2831 continue;
2715 } 2832 }
2716 if (ret || found_key.objectid != key.objectid || 2833 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2717 found_key.type != key.type) { 2834 path->slots[0]);
2835 if (found_key.objectid != key.objectid ||
2836 found_key.type != key.type)
2718 break; 2837 break;
2719 }
2720 2838
2721 di = btrfs_item_ptr(path->nodes[0], path->slots[0], 2839 di = btrfs_item_ptr(path->nodes[0], path->slots[0],
2722 struct btrfs_dir_item); 2840 struct btrfs_dir_item);
2723 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); 2841 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
2724 2842
2843 dm = get_waiting_dir_move(sctx, loc.objectid);
2844 if (dm) {
2845 struct orphan_dir_info *odi;
2846
2847 odi = add_orphan_dir_info(sctx, dir);
2848 if (IS_ERR(odi)) {
2849 ret = PTR_ERR(odi);
2850 goto out;
2851 }
2852 odi->gen = dir_gen;
2853 dm->rmdir_ino = dir;
2854 ret = 0;
2855 goto out;
2856 }
2857
2725 if (loc.objectid > send_progress) { 2858 if (loc.objectid > send_progress) {
2726 ret = 0; 2859 ret = 0;
2727 goto out; 2860 goto out;
2728 } 2861 }
2729 2862
2730 btrfs_release_path(path); 2863 path->slots[0]++;
2731 key.offset = found_key.offset + 1;
2732 } 2864 }
2733 2865
2734 ret = 1; 2866 ret = 1;
@@ -2740,19 +2872,9 @@ out:
2740 2872
2741static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) 2873static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
2742{ 2874{
2743 struct rb_node *n = sctx->waiting_dir_moves.rb_node; 2875 struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);
2744 struct waiting_dir_move *entry;
2745 2876
2746 while (n) { 2877 return entry != NULL;
2747 entry = rb_entry(n, struct waiting_dir_move, node);
2748 if (ino < entry->ino)
2749 n = n->rb_left;
2750 else if (ino > entry->ino)
2751 n = n->rb_right;
2752 else
2753 return 1;
2754 }
2755 return 0;
2756} 2878}
2757 2879
2758static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) 2880static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
@@ -2765,6 +2887,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
2765 if (!dm) 2887 if (!dm)
2766 return -ENOMEM; 2888 return -ENOMEM;
2767 dm->ino = ino; 2889 dm->ino = ino;
2890 dm->rmdir_ino = 0;
2768 2891
2769 while (*p) { 2892 while (*p) {
2770 parent = *p; 2893 parent = *p;
@@ -2784,31 +2907,41 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
2784 return 0; 2907 return 0;
2785} 2908}
2786 2909
2787static int del_waiting_dir_move(struct send_ctx *sctx, u64 ino) 2910static struct waiting_dir_move *
2911get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
2788{ 2912{
2789 struct rb_node *n = sctx->waiting_dir_moves.rb_node; 2913 struct rb_node *n = sctx->waiting_dir_moves.rb_node;
2790 struct waiting_dir_move *entry; 2914 struct waiting_dir_move *entry;
2791 2915
2792 while (n) { 2916 while (n) {
2793 entry = rb_entry(n, struct waiting_dir_move, node); 2917 entry = rb_entry(n, struct waiting_dir_move, node);
2794 if (ino < entry->ino) { 2918 if (ino < entry->ino)
2795 n = n->rb_left; 2919 n = n->rb_left;
2796 } else if (ino > entry->ino) { 2920 else if (ino > entry->ino)
2797 n = n->rb_right; 2921 n = n->rb_right;
2798 } else { 2922 else
2799 rb_erase(&entry->node, &sctx->waiting_dir_moves); 2923 return entry;
2800 kfree(entry);
2801 return 0;
2802 }
2803 } 2924 }
2804 return -ENOENT; 2925 return NULL;
2805} 2926}
2806 2927
2807static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino) 2928static void free_waiting_dir_move(struct send_ctx *sctx,
2929 struct waiting_dir_move *dm)
2930{
2931 if (!dm)
2932 return;
2933 rb_erase(&dm->node, &sctx->waiting_dir_moves);
2934 kfree(dm);
2935}
2936
2937static int add_pending_dir_move(struct send_ctx *sctx,
2938 u64 ino,
2939 u64 ino_gen,
2940 u64 parent_ino)
2808{ 2941{
2809 struct rb_node **p = &sctx->pending_dir_moves.rb_node; 2942 struct rb_node **p = &sctx->pending_dir_moves.rb_node;
2810 struct rb_node *parent = NULL; 2943 struct rb_node *parent = NULL;
2811 struct pending_dir_move *entry, *pm; 2944 struct pending_dir_move *entry = NULL, *pm;
2812 struct recorded_ref *cur; 2945 struct recorded_ref *cur;
2813 int exists = 0; 2946 int exists = 0;
2814 int ret; 2947 int ret;
@@ -2817,8 +2950,8 @@ static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino)
2817 if (!pm) 2950 if (!pm)
2818 return -ENOMEM; 2951 return -ENOMEM;
2819 pm->parent_ino = parent_ino; 2952 pm->parent_ino = parent_ino;
2820 pm->ino = sctx->cur_ino; 2953 pm->ino = ino;
2821 pm->gen = sctx->cur_inode_gen; 2954 pm->gen = ino_gen;
2822 INIT_LIST_HEAD(&pm->list); 2955 INIT_LIST_HEAD(&pm->list);
2823 INIT_LIST_HEAD(&pm->update_refs); 2956 INIT_LIST_HEAD(&pm->update_refs);
2824 RB_CLEAR_NODE(&pm->node); 2957 RB_CLEAR_NODE(&pm->node);
@@ -2888,19 +3021,52 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2888{ 3021{
2889 struct fs_path *from_path = NULL; 3022 struct fs_path *from_path = NULL;
2890 struct fs_path *to_path = NULL; 3023 struct fs_path *to_path = NULL;
3024 struct fs_path *name = NULL;
2891 u64 orig_progress = sctx->send_progress; 3025 u64 orig_progress = sctx->send_progress;
2892 struct recorded_ref *cur; 3026 struct recorded_ref *cur;
3027 u64 parent_ino, parent_gen;
3028 struct waiting_dir_move *dm = NULL;
3029 u64 rmdir_ino = 0;
2893 int ret; 3030 int ret;
2894 3031
3032 name = fs_path_alloc();
2895 from_path = fs_path_alloc(); 3033 from_path = fs_path_alloc();
2896 if (!from_path) 3034 if (!name || !from_path) {
2897 return -ENOMEM; 3035 ret = -ENOMEM;
3036 goto out;
3037 }
3038
3039 dm = get_waiting_dir_move(sctx, pm->ino);
3040 ASSERT(dm);
3041 rmdir_ino = dm->rmdir_ino;
3042 free_waiting_dir_move(sctx, dm);
2898 3043
2899 sctx->send_progress = pm->ino; 3044 ret = get_first_ref(sctx->parent_root, pm->ino,
2900 ret = get_cur_path(sctx, pm->ino, pm->gen, from_path); 3045 &parent_ino, &parent_gen, name);
2901 if (ret < 0) 3046 if (ret < 0)
2902 goto out; 3047 goto out;
2903 3048
3049 if (parent_ino == sctx->cur_ino) {
3050 /* child only renamed, not moved */
3051 ASSERT(parent_gen == sctx->cur_inode_gen);
3052 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
3053 from_path);
3054 if (ret < 0)
3055 goto out;
3056 ret = fs_path_add_path(from_path, name);
3057 if (ret < 0)
3058 goto out;
3059 } else {
3060 /* child moved and maybe renamed too */
3061 sctx->send_progress = pm->ino;
3062 ret = get_cur_path(sctx, pm->ino, pm->gen, from_path);
3063 if (ret < 0)
3064 goto out;
3065 }
3066
3067 fs_path_free(name);
3068 name = NULL;
3069
2904 to_path = fs_path_alloc(); 3070 to_path = fs_path_alloc();
2905 if (!to_path) { 3071 if (!to_path) {
2906 ret = -ENOMEM; 3072 ret = -ENOMEM;
@@ -2908,9 +3074,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2908 } 3074 }
2909 3075
2910 sctx->send_progress = sctx->cur_ino + 1; 3076 sctx->send_progress = sctx->cur_ino + 1;
2911 ret = del_waiting_dir_move(sctx, pm->ino);
2912 ASSERT(ret == 0);
2913
2914 ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); 3077 ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
2915 if (ret < 0) 3078 if (ret < 0)
2916 goto out; 3079 goto out;
@@ -2919,6 +3082,35 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2919 if (ret < 0) 3082 if (ret < 0)
2920 goto out; 3083 goto out;
2921 3084
3085 if (rmdir_ino) {
3086 struct orphan_dir_info *odi;
3087
3088 odi = get_orphan_dir_info(sctx, rmdir_ino);
3089 if (!odi) {
3090 /* already deleted */
3091 goto finish;
3092 }
3093 ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1);
3094 if (ret < 0)
3095 goto out;
3096 if (!ret)
3097 goto finish;
3098
3099 name = fs_path_alloc();
3100 if (!name) {
3101 ret = -ENOMEM;
3102 goto out;
3103 }
3104 ret = get_cur_path(sctx, rmdir_ino, odi->gen, name);
3105 if (ret < 0)
3106 goto out;
3107 ret = send_rmdir(sctx, name);
3108 if (ret < 0)
3109 goto out;
3110 free_orphan_dir_info(sctx, odi);
3111 }
3112
3113finish:
2922 ret = send_utimes(sctx, pm->ino, pm->gen); 3114 ret = send_utimes(sctx, pm->ino, pm->gen);
2923 if (ret < 0) 3115 if (ret < 0)
2924 goto out; 3116 goto out;
@@ -2928,12 +3120,15 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2928 * and old parent(s). 3120 * and old parent(s).
2929 */ 3121 */
2930 list_for_each_entry(cur, &pm->update_refs, list) { 3122 list_for_each_entry(cur, &pm->update_refs, list) {
3123 if (cur->dir == rmdir_ino)
3124 continue;
2931 ret = send_utimes(sctx, cur->dir, cur->dir_gen); 3125 ret = send_utimes(sctx, cur->dir, cur->dir_gen);
2932 if (ret < 0) 3126 if (ret < 0)
2933 goto out; 3127 goto out;
2934 } 3128 }
2935 3129
2936out: 3130out:
3131 fs_path_free(name);
2937 fs_path_free(from_path); 3132 fs_path_free(from_path);
2938 fs_path_free(to_path); 3133 fs_path_free(to_path);
2939 sctx->send_progress = orig_progress; 3134 sctx->send_progress = orig_progress;
@@ -3005,17 +3200,19 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3005 int ret; 3200 int ret;
3006 u64 ino = parent_ref->dir; 3201 u64 ino = parent_ref->dir;
3007 u64 parent_ino_before, parent_ino_after; 3202 u64 parent_ino_before, parent_ino_after;
3008 u64 new_gen, old_gen; 3203 u64 old_gen;
3009 struct fs_path *path_before = NULL; 3204 struct fs_path *path_before = NULL;
3010 struct fs_path *path_after = NULL; 3205 struct fs_path *path_after = NULL;
3011 int len1, len2; 3206 int len1, len2;
3012 3207 int register_upper_dirs;
3013 if (parent_ref->dir <= sctx->cur_ino) 3208 u64 gen;
3014 return 0;
3015 3209
3016 if (is_waiting_for_move(sctx, ino)) 3210 if (is_waiting_for_move(sctx, ino))
3017 return 1; 3211 return 1;
3018 3212
3213 if (parent_ref->dir <= sctx->cur_ino)
3214 return 0;
3215
3019 ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen, 3216 ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen,
3020 NULL, NULL, NULL, NULL); 3217 NULL, NULL, NULL, NULL);
3021 if (ret == -ENOENT) 3218 if (ret == -ENOENT)
@@ -3023,12 +3220,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3023 else if (ret < 0) 3220 else if (ret < 0)
3024 return ret; 3221 return ret;
3025 3222
3026 ret = get_inode_info(sctx->send_root, ino, NULL, &new_gen, 3223 if (parent_ref->dir_gen != old_gen)
3027 NULL, NULL, NULL, NULL);
3028 if (ret < 0)
3029 return ret;
3030
3031 if (new_gen != old_gen)
3032 return 0; 3224 return 0;
3033 3225
3034 path_before = fs_path_alloc(); 3226 path_before = fs_path_alloc();
@@ -3051,7 +3243,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3051 } 3243 }
3052 3244
3053 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, 3245 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
3054 NULL, path_after); 3246 &gen, path_after);
3055 if (ret == -ENOENT) { 3247 if (ret == -ENOENT) {
3056 ret = 0; 3248 ret = 0;
3057 goto out; 3249 goto out;
@@ -3061,13 +3253,67 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3061 3253
3062 len1 = fs_path_len(path_before); 3254 len1 = fs_path_len(path_before);
3063 len2 = fs_path_len(path_after); 3255 len2 = fs_path_len(path_after);
3064 if ((parent_ino_before != parent_ino_after) && (len1 != len2 || 3256 if (parent_ino_before != parent_ino_after || len1 != len2 ||
3065 memcmp(path_before->start, path_after->start, len1))) { 3257 memcmp(path_before->start, path_after->start, len1)) {
3066 ret = 1; 3258 ret = 1;
3067 goto out; 3259 goto out;
3068 } 3260 }
3069 ret = 0; 3261 ret = 0;
3070 3262
3263 /*
3264 * Ok, our new most direct ancestor has a higher inode number but
3265 * wasn't moved/renamed. So maybe some of the new ancestors higher in
3266 * the hierarchy have an higher inode number too *and* were renamed
3267 * or moved - in this case we need to wait for the ancestor's rename
3268 * or move operation before we can do the move/rename for the current
3269 * inode.
3270 */
3271 register_upper_dirs = 0;
3272 ino = parent_ino_after;
3273again:
3274 while ((ret == 0 || register_upper_dirs) && ino > sctx->cur_ino) {
3275 u64 parent_gen;
3276
3277 fs_path_reset(path_before);
3278 fs_path_reset(path_after);
3279
3280 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
3281 &parent_gen, path_after);
3282 if (ret < 0)
3283 goto out;
3284 ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
3285 NULL, path_before);
3286 if (ret == -ENOENT) {
3287 ret = 0;
3288 break;
3289 } else if (ret < 0) {
3290 goto out;
3291 }
3292
3293 len1 = fs_path_len(path_before);
3294 len2 = fs_path_len(path_after);
3295 if (parent_ino_before != parent_ino_after || len1 != len2 ||
3296 memcmp(path_before->start, path_after->start, len1)) {
3297 ret = 1;
3298 if (register_upper_dirs) {
3299 break;
3300 } else {
3301 register_upper_dirs = 1;
3302 ino = parent_ref->dir;
3303 gen = parent_ref->dir_gen;
3304 goto again;
3305 }
3306 } else if (register_upper_dirs) {
3307 ret = add_pending_dir_move(sctx, ino, gen,
3308 parent_ino_after);
3309 if (ret < 0 && ret != -EEXIST)
3310 goto out;
3311 }
3312
3313 ino = parent_ino_after;
3314 gen = parent_gen;
3315 }
3316
3071out: 3317out:
3072 fs_path_free(path_before); 3318 fs_path_free(path_before);
3073 fs_path_free(path_after); 3319 fs_path_free(path_after);
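
The rewritten tail of wait_for_parent_move() runs the ancestor walk in two passes: a detection pass climbing from the direct ancestor that looks for any higher-numbered ancestor whose first ref differs between the two trees, then, if one is found, a second pass restarted from parent_ref->dir with register_upper_dirs set, queueing a pending dir move for each unchanged ancestor on the way up. A loose condensation (DETECT/REGISTER, first_ref_differs() and parent_of() are hypothetical helpers):

        pass = DETECT;
        ino = parent_ino_after;
        while (ino > sctx->cur_ino) {
                if (first_ref_differs(ino)) {    /* moved or renamed */
                        if (pass == REGISTER)
                                break;           /* ret = 1: must wait */
                        pass = REGISTER;         /* restart, now recording */
                        ino = parent_ref->dir;
                        continue;
                }
                if (pass == REGISTER)
                        add_pending_dir_move(sctx, ino, gen, parent_of(ino));
                ino = parent_of(ino);
        }
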
@@ -3089,6 +3335,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
3089 u64 ow_gen; 3335 u64 ow_gen;
3090 int did_overwrite = 0; 3336 int did_overwrite = 0;
3091 int is_orphan = 0; 3337 int is_orphan = 0;
3338 u64 last_dir_ino_rm = 0;
3092 3339
3093verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); 3340verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3094 3341
@@ -3227,9 +3474,14 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3227 * dirs, we always have one new and one deleted 3474 * dirs, we always have one new and one deleted
3228 * ref. The deleted ref is ignored later. 3475 * ref. The deleted ref is ignored later.
3229 */ 3476 */
3230 if (wait_for_parent_move(sctx, cur)) { 3477 ret = wait_for_parent_move(sctx, cur);
3478 if (ret < 0)
3479 goto out;
3480 if (ret) {
3231 ret = add_pending_dir_move(sctx, 3481 ret = add_pending_dir_move(sctx,
3232 cur->dir); 3482 sctx->cur_ino,
3483 sctx->cur_inode_gen,
3484 cur->dir);
3233 *pending_move = 1; 3485 *pending_move = 1;
3234 } else { 3486 } else {
3235 ret = send_rename(sctx, valid_path, 3487 ret = send_rename(sctx, valid_path,
@@ -3259,7 +3511,8 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3259 * later, we do this check again and rmdir it then if possible. 3511 * later, we do this check again and rmdir it then if possible.
3260 * See the use of check_dirs for more details. 3512 * See the use of check_dirs for more details.
3261 */ 3513 */
3262 ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_ino); 3514 ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen,
3515 sctx->cur_ino);
3263 if (ret < 0) 3516 if (ret < 0)
3264 goto out; 3517 goto out;
3265 if (ret) { 3518 if (ret) {
@@ -3350,8 +3603,10 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3350 ret = send_utimes(sctx, cur->dir, cur->dir_gen); 3603 ret = send_utimes(sctx, cur->dir, cur->dir_gen);
3351 if (ret < 0) 3604 if (ret < 0)
3352 goto out; 3605 goto out;
3353 } else if (ret == inode_state_did_delete) { 3606 } else if (ret == inode_state_did_delete &&
3354 ret = can_rmdir(sctx, cur->dir, sctx->cur_ino); 3607 cur->dir != last_dir_ino_rm) {
3608 ret = can_rmdir(sctx, cur->dir, cur->dir_gen,
3609 sctx->cur_ino);
3355 if (ret < 0) 3610 if (ret < 0)
3356 goto out; 3611 goto out;
3357 if (ret) { 3612 if (ret) {
@@ -3362,6 +3617,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3362 ret = send_rmdir(sctx, valid_path); 3617 ret = send_rmdir(sctx, valid_path);
3363 if (ret < 0) 3618 if (ret < 0)
3364 goto out; 3619 goto out;
3620 last_dir_ino_rm = cur->dir;
3365 } 3621 }
3366 } 3622 }
3367 } 3623 }
@@ -3375,9 +3631,8 @@ out:
3375 return ret; 3631 return ret;
3376} 3632}
3377 3633
3378static int __record_new_ref(int num, u64 dir, int index, 3634static int record_ref(struct btrfs_root *root, int num, u64 dir, int index,
3379 struct fs_path *name, 3635 struct fs_path *name, void *ctx, struct list_head *refs)
3380 void *ctx)
3381{ 3636{
3382 int ret = 0; 3637 int ret = 0;
3383 struct send_ctx *sctx = ctx; 3638 struct send_ctx *sctx = ctx;
@@ -3388,7 +3643,7 @@ static int __record_new_ref(int num, u64 dir, int index,
3388 if (!p) 3643 if (!p)
3389 return -ENOMEM; 3644 return -ENOMEM;
3390 3645
3391 ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL, 3646 ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL,
3392 NULL, NULL); 3647 NULL, NULL);
3393 if (ret < 0) 3648 if (ret < 0)
3394 goto out; 3649 goto out;
@@ -3400,7 +3655,7 @@ static int __record_new_ref(int num, u64 dir, int index,
3400 if (ret < 0) 3655 if (ret < 0)
3401 goto out; 3656 goto out;
3402 3657
3403 ret = record_ref(&sctx->new_refs, dir, gen, p); 3658 ret = __record_ref(refs, dir, gen, p);
3404 3659
3405out: 3660out:
3406 if (ret) 3661 if (ret)
@@ -3408,37 +3663,23 @@ out:
3408 return ret; 3663 return ret;
3409} 3664}
3410 3665
3666static int __record_new_ref(int num, u64 dir, int index,
3667 struct fs_path *name,
3668 void *ctx)
3669{
3670 struct send_ctx *sctx = ctx;
3671 return record_ref(sctx->send_root, num, dir, index, name,
3672 ctx, &sctx->new_refs);
3673}
3674
3675
3411static int __record_deleted_ref(int num, u64 dir, int index, 3676static int __record_deleted_ref(int num, u64 dir, int index,
3412 struct fs_path *name, 3677 struct fs_path *name,
3413 void *ctx) 3678 void *ctx)
3414{ 3679{
3415 int ret = 0;
3416 struct send_ctx *sctx = ctx; 3680 struct send_ctx *sctx = ctx;
3417 struct fs_path *p; 3681 return record_ref(sctx->parent_root, num, dir, index, name,
3418 u64 gen; 3682 ctx, &sctx->deleted_refs);
3419
3420 p = fs_path_alloc();
3421 if (!p)
3422 return -ENOMEM;
3423
3424 ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL,
3425 NULL, NULL);
3426 if (ret < 0)
3427 goto out;
3428
3429 ret = get_cur_path(sctx, dir, gen, p);
3430 if (ret < 0)
3431 goto out;
3432 ret = fs_path_add_path(p, name);
3433 if (ret < 0)
3434 goto out;
3435
3436 ret = record_ref(&sctx->deleted_refs, dir, gen, p);
3437
3438out:
3439 if (ret)
3440 fs_path_free(p);
3441 return ret;
3442} 3683}
3443 3684
3444static int record_new_ref(struct send_ctx *sctx) 3685static int record_new_ref(struct send_ctx *sctx)
@@ -3619,21 +3860,31 @@ static int process_all_refs(struct send_ctx *sctx,
3619 root = sctx->parent_root; 3860 root = sctx->parent_root;
3620 cb = __record_deleted_ref; 3861 cb = __record_deleted_ref;
3621 } else { 3862 } else {
3622 BUG(); 3863 btrfs_err(sctx->send_root->fs_info,
3864 "Wrong command %d in process_all_refs", cmd);
3865 ret = -EINVAL;
3866 goto out;
3623 } 3867 }
3624 3868
3625 key.objectid = sctx->cmp_key->objectid; 3869 key.objectid = sctx->cmp_key->objectid;
3626 key.type = BTRFS_INODE_REF_KEY; 3870 key.type = BTRFS_INODE_REF_KEY;
3627 key.offset = 0; 3871 key.offset = 0;
3628 while (1) { 3872 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3629 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 3873 if (ret < 0)
3630 if (ret < 0) 3874 goto out;
3631 goto out;
3632 if (ret)
3633 break;
3634 3875
3876 while (1) {
3635 eb = path->nodes[0]; 3877 eb = path->nodes[0];
3636 slot = path->slots[0]; 3878 slot = path->slots[0];
3879 if (slot >= btrfs_header_nritems(eb)) {
3880 ret = btrfs_next_leaf(root, path);
3881 if (ret < 0)
3882 goto out;
3883 else if (ret > 0)
3884 break;
3885 continue;
3886 }
3887
3637 btrfs_item_key_to_cpu(eb, &found_key, slot); 3888 btrfs_item_key_to_cpu(eb, &found_key, slot);
3638 3889
3639 if (found_key.objectid != key.objectid || 3890 if (found_key.objectid != key.objectid ||
@@ -3642,11 +3893,10 @@ static int process_all_refs(struct send_ctx *sctx,
3642 break; 3893 break;
3643 3894
3644 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); 3895 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
3645 btrfs_release_path(path);
3646 if (ret < 0) 3896 if (ret < 0)
3647 goto out; 3897 goto out;
3648 3898
3649 key.offset = found_key.offset + 1; 3899 path->slots[0]++;
3650 } 3900 }
3651 btrfs_release_path(path); 3901 btrfs_release_path(path);
3652 3902
@@ -3927,19 +4177,25 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
3927 key.objectid = sctx->cmp_key->objectid; 4177 key.objectid = sctx->cmp_key->objectid;
3928 key.type = BTRFS_XATTR_ITEM_KEY; 4178 key.type = BTRFS_XATTR_ITEM_KEY;
3929 key.offset = 0; 4179 key.offset = 0;
3930 while (1) { 4180 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3931 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 4181 if (ret < 0)
3932 if (ret < 0) 4182 goto out;
3933 goto out;
3934 if (ret) {
3935 ret = 0;
3936 goto out;
3937 }
3938 4183
4184 while (1) {
3939 eb = path->nodes[0]; 4185 eb = path->nodes[0];
3940 slot = path->slots[0]; 4186 slot = path->slots[0];
3941 btrfs_item_key_to_cpu(eb, &found_key, slot); 4187 if (slot >= btrfs_header_nritems(eb)) {
4188 ret = btrfs_next_leaf(root, path);
4189 if (ret < 0) {
4190 goto out;
4191 } else if (ret > 0) {
4192 ret = 0;
4193 break;
4194 }
4195 continue;
4196 }
3942 4197
4198 btrfs_item_key_to_cpu(eb, &found_key, slot);
3943 if (found_key.objectid != key.objectid || 4199 if (found_key.objectid != key.objectid ||
3944 found_key.type != key.type) { 4200 found_key.type != key.type) {
3945 ret = 0; 4201 ret = 0;
@@ -3951,8 +4207,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
3951 if (ret < 0) 4207 if (ret < 0)
3952 goto out; 4208 goto out;
3953 4209
3954 btrfs_release_path(path); 4210 path->slots[0]++;
3955 key.offset = found_key.offset + 1;
3956 } 4211 }
3957 4212
3958out: 4213out:
@@ -3991,6 +4246,13 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
3991 goto out; 4246 goto out;
3992 4247
3993 last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT; 4248 last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT;
4249
4250 /* initial readahead */
4251 memset(&sctx->ra, 0, sizeof(struct file_ra_state));
4252 file_ra_state_init(&sctx->ra, inode->i_mapping);
4253 btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index,
4254 last_index - index + 1);
4255
3994 while (index <= last_index) { 4256 while (index <= last_index) {
3995 unsigned cur_len = min_t(unsigned, len, 4257 unsigned cur_len = min_t(unsigned, len,
3996 PAGE_CACHE_SIZE - pg_offset); 4258 PAGE_CACHE_SIZE - pg_offset);
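
Priming readahead once for the whole [index, last_index] range lets the per-page loop find its pages already in flight instead of faulting them in one at a time. A hedged userspace analog of the same hint, via posix_fadvise():

        #include <fcntl.h>

        /* Best-effort hint before a sequential copy loop; the loop
         * works either way, so the return value can be ignored. */
        static void prime_readahead(int fd, off_t offset, off_t len)
        {
                (void)posix_fadvise(fd, offset, len, POSIX_FADV_WILLNEED);
        }
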
@@ -4174,6 +4436,9 @@ static int send_hole(struct send_ctx *sctx, u64 end)
4174 p = fs_path_alloc(); 4436 p = fs_path_alloc();
4175 if (!p) 4437 if (!p)
4176 return -ENOMEM; 4438 return -ENOMEM;
4439 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
4440 if (ret < 0)
4441 goto tlv_put_failure;
4177 memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE); 4442 memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE);
4178 while (offset < end) { 4443 while (offset < end) {
4179 len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE); 4444 len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE);
@@ -4181,9 +4446,6 @@ static int send_hole(struct send_ctx *sctx, u64 end)
4181 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); 4446 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
4182 if (ret < 0) 4447 if (ret < 0)
4183 break; 4448 break;
4184 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
4185 if (ret < 0)
4186 break;
4187 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 4449 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
4188 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); 4450 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
4189 TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len); 4451 TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len);
@@ -4724,7 +4986,9 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
4724 4986
4725 if (S_ISREG(sctx->cur_inode_mode)) { 4987 if (S_ISREG(sctx->cur_inode_mode)) {
4726 if (need_send_hole(sctx)) { 4988 if (need_send_hole(sctx)) {
4727 if (sctx->cur_inode_last_extent == (u64)-1) { 4989 if (sctx->cur_inode_last_extent == (u64)-1 ||
4990 sctx->cur_inode_last_extent <
4991 sctx->cur_inode_size) {
4728 ret = get_last_extent(sctx, (u64)-1); 4992 ret = get_last_extent(sctx, (u64)-1);
4729 if (ret) 4993 if (ret)
4730 goto out; 4994 goto out;
@@ -4763,18 +5027,19 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
4763 ret = apply_children_dir_moves(sctx); 5027 ret = apply_children_dir_moves(sctx);
4764 if (ret) 5028 if (ret)
4765 goto out; 5029 goto out;
5030 /*
5031 * We need to send the utimes every time, no matter if they
5032 * actually changed between the two trees, as we have made changes
5033 * to the inode before. If our inode is a directory and it is
5034 * waiting to be moved/renamed, we will send its utimes when it is
5035 * moved/renamed, so we don't need to do it here.
5036 */
5037 sctx->send_progress = sctx->cur_ino + 1;
5038 ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
5039 if (ret < 0)
5040 goto out;
4766 } 5041 }
4767 5042
4768 /*
4769 * Need to send that every time, no matter if it actually
4770 * changed between the two trees as we have done changes to
4771 * the inode before.
4772 */
4773 sctx->send_progress = sctx->cur_ino + 1;
4774 ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
4775 if (ret < 0)
4776 goto out;
4777
4778out: 5043out:
4779 return ret; 5044 return ret;
4780} 5045}
@@ -4840,6 +5105,8 @@ static int changed_inode(struct send_ctx *sctx,
4840 sctx->left_path->nodes[0], left_ii); 5105 sctx->left_path->nodes[0], left_ii);
4841 sctx->cur_inode_mode = btrfs_inode_mode( 5106 sctx->cur_inode_mode = btrfs_inode_mode(
4842 sctx->left_path->nodes[0], left_ii); 5107 sctx->left_path->nodes[0], left_ii);
5108 sctx->cur_inode_rdev = btrfs_inode_rdev(
5109 sctx->left_path->nodes[0], left_ii);
4843 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) 5110 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
4844 ret = send_create_inode_if_needed(sctx); 5111 ret = send_create_inode_if_needed(sctx);
4845 } else if (result == BTRFS_COMPARE_TREE_DELETED) { 5112 } else if (result == BTRFS_COMPARE_TREE_DELETED) {
@@ -4884,6 +5151,8 @@ static int changed_inode(struct send_ctx *sctx,
4884 sctx->left_path->nodes[0], left_ii); 5151 sctx->left_path->nodes[0], left_ii);
4885 sctx->cur_inode_mode = btrfs_inode_mode( 5152 sctx->cur_inode_mode = btrfs_inode_mode(
4886 sctx->left_path->nodes[0], left_ii); 5153 sctx->left_path->nodes[0], left_ii);
5154 sctx->cur_inode_rdev = btrfs_inode_rdev(
5155 sctx->left_path->nodes[0], left_ii);
4887 ret = send_create_inode_if_needed(sctx); 5156 ret = send_create_inode_if_needed(sctx);
4888 if (ret < 0) 5157 if (ret < 0)
4889 goto out; 5158 goto out;
@@ -5124,37 +5393,15 @@ static int full_send_tree(struct send_ctx *sctx)
5124 struct btrfs_path *path; 5393 struct btrfs_path *path;
5125 struct extent_buffer *eb; 5394 struct extent_buffer *eb;
5126 int slot; 5395 int slot;
5127 u64 start_ctransid;
5128 u64 ctransid;
5129 5396
5130 path = alloc_path_for_send(); 5397 path = alloc_path_for_send();
5131 if (!path) 5398 if (!path)
5132 return -ENOMEM; 5399 return -ENOMEM;
5133 5400
5134 spin_lock(&send_root->root_item_lock);
5135 start_ctransid = btrfs_root_ctransid(&send_root->root_item);
5136 spin_unlock(&send_root->root_item_lock);
5137
5138 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 5401 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
5139 key.type = BTRFS_INODE_ITEM_KEY; 5402 key.type = BTRFS_INODE_ITEM_KEY;
5140 key.offset = 0; 5403 key.offset = 0;
5141 5404
5142 /*
5143 * Make sure the tree has not changed after re-joining. We detect this
5144 * by comparing start_ctransid and ctransid. They should always match.
5145 */
5146 spin_lock(&send_root->root_item_lock);
5147 ctransid = btrfs_root_ctransid(&send_root->root_item);
5148 spin_unlock(&send_root->root_item_lock);
5149
5150 if (ctransid != start_ctransid) {
5151 WARN(1, KERN_WARNING "BTRFS: the root that you're trying to "
5152 "send was modified in between. This is "
5153 "probably a bug.\n");
5154 ret = -EIO;
5155 goto out;
5156 }
5157
5158 ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0); 5405 ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
5159 if (ret < 0) 5406 if (ret < 0)
5160 goto out; 5407 goto out;
@@ -5340,6 +5587,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5340 5587
5341 sctx->pending_dir_moves = RB_ROOT; 5588 sctx->pending_dir_moves = RB_ROOT;
5342 sctx->waiting_dir_moves = RB_ROOT; 5589 sctx->waiting_dir_moves = RB_ROOT;
5590 sctx->orphan_dirs = RB_ROOT;
5343 5591
5344 sctx->clone_roots = vzalloc(sizeof(struct clone_root) * 5592 sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
5345 (arg->clone_sources_count + 1)); 5593 (arg->clone_sources_count + 1));
@@ -5435,7 +5683,9 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5435 NULL); 5683 NULL);
5436 sort_clone_roots = 1; 5684 sort_clone_roots = 1;
5437 5685
5686 current->journal_info = (void *)BTRFS_SEND_TRANS_STUB;
5438 ret = send_subvol(sctx); 5687 ret = send_subvol(sctx);
5688 current->journal_info = NULL;
5439 if (ret < 0) 5689 if (ret < 0)
5440 goto out; 5690 goto out;
5441 5691
@@ -5477,6 +5727,16 @@ out:
5477 kfree(dm); 5727 kfree(dm);
5478 } 5728 }
5479 5729
5730 WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
5731 while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
5732 struct rb_node *n;
5733 struct orphan_dir_info *odi;
5734
5735 n = rb_first(&sctx->orphan_dirs);
5736 odi = rb_entry(n, struct orphan_dir_info, node);
5737 free_orphan_dir_info(sctx, odi);
5738 }
5739
5480 if (sort_clone_roots) { 5740 if (sort_clone_roots) {
5481 for (i = 0; i < sctx->clone_roots_cnt; i++) 5741 for (i = 0; i < sctx->clone_roots_cnt; i++)
5482 btrfs_root_dec_send_in_progress( 5742 btrfs_root_dec_send_in_progress(
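The teardown loop above drains the new orphan_dirs rbtree when a send finishes. A minimal sketch of the pattern, assuming free_orphan_dir_info() amounts to an rb_erase() plus kfree() (its body is not part of this section):

	while (!RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
		struct rb_node *n = rb_first(&sctx->orphan_dirs);
		struct orphan_dir_info *odi;

		odi = rb_entry(n, struct orphan_dir_info, node);
		rb_erase(n, &sctx->orphan_dirs);	/* assumed behaviour of */
		kfree(odi);				/* free_orphan_dir_info() */
	}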
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d04db817be5c..5011aadacab8 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -66,6 +66,8 @@
66static const struct super_operations btrfs_super_ops; 66static const struct super_operations btrfs_super_ops;
67static struct file_system_type btrfs_fs_type; 67static struct file_system_type btrfs_fs_type;
68 68
69static int btrfs_remount(struct super_block *sb, int *flags, char *data);
70
69static const char *btrfs_decode_error(int errno) 71static const char *btrfs_decode_error(int errno)
70{ 72{
71 char *errstr = "unknown"; 73 char *errstr = "unknown";
@@ -1185,6 +1187,26 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
1185 mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, 1187 mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
1186 newargs); 1188 newargs);
1187 kfree(newargs); 1189 kfree(newargs);
1190
1191 if (PTR_RET(mnt) == -EBUSY) {
1192 if (flags & MS_RDONLY) {
1193 mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name,
1194 newargs);
1195 } else {
1196 int r;
1197 mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name,
1198 newargs);
1199 if (IS_ERR(mnt))
1200 return ERR_CAST(mnt);
1201
1202 r = btrfs_remount(mnt->mnt_sb, &flags, NULL);
1203 if (r < 0) {
1204 /* FIXME: release vfsmount mnt ??*/
1205 return ERR_PTR(r);
1206 }
1207 }
1208 }
1209
1188 if (IS_ERR(mnt)) 1210 if (IS_ERR(mnt))
1189 return ERR_CAST(mnt); 1211 return ERR_CAST(mnt);
1190 1212
@@ -1305,13 +1327,6 @@ error_fs_info:
1305 return ERR_PTR(error); 1327 return ERR_PTR(error);
1306} 1328}
1307 1329
1308static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit)
1309{
1310 spin_lock_irq(&workers->lock);
1311 workers->max_workers = new_limit;
1312 spin_unlock_irq(&workers->lock);
1313}
1314
1315static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, 1330static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1316 int new_pool_size, int old_pool_size) 1331 int new_pool_size, int old_pool_size)
1317{ 1332{
@@ -1323,21 +1338,20 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1323 btrfs_info(fs_info, "resize thread pool %d -> %d", 1338 btrfs_info(fs_info, "resize thread pool %d -> %d",
1324 old_pool_size, new_pool_size); 1339 old_pool_size, new_pool_size);
1325 1340
1326 btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size); 1341 btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
1327 btrfs_set_max_workers(&fs_info->workers, new_pool_size); 1342 btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
1328 btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size); 1343 btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size);
1329 btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size); 1344 btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
1330 btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size); 1345 btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
1331 btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size); 1346 btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
1332 btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size); 1347 btrfs_workqueue_set_max(fs_info->endio_meta_write_workers,
1333 btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size); 1348 new_pool_size);
1334 btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size); 1349 btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
1335 btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size); 1350 btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
1336 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); 1351 btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
1337 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); 1352 btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size);
1338 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); 1353 btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
1339 btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers, 1354 new_pool_size);
1340 new_pool_size);
1341} 1355}
1342 1356
1343static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info) 1357static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info)
@@ -1388,6 +1402,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1388 unsigned int old_metadata_ratio = fs_info->metadata_ratio; 1402 unsigned int old_metadata_ratio = fs_info->metadata_ratio;
1389 int ret; 1403 int ret;
1390 1404
1405 sync_filesystem(sb);
1391 btrfs_remount_prepare(fs_info); 1406 btrfs_remount_prepare(fs_info);
1392 1407
1393 ret = btrfs_parse_options(root, data); 1408 ret = btrfs_parse_options(root, data);
@@ -1479,6 +1494,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1479 sb->s_flags &= ~MS_RDONLY; 1494 sb->s_flags &= ~MS_RDONLY;
1480 } 1495 }
1481out: 1496out:
1497 wake_up_process(fs_info->transaction_kthread);
1482 btrfs_remount_cleanup(fs_info, old_opts); 1498 btrfs_remount_cleanup(fs_info, old_opts);
1483 return 0; 1499 return 0;
1484 1500
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 865f4cf9a769..c5eb2143dc66 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -24,6 +24,7 @@
24#include <linux/kobject.h> 24#include <linux/kobject.h>
25#include <linux/bug.h> 25#include <linux/bug.h>
26#include <linux/genhd.h> 26#include <linux/genhd.h>
27#include <linux/debugfs.h>
27 28
28#include "ctree.h" 29#include "ctree.h"
29#include "disk-io.h" 30#include "disk-io.h"
@@ -599,6 +600,12 @@ static int add_device_membership(struct btrfs_fs_info *fs_info)
599/* /sys/fs/btrfs/ entry */ 600/* /sys/fs/btrfs/ entry */
600static struct kset *btrfs_kset; 601static struct kset *btrfs_kset;
601 602
603/* /sys/kernel/debug/btrfs */
604static struct dentry *btrfs_debugfs_root_dentry;
605
606/* Debugging tunables and exported data */
607u64 btrfs_debugfs_test;
608
602int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) 609int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
603{ 610{
604 int error; 611 int error;
@@ -642,27 +649,41 @@ failure:
642 return error; 649 return error;
643} 650}
644 651
652static int btrfs_init_debugfs(void)
653{
654#ifdef CONFIG_DEBUG_FS
655 btrfs_debugfs_root_dentry = debugfs_create_dir("btrfs", NULL);
656 if (!btrfs_debugfs_root_dentry)
657 return -ENOMEM;
658
659 debugfs_create_u64("test", S_IRUGO | S_IWUGO, btrfs_debugfs_root_dentry,
660 &btrfs_debugfs_test);
661#endif
662 return 0;
663}
664
645int btrfs_init_sysfs(void) 665int btrfs_init_sysfs(void)
646{ 666{
647 int ret; 667 int ret;
668
648 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); 669 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
649 if (!btrfs_kset) 670 if (!btrfs_kset)
650 return -ENOMEM; 671 return -ENOMEM;
651 672
652 init_feature_attrs(); 673 ret = btrfs_init_debugfs();
674 if (ret)
675 return ret;
653 676
677 init_feature_attrs();
654 ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); 678 ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
655 if (ret) {
656 kset_unregister(btrfs_kset);
657 return ret;
658 }
659 679
660 return 0; 680 return ret;
661} 681}
662 682
663void btrfs_exit_sysfs(void) 683void btrfs_exit_sysfs(void)
664{ 684{
665 sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); 685 sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
666 kset_unregister(btrfs_kset); 686 kset_unregister(btrfs_kset);
687 debugfs_remove_recursive(btrfs_debugfs_root_dentry);
667} 688}
668 689
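With CONFIG_DEBUG_FS enabled, the hunk above exposes a single world-writable u64 tunable at /sys/kernel/debug/btrfs/test; with it disabled, btrfs_init_debugfs() compiles to a no-op returning 0, and the debugfs_remove_recursive() call at exit is safe on the NULL dentry.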
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index f3cea3710d44..9ab576318a84 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -1,6 +1,11 @@
1#ifndef _BTRFS_SYSFS_H_ 1#ifndef _BTRFS_SYSFS_H_
2#define _BTRFS_SYSFS_H_ 2#define _BTRFS_SYSFS_H_
3 3
4/*
5 * Data exported through sysfs
6 */
7extern u64 btrfs_debugfs_test;
8
4enum btrfs_feature_set { 9enum btrfs_feature_set {
5 FEAT_COMPAT, 10 FEAT_COMPAT,
6 FEAT_COMPAT_RO, 11 FEAT_COMPAT_RO,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 34cd83184c4a..7579f6d0b854 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -75,10 +75,21 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
75 } 75 }
76} 76}
77 77
78static noinline void switch_commit_root(struct btrfs_root *root) 78static noinline void switch_commit_roots(struct btrfs_transaction *trans,
79 struct btrfs_fs_info *fs_info)
79{ 80{
80 free_extent_buffer(root->commit_root); 81 struct btrfs_root *root, *tmp;
81 root->commit_root = btrfs_root_node(root); 82
83 down_write(&fs_info->commit_root_sem);
84 list_for_each_entry_safe(root, tmp, &trans->switch_commits,
85 dirty_list) {
86 list_del_init(&root->dirty_list);
87 free_extent_buffer(root->commit_root);
88 root->commit_root = btrfs_root_node(root);
89 if (is_fstree(root->objectid))
90 btrfs_unpin_free_ino(root);
91 }
92 up_write(&fs_info->commit_root_sem);
82} 93}
83 94
84static inline void extwriter_counter_inc(struct btrfs_transaction *trans, 95static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
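The shape of the new commit-root switching: producers queue each dirty root on the per-transaction switch_commits list through root->dirty_list, and the commit path flips every queued root in one commit_root_sem critical section, replacing the per-root switch_commit_root() calls removed in the hunks below. Condensed to the two sides, using only calls visible in this patch:

	/* producer side (commit_fs_roots(), commit_cowonly_roots(), ...): */
	list_add_tail(&root->dirty_list, &trans->transaction->switch_commits);

	/* consumer side, once per commit, under the new semaphore: */
	down_write(&fs_info->commit_root_sem);
	list_for_each_entry_safe(root, tmp, &trans->switch_commits, dirty_list) {
		list_del_init(&root->dirty_list);
		free_extent_buffer(root->commit_root);
		root->commit_root = btrfs_root_node(root);
	}
	up_write(&fs_info->commit_root_sem);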
@@ -208,6 +219,7 @@ loop:
208 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 219 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
209 INIT_LIST_HEAD(&cur_trans->ordered_operations); 220 INIT_LIST_HEAD(&cur_trans->ordered_operations);
210 INIT_LIST_HEAD(&cur_trans->pending_chunks); 221 INIT_LIST_HEAD(&cur_trans->pending_chunks);
222 INIT_LIST_HEAD(&cur_trans->switch_commits);
211 list_add_tail(&cur_trans->list, &fs_info->trans_list); 223 list_add_tail(&cur_trans->list, &fs_info->trans_list);
212 extent_io_tree_init(&cur_trans->dirty_pages, 224 extent_io_tree_init(&cur_trans->dirty_pages,
213 fs_info->btree_inode->i_mapping); 225 fs_info->btree_inode->i_mapping);
@@ -375,7 +387,8 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
375 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) 387 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
376 return ERR_PTR(-EROFS); 388 return ERR_PTR(-EROFS);
377 389
378 if (current->journal_info) { 390 if (current->journal_info &&
391 current->journal_info != (void *)BTRFS_SEND_TRANS_STUB) {
379 WARN_ON(type & TRANS_EXTWRITERS); 392 WARN_ON(type & TRANS_EXTWRITERS);
380 h = current->journal_info; 393 h = current->journal_info;
381 h->use_count++; 394 h->use_count++;
@@ -683,7 +696,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
683 int lock = (trans->type != TRANS_JOIN_NOLOCK); 696 int lock = (trans->type != TRANS_JOIN_NOLOCK);
684 int err = 0; 697 int err = 0;
685 698
686 if (--trans->use_count) { 699 if (trans->use_count > 1) {
700 trans->use_count--;
687 trans->block_rsv = trans->orig_rsv; 701 trans->block_rsv = trans->orig_rsv;
688 return 0; 702 return 0;
689 } 703 }
@@ -731,17 +745,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
731 } 745 }
732 746
733 if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) { 747 if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
734 if (throttle) { 748 if (throttle)
735 /*
736 * We may race with somebody else here so end up having
737 * to call end_transaction on ourselves again, so inc
738 * our use_count.
739 */
740 trans->use_count++;
741 return btrfs_commit_transaction(trans, root); 749 return btrfs_commit_transaction(trans, root);
742 } else { 750 else
743 wake_up_process(info->transaction_kthread); 751 wake_up_process(info->transaction_kthread);
744 }
745 } 752 }
746 753
747 if (trans->type & __TRANS_FREEZABLE) 754 if (trans->type & __TRANS_FREEZABLE)
@@ -925,9 +932,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
925 return ret; 932 return ret;
926 } 933 }
927 934
928 if (root != root->fs_info->extent_root)
929 switch_commit_root(root);
930
931 return 0; 935 return 0;
932} 936}
933 937
@@ -983,15 +987,16 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
983 list_del_init(next); 987 list_del_init(next);
984 root = list_entry(next, struct btrfs_root, dirty_list); 988 root = list_entry(next, struct btrfs_root, dirty_list);
985 989
990 if (root != fs_info->extent_root)
991 list_add_tail(&root->dirty_list,
992 &trans->transaction->switch_commits);
986 ret = update_cowonly_root(trans, root); 993 ret = update_cowonly_root(trans, root);
987 if (ret) 994 if (ret)
988 return ret; 995 return ret;
989 } 996 }
990 997
991 down_write(&fs_info->extent_commit_sem); 998 list_add_tail(&fs_info->extent_root->dirty_list,
992 switch_commit_root(fs_info->extent_root); 999 &trans->transaction->switch_commits);
993 up_write(&fs_info->extent_commit_sem);
994
995 btrfs_after_dev_replace_commit(fs_info); 1000 btrfs_after_dev_replace_commit(fs_info);
996 1001
997 return 0; 1002 return 0;
@@ -1048,11 +1053,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
1048 smp_wmb(); 1053 smp_wmb();
1049 1054
1050 if (root->commit_root != root->node) { 1055 if (root->commit_root != root->node) {
1051 mutex_lock(&root->fs_commit_mutex); 1056 list_add_tail(&root->dirty_list,
1052 switch_commit_root(root); 1057 &trans->transaction->switch_commits);
1053 btrfs_unpin_free_ino(root);
1054 mutex_unlock(&root->fs_commit_mutex);
1055
1056 btrfs_set_root_node(&root->root_item, 1058 btrfs_set_root_node(&root->root_item,
1057 root->node); 1059 root->node);
1058 } 1060 }
@@ -1578,10 +1580,9 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1578 1580
1579 trace_btrfs_transaction_commit(root); 1581 trace_btrfs_transaction_commit(root);
1580 1582
1581 btrfs_scrub_continue(root);
1582
1583 if (current->journal_info == trans) 1583 if (current->journal_info == trans)
1584 current->journal_info = NULL; 1584 current->journal_info = NULL;
1585 btrfs_scrub_cancel(root->fs_info);
1585 1586
1586 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1587 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1587} 1588}
@@ -1621,7 +1622,7 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1621static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) 1622static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
1622{ 1623{
1623 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) 1624 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
1624 return btrfs_start_delalloc_roots(fs_info, 1); 1625 return btrfs_start_delalloc_roots(fs_info, 1, -1);
1625 return 0; 1626 return 0;
1626} 1627}
1627 1628
@@ -1754,7 +1755,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1754 /* ->aborted might be set after the previous check, so check it */ 1755 /* ->aborted might be set after the previous check, so check it */
1755 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { 1756 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
1756 ret = cur_trans->aborted; 1757 ret = cur_trans->aborted;
1757 goto cleanup_transaction; 1758 goto scrub_continue;
1758 } 1759 }
1759 /* 1760 /*
1760 * the reloc mutex makes sure that we stop 1761 * the reloc mutex makes sure that we stop
@@ -1771,7 +1772,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1771 ret = create_pending_snapshots(trans, root->fs_info); 1772 ret = create_pending_snapshots(trans, root->fs_info);
1772 if (ret) { 1773 if (ret) {
1773 mutex_unlock(&root->fs_info->reloc_mutex); 1774 mutex_unlock(&root->fs_info->reloc_mutex);
1774 goto cleanup_transaction; 1775 goto scrub_continue;
1775 } 1776 }
1776 1777
1777 /* 1778 /*
@@ -1787,13 +1788,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1787 ret = btrfs_run_delayed_items(trans, root); 1788 ret = btrfs_run_delayed_items(trans, root);
1788 if (ret) { 1789 if (ret) {
1789 mutex_unlock(&root->fs_info->reloc_mutex); 1790 mutex_unlock(&root->fs_info->reloc_mutex);
1790 goto cleanup_transaction; 1791 goto scrub_continue;
1791 } 1792 }
1792 1793
1793 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); 1794 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1794 if (ret) { 1795 if (ret) {
1795 mutex_unlock(&root->fs_info->reloc_mutex); 1796 mutex_unlock(&root->fs_info->reloc_mutex);
1796 goto cleanup_transaction; 1797 goto scrub_continue;
1797 } 1798 }
1798 1799
1799 /* 1800 /*
@@ -1823,7 +1824,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1823 if (ret) { 1824 if (ret) {
1824 mutex_unlock(&root->fs_info->tree_log_mutex); 1825 mutex_unlock(&root->fs_info->tree_log_mutex);
1825 mutex_unlock(&root->fs_info->reloc_mutex); 1826 mutex_unlock(&root->fs_info->reloc_mutex);
1826 goto cleanup_transaction; 1827 goto scrub_continue;
1827 } 1828 }
1828 1829
1829 /* 1830 /*
@@ -1844,7 +1845,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1844 if (ret) { 1845 if (ret) {
1845 mutex_unlock(&root->fs_info->tree_log_mutex); 1846 mutex_unlock(&root->fs_info->tree_log_mutex);
1846 mutex_unlock(&root->fs_info->reloc_mutex); 1847 mutex_unlock(&root->fs_info->reloc_mutex);
1847 goto cleanup_transaction; 1848 goto scrub_continue;
1848 } 1849 }
1849 1850
1850 /* 1851 /*
@@ -1855,7 +1856,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1855 ret = cur_trans->aborted; 1856 ret = cur_trans->aborted;
1856 mutex_unlock(&root->fs_info->tree_log_mutex); 1857 mutex_unlock(&root->fs_info->tree_log_mutex);
1857 mutex_unlock(&root->fs_info->reloc_mutex); 1858 mutex_unlock(&root->fs_info->reloc_mutex);
1858 goto cleanup_transaction; 1859 goto scrub_continue;
1859 } 1860 }
1860 1861
1861 btrfs_prepare_extent_commit(trans, root); 1862 btrfs_prepare_extent_commit(trans, root);
@@ -1864,11 +1865,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1864 1865
1865 btrfs_set_root_node(&root->fs_info->tree_root->root_item, 1866 btrfs_set_root_node(&root->fs_info->tree_root->root_item,
1866 root->fs_info->tree_root->node); 1867 root->fs_info->tree_root->node);
1867 switch_commit_root(root->fs_info->tree_root); 1868 list_add_tail(&root->fs_info->tree_root->dirty_list,
1869 &cur_trans->switch_commits);
1868 1870
1869 btrfs_set_root_node(&root->fs_info->chunk_root->root_item, 1871 btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
1870 root->fs_info->chunk_root->node); 1872 root->fs_info->chunk_root->node);
1871 switch_commit_root(root->fs_info->chunk_root); 1873 list_add_tail(&root->fs_info->chunk_root->dirty_list,
1874 &cur_trans->switch_commits);
1875
1876 switch_commit_roots(cur_trans, root->fs_info);
1872 1877
1873 assert_qgroups_uptodate(trans); 1878 assert_qgroups_uptodate(trans);
1874 update_super_roots(root); 1879 update_super_roots(root);
@@ -1891,13 +1896,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1891 btrfs_error(root->fs_info, ret, 1896 btrfs_error(root->fs_info, ret,
1892 "Error while writing out transaction"); 1897 "Error while writing out transaction");
1893 mutex_unlock(&root->fs_info->tree_log_mutex); 1898 mutex_unlock(&root->fs_info->tree_log_mutex);
1894 goto cleanup_transaction; 1899 goto scrub_continue;
1895 } 1900 }
1896 1901
1897 ret = write_ctree_super(trans, root, 0); 1902 ret = write_ctree_super(trans, root, 0);
1898 if (ret) { 1903 if (ret) {
1899 mutex_unlock(&root->fs_info->tree_log_mutex); 1904 mutex_unlock(&root->fs_info->tree_log_mutex);
1900 goto cleanup_transaction; 1905 goto scrub_continue;
1901 } 1906 }
1902 1907
1903 /* 1908 /*
@@ -1940,6 +1945,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1940 1945
1941 return ret; 1946 return ret;
1942 1947
1948scrub_continue:
1949 btrfs_scrub_continue(root);
1943cleanup_transaction: 1950cleanup_transaction:
1944 btrfs_trans_release_metadata(trans, root); 1951 btrfs_trans_release_metadata(trans, root);
1945 trans->block_rsv = NULL; 1952 trans->block_rsv = NULL;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 6ac037e9f9f0..b57b924e8e03 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -57,6 +57,7 @@ struct btrfs_transaction {
57 struct list_head pending_snapshots; 57 struct list_head pending_snapshots;
58 struct list_head ordered_operations; 58 struct list_head ordered_operations;
59 struct list_head pending_chunks; 59 struct list_head pending_chunks;
60 struct list_head switch_commits;
60 struct btrfs_delayed_ref_root delayed_refs; 61 struct btrfs_delayed_ref_root delayed_refs;
61 int aborted; 62 int aborted;
62}; 63};
@@ -78,6 +79,8 @@ struct btrfs_transaction {
78#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \ 79#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \
79 __TRANS_ATTACH) 80 __TRANS_ATTACH)
80 81
82#define BTRFS_SEND_TRANS_STUB 1
83
81struct btrfs_trans_handle { 84struct btrfs_trans_handle {
82 u64 transid; 85 u64 transid;
83 u64 bytes_reserved; 86 u64 bytes_reserved;
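BTRFS_SEND_TRANS_STUB is a sentinel, not a handle: btrfs_ioctl_send() (see the send.c hunk above) tags the task so that the nested-handle check in start_transaction() does not mistake an in-progress send for a running transaction. The whole contract fits in three lines:

	current->journal_info = (void *)BTRFS_SEND_TRANS_STUB;
	ret = send_subvol(sctx);	/* may legitimately start transactions */
	current->journal_info = NULL;

start_transaction() then reuses current->journal_info as a handle only when it is set and is not the stub.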
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 39d83da03e03..e2f45fc02610 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -136,13 +136,20 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
136 * syncing the tree wait for us to finish 136 * syncing the tree wait for us to finish
137 */ 137 */
138static int start_log_trans(struct btrfs_trans_handle *trans, 138static int start_log_trans(struct btrfs_trans_handle *trans,
139 struct btrfs_root *root) 139 struct btrfs_root *root,
140 struct btrfs_log_ctx *ctx)
140{ 141{
142 int index;
141 int ret; 143 int ret;
142 int err = 0;
143 144
144 mutex_lock(&root->log_mutex); 145 mutex_lock(&root->log_mutex);
145 if (root->log_root) { 146 if (root->log_root) {
147 if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
148 trans->transid) {
149 ret = -EAGAIN;
150 goto out;
151 }
152
146 if (!root->log_start_pid) { 153 if (!root->log_start_pid) {
147 root->log_start_pid = current->pid; 154 root->log_start_pid = current->pid;
148 root->log_multiple_pids = false; 155 root->log_multiple_pids = false;
@@ -152,27 +159,40 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
152 159
153 atomic_inc(&root->log_batch); 160 atomic_inc(&root->log_batch);
154 atomic_inc(&root->log_writers); 161 atomic_inc(&root->log_writers);
162 if (ctx) {
163 index = root->log_transid % 2;
164 list_add_tail(&ctx->list, &root->log_ctxs[index]);
165 ctx->log_transid = root->log_transid;
166 }
155 mutex_unlock(&root->log_mutex); 167 mutex_unlock(&root->log_mutex);
156 return 0; 168 return 0;
157 } 169 }
158 root->log_multiple_pids = false; 170
159 root->log_start_pid = current->pid; 171 ret = 0;
160 mutex_lock(&root->fs_info->tree_log_mutex); 172 mutex_lock(&root->fs_info->tree_log_mutex);
161 if (!root->fs_info->log_root_tree) { 173 if (!root->fs_info->log_root_tree)
162 ret = btrfs_init_log_root_tree(trans, root->fs_info); 174 ret = btrfs_init_log_root_tree(trans, root->fs_info);
163 if (ret) 175 mutex_unlock(&root->fs_info->tree_log_mutex);
164 err = ret; 176 if (ret)
165 } 177 goto out;
166 if (err == 0 && !root->log_root) { 178
179 if (!root->log_root) {
167 ret = btrfs_add_log_tree(trans, root); 180 ret = btrfs_add_log_tree(trans, root);
168 if (ret) 181 if (ret)
169 err = ret; 182 goto out;
170 } 183 }
171 mutex_unlock(&root->fs_info->tree_log_mutex); 184 root->log_multiple_pids = false;
185 root->log_start_pid = current->pid;
172 atomic_inc(&root->log_batch); 186 atomic_inc(&root->log_batch);
173 atomic_inc(&root->log_writers); 187 atomic_inc(&root->log_writers);
188 if (ctx) {
189 index = root->log_transid % 2;
190 list_add_tail(&ctx->list, &root->log_ctxs[index]);
191 ctx->log_transid = root->log_transid;
192 }
193out:
174 mutex_unlock(&root->log_mutex); 194 mutex_unlock(&root->log_mutex);
175 return err; 195 return ret;
176} 196}
177 197
178/* 198/*
@@ -2359,8 +2379,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
2359 return ret; 2379 return ret;
2360} 2380}
2361 2381
2362static int wait_log_commit(struct btrfs_trans_handle *trans, 2382static void wait_log_commit(struct btrfs_trans_handle *trans,
2363 struct btrfs_root *root, unsigned long transid) 2383 struct btrfs_root *root, int transid)
2364{ 2384{
2365 DEFINE_WAIT(wait); 2385 DEFINE_WAIT(wait);
2366 int index = transid % 2; 2386 int index = transid % 2;
@@ -2375,36 +2395,63 @@ static int wait_log_commit(struct btrfs_trans_handle *trans,
2375 &wait, TASK_UNINTERRUPTIBLE); 2395 &wait, TASK_UNINTERRUPTIBLE);
2376 mutex_unlock(&root->log_mutex); 2396 mutex_unlock(&root->log_mutex);
2377 2397
2378 if (root->fs_info->last_trans_log_full_commit != 2398 if (root->log_transid_committed < transid &&
2379 trans->transid && root->log_transid < transid + 2 &&
2380 atomic_read(&root->log_commit[index])) 2399 atomic_read(&root->log_commit[index]))
2381 schedule(); 2400 schedule();
2382 2401
2383 finish_wait(&root->log_commit_wait[index], &wait); 2402 finish_wait(&root->log_commit_wait[index], &wait);
2384 mutex_lock(&root->log_mutex); 2403 mutex_lock(&root->log_mutex);
2385 } while (root->fs_info->last_trans_log_full_commit != 2404 } while (root->log_transid_committed < transid &&
2386 trans->transid && root->log_transid < transid + 2 &&
2387 atomic_read(&root->log_commit[index])); 2405 atomic_read(&root->log_commit[index]));
2388 return 0;
2389} 2406}
2390 2407
2391static void wait_for_writer(struct btrfs_trans_handle *trans, 2408static void wait_for_writer(struct btrfs_trans_handle *trans,
2392 struct btrfs_root *root) 2409 struct btrfs_root *root)
2393{ 2410{
2394 DEFINE_WAIT(wait); 2411 DEFINE_WAIT(wait);
2395 while (root->fs_info->last_trans_log_full_commit != 2412
2396 trans->transid && atomic_read(&root->log_writers)) { 2413 while (atomic_read(&root->log_writers)) {
2397 prepare_to_wait(&root->log_writer_wait, 2414 prepare_to_wait(&root->log_writer_wait,
2398 &wait, TASK_UNINTERRUPTIBLE); 2415 &wait, TASK_UNINTERRUPTIBLE);
2399 mutex_unlock(&root->log_mutex); 2416 mutex_unlock(&root->log_mutex);
2400 if (root->fs_info->last_trans_log_full_commit != 2417 if (atomic_read(&root->log_writers))
2401 trans->transid && atomic_read(&root->log_writers))
2402 schedule(); 2418 schedule();
2403 mutex_lock(&root->log_mutex); 2419 mutex_lock(&root->log_mutex);
2404 finish_wait(&root->log_writer_wait, &wait); 2420 finish_wait(&root->log_writer_wait, &wait);
2405 } 2421 }
2406} 2422}
2407 2423
2424static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2425 struct btrfs_log_ctx *ctx)
2426{
2427 if (!ctx)
2428 return;
2429
2430 mutex_lock(&root->log_mutex);
2431 list_del_init(&ctx->list);
2432 mutex_unlock(&root->log_mutex);
2433}
2434
2435/*
2436 * Invoked in log mutex context, or be sure there is no other task which
2437 * can access the list.
2438 */
2439static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2440 int index, int error)
2441{
2442 struct btrfs_log_ctx *ctx;
2443
2444 if (!error) {
2445 INIT_LIST_HEAD(&root->log_ctxs[index]);
2446 return;
2447 }
2448
2449 list_for_each_entry(ctx, &root->log_ctxs[index], list)
2450 ctx->log_ret = error;
2451
2452 INIT_LIST_HEAD(&root->log_ctxs[index]);
2453}
2454
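btrfs_remove_all_log_ctxs() is the error fan-out half of the new context machinery: on failure, every waiter registered for that log transid has its ctx->log_ret set before being woken. The consumer half, visible at the top of btrfs_sync_log() below, is roughly:

	mutex_lock(&root->log_mutex);
	if (root->log_transid_committed >= ctx->log_transid) {
		/* our transid was already committed (or failed); just
		 * pick up the result the committer fanned out */
		mutex_unlock(&root->log_mutex);
		return ctx->log_ret;
	}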
2408/* 2455/*
 2409 * btrfs_sync_log sends a given tree log down to the disk and 2456
2410 * updates the super blocks to record it. When this call is done, 2457 * updates the super blocks to record it. When this call is done,
@@ -2418,7 +2465,7 @@ static void wait_for_writer(struct btrfs_trans_handle *trans,
2418 * that has happened. 2465 * that has happened.
2419 */ 2466 */
2420int btrfs_sync_log(struct btrfs_trans_handle *trans, 2467int btrfs_sync_log(struct btrfs_trans_handle *trans,
2421 struct btrfs_root *root) 2468 struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2422{ 2469{
2423 int index1; 2470 int index1;
2424 int index2; 2471 int index2;
@@ -2426,22 +2473,30 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2426 int ret; 2473 int ret;
2427 struct btrfs_root *log = root->log_root; 2474 struct btrfs_root *log = root->log_root;
2428 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 2475 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
2429 unsigned long log_transid = 0; 2476 int log_transid = 0;
2477 struct btrfs_log_ctx root_log_ctx;
2430 struct blk_plug plug; 2478 struct blk_plug plug;
2431 2479
2432 mutex_lock(&root->log_mutex); 2480 mutex_lock(&root->log_mutex);
2433 log_transid = root->log_transid; 2481 log_transid = ctx->log_transid;
2434 index1 = root->log_transid % 2; 2482 if (root->log_transid_committed >= log_transid) {
2483 mutex_unlock(&root->log_mutex);
2484 return ctx->log_ret;
2485 }
2486
2487 index1 = log_transid % 2;
2435 if (atomic_read(&root->log_commit[index1])) { 2488 if (atomic_read(&root->log_commit[index1])) {
2436 wait_log_commit(trans, root, root->log_transid); 2489 wait_log_commit(trans, root, log_transid);
2437 mutex_unlock(&root->log_mutex); 2490 mutex_unlock(&root->log_mutex);
2438 return 0; 2491 return ctx->log_ret;
2439 } 2492 }
2493 ASSERT(log_transid == root->log_transid);
2440 atomic_set(&root->log_commit[index1], 1); 2494 atomic_set(&root->log_commit[index1], 1);
2441 2495
2442 /* wait for previous tree log sync to complete */ 2496 /* wait for previous tree log sync to complete */
2443 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2497 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2444 wait_log_commit(trans, root, root->log_transid - 1); 2498 wait_log_commit(trans, root, log_transid - 1);
2499
2445 while (1) { 2500 while (1) {
2446 int batch = atomic_read(&root->log_batch); 2501 int batch = atomic_read(&root->log_batch);
2447 /* when we're on an ssd, just kick the log commit out */ 2502 /* when we're on an ssd, just kick the log commit out */
@@ -2456,7 +2511,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2456 } 2511 }
2457 2512
2458 /* bail out if we need to do a full commit */ 2513 /* bail out if we need to do a full commit */
2459 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2514 if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
2515 trans->transid) {
2460 ret = -EAGAIN; 2516 ret = -EAGAIN;
2461 btrfs_free_logged_extents(log, log_transid); 2517 btrfs_free_logged_extents(log, log_transid);
2462 mutex_unlock(&root->log_mutex); 2518 mutex_unlock(&root->log_mutex);
@@ -2477,6 +2533,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2477 blk_finish_plug(&plug); 2533 blk_finish_plug(&plug);
2478 btrfs_abort_transaction(trans, root, ret); 2534 btrfs_abort_transaction(trans, root, ret);
2479 btrfs_free_logged_extents(log, log_transid); 2535 btrfs_free_logged_extents(log, log_transid);
2536 ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
2537 trans->transid;
2480 mutex_unlock(&root->log_mutex); 2538 mutex_unlock(&root->log_mutex);
2481 goto out; 2539 goto out;
2482 } 2540 }
@@ -2486,7 +2544,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2486 root->log_transid++; 2544 root->log_transid++;
2487 log->log_transid = root->log_transid; 2545 log->log_transid = root->log_transid;
2488 root->log_start_pid = 0; 2546 root->log_start_pid = 0;
2489 smp_mb();
2490 /* 2547 /*
2491 * IO has been started, blocks of the log tree have WRITTEN flag set 2548 * IO has been started, blocks of the log tree have WRITTEN flag set
2492 * in their headers. new modifications of the log will be written to 2549 * in their headers. new modifications of the log will be written to
@@ -2494,9 +2551,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2494 */ 2551 */
2495 mutex_unlock(&root->log_mutex); 2552 mutex_unlock(&root->log_mutex);
2496 2553
2554 btrfs_init_log_ctx(&root_log_ctx);
2555
2497 mutex_lock(&log_root_tree->log_mutex); 2556 mutex_lock(&log_root_tree->log_mutex);
2498 atomic_inc(&log_root_tree->log_batch); 2557 atomic_inc(&log_root_tree->log_batch);
2499 atomic_inc(&log_root_tree->log_writers); 2558 atomic_inc(&log_root_tree->log_writers);
2559
2560 index2 = log_root_tree->log_transid % 2;
2561 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
2562 root_log_ctx.log_transid = log_root_tree->log_transid;
2563
2500 mutex_unlock(&log_root_tree->log_mutex); 2564 mutex_unlock(&log_root_tree->log_mutex);
2501 2565
2502 ret = update_log_root(trans, log); 2566 ret = update_log_root(trans, log);
@@ -2509,13 +2573,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2509 } 2573 }
2510 2574
2511 if (ret) { 2575 if (ret) {
2576 if (!list_empty(&root_log_ctx.list))
2577 list_del_init(&root_log_ctx.list);
2578
2512 blk_finish_plug(&plug); 2579 blk_finish_plug(&plug);
2580 ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
2581 trans->transid;
2513 if (ret != -ENOSPC) { 2582 if (ret != -ENOSPC) {
2514 btrfs_abort_transaction(trans, root, ret); 2583 btrfs_abort_transaction(trans, root, ret);
2515 mutex_unlock(&log_root_tree->log_mutex); 2584 mutex_unlock(&log_root_tree->log_mutex);
2516 goto out; 2585 goto out;
2517 } 2586 }
2518 root->fs_info->last_trans_log_full_commit = trans->transid;
2519 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2587 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2520 btrfs_free_logged_extents(log, log_transid); 2588 btrfs_free_logged_extents(log, log_transid);
2521 mutex_unlock(&log_root_tree->log_mutex); 2589 mutex_unlock(&log_root_tree->log_mutex);
@@ -2523,22 +2591,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2523 goto out; 2591 goto out;
2524 } 2592 }
2525 2593
2526 index2 = log_root_tree->log_transid % 2; 2594 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
2595 mutex_unlock(&log_root_tree->log_mutex);
2596 ret = root_log_ctx.log_ret;
2597 goto out;
2598 }
2599
2600 index2 = root_log_ctx.log_transid % 2;
2527 if (atomic_read(&log_root_tree->log_commit[index2])) { 2601 if (atomic_read(&log_root_tree->log_commit[index2])) {
2528 blk_finish_plug(&plug); 2602 blk_finish_plug(&plug);
2529 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2603 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2530 wait_log_commit(trans, log_root_tree, 2604 wait_log_commit(trans, log_root_tree,
2531 log_root_tree->log_transid); 2605 root_log_ctx.log_transid);
2532 btrfs_free_logged_extents(log, log_transid); 2606 btrfs_free_logged_extents(log, log_transid);
2533 mutex_unlock(&log_root_tree->log_mutex); 2607 mutex_unlock(&log_root_tree->log_mutex);
2534 ret = 0; 2608 ret = root_log_ctx.log_ret;
2535 goto out; 2609 goto out;
2536 } 2610 }
2611 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
2537 atomic_set(&log_root_tree->log_commit[index2], 1); 2612 atomic_set(&log_root_tree->log_commit[index2], 1);
2538 2613
2539 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 2614 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
2540 wait_log_commit(trans, log_root_tree, 2615 wait_log_commit(trans, log_root_tree,
2541 log_root_tree->log_transid - 1); 2616 root_log_ctx.log_transid - 1);
2542 } 2617 }
2543 2618
2544 wait_for_writer(trans, log_root_tree); 2619 wait_for_writer(trans, log_root_tree);
@@ -2547,7 +2622,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2547 * now that we've moved on to the tree of log tree roots, 2622 * now that we've moved on to the tree of log tree roots,
2548 * check the full commit flag again 2623 * check the full commit flag again
2549 */ 2624 */
2550 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2625 if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
2626 trans->transid) {
2551 blk_finish_plug(&plug); 2627 blk_finish_plug(&plug);
2552 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2628 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2553 btrfs_free_logged_extents(log, log_transid); 2629 btrfs_free_logged_extents(log, log_transid);
@@ -2561,6 +2637,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2561 EXTENT_DIRTY | EXTENT_NEW); 2637 EXTENT_DIRTY | EXTENT_NEW);
2562 blk_finish_plug(&plug); 2638 blk_finish_plug(&plug);
2563 if (ret) { 2639 if (ret) {
2640 ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
2641 trans->transid;
2564 btrfs_abort_transaction(trans, root, ret); 2642 btrfs_abort_transaction(trans, root, ret);
2565 btrfs_free_logged_extents(log, log_transid); 2643 btrfs_free_logged_extents(log, log_transid);
2566 mutex_unlock(&log_root_tree->log_mutex); 2644 mutex_unlock(&log_root_tree->log_mutex);
@@ -2578,8 +2656,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2578 btrfs_header_level(log_root_tree->node)); 2656 btrfs_header_level(log_root_tree->node));
2579 2657
2580 log_root_tree->log_transid++; 2658 log_root_tree->log_transid++;
2581 smp_mb();
2582
2583 mutex_unlock(&log_root_tree->log_mutex); 2659 mutex_unlock(&log_root_tree->log_mutex);
2584 2660
2585 /* 2661 /*
@@ -2591,6 +2667,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2591 */ 2667 */
2592 ret = write_ctree_super(trans, root->fs_info->tree_root, 1); 2668 ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
2593 if (ret) { 2669 if (ret) {
2670 ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
2671 trans->transid;
2594 btrfs_abort_transaction(trans, root, ret); 2672 btrfs_abort_transaction(trans, root, ret);
2595 goto out_wake_log_root; 2673 goto out_wake_log_root;
2596 } 2674 }
@@ -2601,13 +2679,28 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2601 mutex_unlock(&root->log_mutex); 2679 mutex_unlock(&root->log_mutex);
2602 2680
2603out_wake_log_root: 2681out_wake_log_root:
2682 /*
 2683 * We don't need to take log_mutex here because we know all
 2684 * the other tasks are blocked.
2685 */
2686 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
2687
2688 mutex_lock(&log_root_tree->log_mutex);
2689 log_root_tree->log_transid_committed++;
2604 atomic_set(&log_root_tree->log_commit[index2], 0); 2690 atomic_set(&log_root_tree->log_commit[index2], 0);
2605 smp_mb(); 2691 mutex_unlock(&log_root_tree->log_mutex);
2692
2606 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2693 if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
2607 wake_up(&log_root_tree->log_commit_wait[index2]); 2694 wake_up(&log_root_tree->log_commit_wait[index2]);
2608out: 2695out:
2696 /* See above. */
2697 btrfs_remove_all_log_ctxs(root, index1, ret);
2698
2699 mutex_lock(&root->log_mutex);
2700 root->log_transid_committed++;
2609 atomic_set(&root->log_commit[index1], 0); 2701 atomic_set(&root->log_commit[index1], 0);
2610 smp_mb(); 2702 mutex_unlock(&root->log_mutex);
2703
2611 if (waitqueue_active(&root->log_commit_wait[index1])) 2704 if (waitqueue_active(&root->log_commit_wait[index1]))
2612 wake_up(&root->log_commit_wait[index1]); 2705 wake_up(&root->log_commit_wait[index1]);
2613 return ret; 2706 return ret;
@@ -3479,7 +3572,8 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3479 3572
3480static int log_one_extent(struct btrfs_trans_handle *trans, 3573static int log_one_extent(struct btrfs_trans_handle *trans,
3481 struct inode *inode, struct btrfs_root *root, 3574 struct inode *inode, struct btrfs_root *root,
3482 struct extent_map *em, struct btrfs_path *path) 3575 struct extent_map *em, struct btrfs_path *path,
3576 struct list_head *logged_list)
3483{ 3577{
3484 struct btrfs_root *log = root->log_root; 3578 struct btrfs_root *log = root->log_root;
3485 struct btrfs_file_extent_item *fi; 3579 struct btrfs_file_extent_item *fi;
@@ -3495,7 +3589,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3495 u64 extent_offset = em->start - em->orig_start; 3589 u64 extent_offset = em->start - em->orig_start;
3496 u64 block_len; 3590 u64 block_len;
3497 int ret; 3591 int ret;
3498 int index = log->log_transid % 2;
3499 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3592 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3500 int extent_inserted = 0; 3593 int extent_inserted = 0;
3501 3594
@@ -3579,17 +3672,12 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3579 * First check and see if our csums are on our outstanding ordered 3672 * First check and see if our csums are on our outstanding ordered
3580 * extents. 3673 * extents.
3581 */ 3674 */
3582again: 3675 list_for_each_entry(ordered, logged_list, log_list) {
3583 spin_lock_irq(&log->log_extents_lock[index]);
3584 list_for_each_entry(ordered, &log->logged_list[index], log_list) {
3585 struct btrfs_ordered_sum *sum; 3676 struct btrfs_ordered_sum *sum;
3586 3677
3587 if (!mod_len) 3678 if (!mod_len)
3588 break; 3679 break;
3589 3680
3590 if (ordered->inode != inode)
3591 continue;
3592
3593 if (ordered->file_offset + ordered->len <= mod_start || 3681 if (ordered->file_offset + ordered->len <= mod_start ||
3594 mod_start + mod_len <= ordered->file_offset) 3682 mod_start + mod_len <= ordered->file_offset)
3595 continue; 3683 continue;
@@ -3632,12 +3720,6 @@ again:
3632 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, 3720 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
3633 &ordered->flags)) 3721 &ordered->flags))
3634 continue; 3722 continue;
3635 atomic_inc(&ordered->refs);
3636 spin_unlock_irq(&log->log_extents_lock[index]);
3637 /*
3638 * we've dropped the lock, we must either break or
3639 * start over after this.
3640 */
3641 3723
3642 if (ordered->csum_bytes_left) { 3724 if (ordered->csum_bytes_left) {
3643 btrfs_start_ordered_extent(inode, ordered, 0); 3725 btrfs_start_ordered_extent(inode, ordered, 0);
@@ -3647,16 +3729,11 @@ again:
3647 3729
3648 list_for_each_entry(sum, &ordered->list, list) { 3730 list_for_each_entry(sum, &ordered->list, list) {
3649 ret = btrfs_csum_file_blocks(trans, log, sum); 3731 ret = btrfs_csum_file_blocks(trans, log, sum);
3650 if (ret) { 3732 if (ret)
3651 btrfs_put_ordered_extent(ordered);
3652 goto unlocked; 3733 goto unlocked;
3653 }
3654 } 3734 }
3655 btrfs_put_ordered_extent(ordered);
3656 goto again;
3657 3735
3658 } 3736 }
3659 spin_unlock_irq(&log->log_extents_lock[index]);
3660unlocked: 3737unlocked:
3661 3738
3662 if (!mod_len || ret) 3739 if (!mod_len || ret)
@@ -3694,7 +3771,8 @@ unlocked:
3694static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 3771static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3695 struct btrfs_root *root, 3772 struct btrfs_root *root,
3696 struct inode *inode, 3773 struct inode *inode,
3697 struct btrfs_path *path) 3774 struct btrfs_path *path,
3775 struct list_head *logged_list)
3698{ 3776{
3699 struct extent_map *em, *n; 3777 struct extent_map *em, *n;
3700 struct list_head extents; 3778 struct list_head extents;
@@ -3752,7 +3830,7 @@ process:
3752 3830
3753 write_unlock(&tree->lock); 3831 write_unlock(&tree->lock);
3754 3832
3755 ret = log_one_extent(trans, inode, root, em, path); 3833 ret = log_one_extent(trans, inode, root, em, path, logged_list);
3756 write_lock(&tree->lock); 3834 write_lock(&tree->lock);
3757 clear_em_logging(tree, em); 3835 clear_em_logging(tree, em);
3758 free_extent_map(em); 3836 free_extent_map(em);
@@ -3788,6 +3866,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3788 struct btrfs_key max_key; 3866 struct btrfs_key max_key;
3789 struct btrfs_root *log = root->log_root; 3867 struct btrfs_root *log = root->log_root;
3790 struct extent_buffer *src = NULL; 3868 struct extent_buffer *src = NULL;
3869 LIST_HEAD(logged_list);
3791 u64 last_extent = 0; 3870 u64 last_extent = 0;
3792 int err = 0; 3871 int err = 0;
3793 int ret; 3872 int ret;
@@ -3836,7 +3915,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3836 3915
3837 mutex_lock(&BTRFS_I(inode)->log_mutex); 3916 mutex_lock(&BTRFS_I(inode)->log_mutex);
3838 3917
3839 btrfs_get_logged_extents(log, inode); 3918 btrfs_get_logged_extents(inode, &logged_list);
3840 3919
3841 /* 3920 /*
3842 * a brute force approach to making sure we get the most uptodate 3921 * a brute force approach to making sure we get the most uptodate
@@ -3962,7 +4041,8 @@ log_extents:
3962 btrfs_release_path(path); 4041 btrfs_release_path(path);
3963 btrfs_release_path(dst_path); 4042 btrfs_release_path(dst_path);
3964 if (fast_search) { 4043 if (fast_search) {
3965 ret = btrfs_log_changed_extents(trans, root, inode, dst_path); 4044 ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
4045 &logged_list);
3966 if (ret) { 4046 if (ret) {
3967 err = ret; 4047 err = ret;
3968 goto out_unlock; 4048 goto out_unlock;
@@ -3987,8 +4067,10 @@ log_extents:
3987 BTRFS_I(inode)->logged_trans = trans->transid; 4067 BTRFS_I(inode)->logged_trans = trans->transid;
3988 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; 4068 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
3989out_unlock: 4069out_unlock:
3990 if (err) 4070 if (unlikely(err))
3991 btrfs_free_logged_extents(log, log->log_transid); 4071 btrfs_put_logged_extents(&logged_list);
4072 else
4073 btrfs_submit_logged_extents(&logged_list, log);
3992 mutex_unlock(&BTRFS_I(inode)->log_mutex); 4074 mutex_unlock(&BTRFS_I(inode)->log_mutex);
3993 4075
3994 btrfs_free_path(path); 4076 btrfs_free_path(path);
@@ -4079,7 +4161,8 @@ out:
4079 */ 4161 */
4080static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 4162static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4081 struct btrfs_root *root, struct inode *inode, 4163 struct btrfs_root *root, struct inode *inode,
4082 struct dentry *parent, int exists_only) 4164 struct dentry *parent, int exists_only,
4165 struct btrfs_log_ctx *ctx)
4083{ 4166{
4084 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 4167 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
4085 struct super_block *sb; 4168 struct super_block *sb;
@@ -4116,9 +4199,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4116 goto end_no_trans; 4199 goto end_no_trans;
4117 } 4200 }
4118 4201
4119 ret = start_log_trans(trans, root); 4202 ret = start_log_trans(trans, root, ctx);
4120 if (ret) 4203 if (ret)
4121 goto end_trans; 4204 goto end_no_trans;
4122 4205
4123 ret = btrfs_log_inode(trans, root, inode, inode_only); 4206 ret = btrfs_log_inode(trans, root, inode, inode_only);
4124 if (ret) 4207 if (ret)
@@ -4166,6 +4249,9 @@ end_trans:
4166 root->fs_info->last_trans_log_full_commit = trans->transid; 4249 root->fs_info->last_trans_log_full_commit = trans->transid;
4167 ret = 1; 4250 ret = 1;
4168 } 4251 }
4252
4253 if (ret)
4254 btrfs_remove_log_ctx(root, ctx);
4169 btrfs_end_log_trans(root); 4255 btrfs_end_log_trans(root);
4170end_no_trans: 4256end_no_trans:
4171 return ret; 4257 return ret;
@@ -4178,12 +4264,14 @@ end_no_trans:
4178 * data on disk. 4264 * data on disk.
4179 */ 4265 */
4180int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 4266int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
4181 struct btrfs_root *root, struct dentry *dentry) 4267 struct btrfs_root *root, struct dentry *dentry,
4268 struct btrfs_log_ctx *ctx)
4182{ 4269{
4183 struct dentry *parent = dget_parent(dentry); 4270 struct dentry *parent = dget_parent(dentry);
4184 int ret; 4271 int ret;
4185 4272
4186 ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0); 4273 ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
4274 0, ctx);
4187 dput(parent); 4275 dput(parent);
4188 4276
4189 return ret; 4277 return ret;
@@ -4420,6 +4508,6 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
4420 root->fs_info->last_trans_committed)) 4508 root->fs_info->last_trans_committed))
4421 return 0; 4509 return 0;
4422 4510
4423 return btrfs_log_inode_parent(trans, root, inode, parent, 1); 4511 return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL);
4424} 4512}
4425 4513
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 1d4ae0d15a70..91b145fce333 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -22,14 +22,28 @@
22/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ 22/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
23#define BTRFS_NO_LOG_SYNC 256 23#define BTRFS_NO_LOG_SYNC 256
24 24
25struct btrfs_log_ctx {
26 int log_ret;
27 int log_transid;
28 struct list_head list;
29};
30
31static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
32{
33 ctx->log_ret = 0;
34 ctx->log_transid = 0;
35 INIT_LIST_HEAD(&ctx->list);
36}
37
25int btrfs_sync_log(struct btrfs_trans_handle *trans, 38int btrfs_sync_log(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root); 39 struct btrfs_root *root, struct btrfs_log_ctx *ctx);
27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 40int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
28int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 41int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
29 struct btrfs_fs_info *fs_info); 42 struct btrfs_fs_info *fs_info);
30int btrfs_recover_log_trees(struct btrfs_root *tree_root); 43int btrfs_recover_log_trees(struct btrfs_root *tree_root);
31int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 44int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
32 struct btrfs_root *root, struct dentry *dentry); 45 struct btrfs_root *root, struct dentry *dentry,
46 struct btrfs_log_ctx *ctx);
33int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 47int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
34 struct btrfs_root *root, 48 struct btrfs_root *root,
35 const char *name, int name_len, 49 const char *name, int name_len,
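Taken together, the header changes define the caller-side protocol for the new log context. A sketch of an fsync-style caller (btrfs_sync_file() is the natural user, though it is outside this section):

	struct btrfs_log_ctx ctx;
	int ret;

	btrfs_init_log_ctx(&ctx);
	ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
	if (ret == 0)
		/* waits for, or performs, the commit of ctx.log_transid
		 * and returns the result shared through ctx.log_ret */
		ret = btrfs_sync_log(trans, root, &ctx);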
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bab0b84d8f80..49d7fab73360 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -415,7 +415,8 @@ loop_lock:
415 device->running_pending = 1; 415 device->running_pending = 1;
416 416
417 spin_unlock(&device->io_lock); 417 spin_unlock(&device->io_lock);
418 btrfs_requeue_work(&device->work); 418 btrfs_queue_work(fs_info->submit_workers,
419 &device->work);
419 goto done; 420 goto done;
420 } 421 }
421 /* unplug every 64 requests just for good measure */ 422 /* unplug every 64 requests just for good measure */
@@ -447,6 +448,14 @@ static void pending_bios_fn(struct btrfs_work *work)
447 run_scheduled_bios(device); 448 run_scheduled_bios(device);
448} 449}
449 450
451/*
452 * Add new device to list of registered devices
453 *
454 * Returns:
455 * 1 - first time device is seen
456 * 0 - device already known
457 * < 0 - error
458 */
450static noinline int device_list_add(const char *path, 459static noinline int device_list_add(const char *path,
451 struct btrfs_super_block *disk_super, 460 struct btrfs_super_block *disk_super,
452 u64 devid, struct btrfs_fs_devices **fs_devices_ret) 461 u64 devid, struct btrfs_fs_devices **fs_devices_ret)
@@ -454,6 +463,7 @@ static noinline int device_list_add(const char *path,
454 struct btrfs_device *device; 463 struct btrfs_device *device;
455 struct btrfs_fs_devices *fs_devices; 464 struct btrfs_fs_devices *fs_devices;
456 struct rcu_string *name; 465 struct rcu_string *name;
466 int ret = 0;
457 u64 found_transid = btrfs_super_generation(disk_super); 467 u64 found_transid = btrfs_super_generation(disk_super);
458 468
459 fs_devices = find_fsid(disk_super->fsid); 469 fs_devices = find_fsid(disk_super->fsid);
@@ -494,6 +504,7 @@ static noinline int device_list_add(const char *path,
494 fs_devices->num_devices++; 504 fs_devices->num_devices++;
495 mutex_unlock(&fs_devices->device_list_mutex); 505 mutex_unlock(&fs_devices->device_list_mutex);
496 506
507 ret = 1;
497 device->fs_devices = fs_devices; 508 device->fs_devices = fs_devices;
498 } else if (!device->name || strcmp(device->name->str, path)) { 509 } else if (!device->name || strcmp(device->name->str, path)) {
499 name = rcu_string_strdup(path, GFP_NOFS); 510 name = rcu_string_strdup(path, GFP_NOFS);
@@ -512,7 +523,8 @@ static noinline int device_list_add(const char *path,
512 fs_devices->latest_trans = found_transid; 523 fs_devices->latest_trans = found_transid;
513 } 524 }
514 *fs_devices_ret = fs_devices; 525 *fs_devices_ret = fs_devices;
515 return 0; 526
527 return ret;
516} 528}
517 529
518static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 530static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
@@ -909,17 +921,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
909 transid = btrfs_super_generation(disk_super); 921 transid = btrfs_super_generation(disk_super);
910 total_devices = btrfs_super_num_devices(disk_super); 922 total_devices = btrfs_super_num_devices(disk_super);
911 923
912 if (disk_super->label[0]) {
913 if (disk_super->label[BTRFS_LABEL_SIZE - 1])
914 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
915 printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
916 } else {
917 printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
918 }
919
920 printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
921
922 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 924 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
925 if (ret > 0) {
926 if (disk_super->label[0]) {
927 if (disk_super->label[BTRFS_LABEL_SIZE - 1])
928 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
929 printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
930 } else {
931 printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
932 }
933
934 printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
935 ret = 0;
936 }
923 if (!ret && fs_devices_ret) 937 if (!ret && fs_devices_ret)
924 (*fs_devices_ret)->total_devices = total_devices; 938 (*fs_devices_ret)->total_devices = total_devices;
925 939
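The effect of the tri-state return is that the device label/fsid banner above is now printed only the first time a device is registered (ret == 1), rather than on every rescan of an already-known device.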
@@ -5263,6 +5277,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
5263static void btrfs_end_bio(struct bio *bio, int err) 5277static void btrfs_end_bio(struct bio *bio, int err)
5264{ 5278{
5265 struct btrfs_bio *bbio = bio->bi_private; 5279 struct btrfs_bio *bbio = bio->bi_private;
5280 struct btrfs_device *dev = bbio->stripes[0].dev;
5266 int is_orig_bio = 0; 5281 int is_orig_bio = 0;
5267 5282
5268 if (err) { 5283 if (err) {
@@ -5270,7 +5285,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
5270 if (err == -EIO || err == -EREMOTEIO) { 5285 if (err == -EIO || err == -EREMOTEIO) {
5271 unsigned int stripe_index = 5286 unsigned int stripe_index =
5272 btrfs_io_bio(bio)->stripe_index; 5287 btrfs_io_bio(bio)->stripe_index;
5273 struct btrfs_device *dev;
5274 5288
5275 BUG_ON(stripe_index >= bbio->num_stripes); 5289 BUG_ON(stripe_index >= bbio->num_stripes);
5276 dev = bbio->stripes[stripe_index].dev; 5290 dev = bbio->stripes[stripe_index].dev;
@@ -5292,6 +5306,8 @@ static void btrfs_end_bio(struct bio *bio, int err)
5292 if (bio == bbio->orig_bio) 5306 if (bio == bbio->orig_bio)
5293 is_orig_bio = 1; 5307 is_orig_bio = 1;
5294 5308
5309 btrfs_bio_counter_dec(bbio->fs_info);
5310
5295 if (atomic_dec_and_test(&bbio->stripes_pending)) { 5311 if (atomic_dec_and_test(&bbio->stripes_pending)) {
5296 if (!is_orig_bio) { 5312 if (!is_orig_bio) {
5297 bio_put(bio); 5313 bio_put(bio);
@@ -5328,13 +5344,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
5328 } 5344 }
5329} 5345}
5330 5346
5331struct async_sched {
5332 struct bio *bio;
5333 int rw;
5334 struct btrfs_fs_info *info;
5335 struct btrfs_work work;
5336};
5337
5338/* 5347/*
5339 * see run_scheduled_bios for a description of why bios are collected for 5348 * see run_scheduled_bios for a description of why bios are collected for
5340 * async submit. 5349 * async submit.
@@ -5391,8 +5400,8 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
5391 spin_unlock(&device->io_lock); 5400 spin_unlock(&device->io_lock);
5392 5401
5393 if (should_queue) 5402 if (should_queue)
5394 btrfs_queue_worker(&root->fs_info->submit_workers, 5403 btrfs_queue_work(root->fs_info->submit_workers,
5395 &device->work); 5404 &device->work);
5396} 5405}
5397 5406
5398static int bio_size_ok(struct block_device *bdev, struct bio *bio, 5407static int bio_size_ok(struct block_device *bdev, struct bio *bio,
@@ -5447,6 +5456,9 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
5447 } 5456 }
5448#endif 5457#endif
5449 bio->bi_bdev = dev->bdev; 5458 bio->bi_bdev = dev->bdev;
5459
5460 btrfs_bio_counter_inc_noblocked(root->fs_info);
5461
5450 if (async) 5462 if (async)
5451 btrfs_schedule_bio(root, dev, rw, bio); 5463 btrfs_schedule_bio(root, dev, rw, bio);
5452 else 5464 else
@@ -5515,28 +5527,38 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5515 length = bio->bi_iter.bi_size; 5527 length = bio->bi_iter.bi_size;
5516 map_length = length; 5528 map_length = length;
5517 5529
5530 btrfs_bio_counter_inc_blocked(root->fs_info);
5518 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 5531 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
5519 mirror_num, &raid_map); 5532 mirror_num, &raid_map);
5520 if (ret) /* -ENOMEM */ 5533 if (ret) {
5534 btrfs_bio_counter_dec(root->fs_info);
5521 return ret; 5535 return ret;
5536 }
5522 5537
5523 total_devs = bbio->num_stripes; 5538 total_devs = bbio->num_stripes;
5524 bbio->orig_bio = first_bio; 5539 bbio->orig_bio = first_bio;
5525 bbio->private = first_bio->bi_private; 5540 bbio->private = first_bio->bi_private;
5526 bbio->end_io = first_bio->bi_end_io; 5541 bbio->end_io = first_bio->bi_end_io;
5542 bbio->fs_info = root->fs_info;
5527 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 5543 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
5528 5544
5529 if (raid_map) { 5545 if (raid_map) {
5530 /* In this case, map_length has been set to the length of 5546 /* In this case, map_length has been set to the length of
5531 a single stripe; not the whole write */ 5547 a single stripe; not the whole write */
5532 if (rw & WRITE) { 5548 if (rw & WRITE) {
5533 return raid56_parity_write(root, bio, bbio, 5549 ret = raid56_parity_write(root, bio, bbio,
5534 raid_map, map_length); 5550 raid_map, map_length);
5535 } else { 5551 } else {
5536 return raid56_parity_recover(root, bio, bbio, 5552 ret = raid56_parity_recover(root, bio, bbio,
5537 raid_map, map_length, 5553 raid_map, map_length,
5538 mirror_num); 5554 mirror_num);
5539 } 5555 }
5556 /*
5557 * FIXME, replace dosen't support raid56 yet, please fix
5558 * it in the future.
5559 */
5560 btrfs_bio_counter_dec(root->fs_info);
5561 return ret;
5540 } 5562 }
5541 5563
5542 if (map_length < length) { 5564 if (map_length < length) {
@@ -5578,6 +5600,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5578 async_submit); 5600 async_submit);
5579 dev_nr++; 5601 dev_nr++;
5580 } 5602 }
5603 btrfs_bio_counter_dec(root->fs_info);
5581 return 0; 5604 return 0;
5582} 5605}
5583 5606
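
Taken together, the btrfs_map_bio() and btrfs_end_bio() hunks establish a counting discipline around in-flight bios: one blocking increment (btrfs_bio_counter_inc_blocked) before the block mapping, one non-blocking increment (btrfs_bio_counter_inc_noblocked) per stripe bio submitted, and a matching decrement in every completion plus one when btrfs_map_bio() itself finishes. Below is a minimal userspace sketch of that pattern, assuming its purpose is to let an operation such as device replace gate new I/O and drain what is already in flight (the dev-replace side is not shown in these hunks); the pthread-based names and semantics are illustrative, not the kernel's implementation.

#include <pthread.h>

struct bio_counter {
	pthread_mutex_t lock;
	pthread_cond_t  wait;
	long            count;	/* in-flight bios */
	int             gate;	/* nonzero while new submitters must wait */
};

static struct bio_counter demo_counter = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
	.wait = PTHREAD_COND_INITIALIZER,
};

/* Models the assumed semantics of the "blocked" increment: wait out
 * the gate, then account one in-flight submission. */
static void counter_inc_blocked(struct bio_counter *c)
{
	pthread_mutex_lock(&c->lock);
	while (c->gate)
		pthread_cond_wait(&c->wait, &c->lock);
	c->count++;
	pthread_mutex_unlock(&c->lock);
}

/* Models the "noblocked" increment: a stripe bio belongs to a
 * submission that already passed the gate, so blocking here could
 * deadlock the drain. */
static void counter_inc_noblocked(struct bio_counter *c)
{
	pthread_mutex_lock(&c->lock);
	c->count++;
	pthread_mutex_unlock(&c->lock);
}

/* Models the decrement in the completion path: drop one reference
 * and wake any thread waiting for the count to drain. */
static void counter_dec(struct bio_counter *c)
{
	pthread_mutex_lock(&c->lock);
	c->count--;
	pthread_cond_broadcast(&c->wait);
	pthread_mutex_unlock(&c->lock);
}

/* What a device-replace-style operation would presumably do: close
 * the gate, then wait until everything in flight has completed. */
static void counter_drain(struct bio_counter *c)
{
	pthread_mutex_lock(&c->lock);
	c->gate = 1;
	while (c->count > 0)
		pthread_cond_wait(&c->wait, &c->lock);
	pthread_mutex_unlock(&c->lock);
}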
@@ -5666,7 +5689,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
5666 else 5689 else
5667 generate_random_uuid(dev->uuid); 5690 generate_random_uuid(dev->uuid);
5668 5691
5669 dev->work.func = pending_bios_fn; 5692 btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL);
5670 5693
5671 return dev; 5694 return dev;
5672} 5695}
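
The last volumes.c hunk converts the per-device work item from a bare function-pointer assignment to the new helpers used throughout this series: btrfs_init_work() binds the callback once at allocation time, and btrfs_queue_work() (seen in the btrfs_schedule_bio hunk above) enqueues the item when pending bios appear. A hedged userspace sketch of that init-once/queue-on-demand shape; the one-thread-per-item "pool" is a crude stand-in for a real workqueue, and all demo_* names are invented.

#include <pthread.h>
#include <unistd.h>

struct demo_work {
	void (*func)(struct demo_work *work);
};

/* Analogue of btrfs_init_work(): bind the callback at allocation
 * time (the ordered_func/ordered_free slots of the real helper are
 * omitted, since this call site passes NULL for both). */
static void demo_init_work(struct demo_work *work,
			   void (*func)(struct demo_work *))
{
	work->func = func;
}

static void *demo_worker(void *arg)
{
	struct demo_work *work = arg;

	work->func(work);
	return NULL;
}

/* Analogue of btrfs_queue_work(): hand the item to the pool. */
static void demo_queue_work(struct demo_work *work)
{
	pthread_t t;

	pthread_create(&t, NULL, demo_worker, work);
	pthread_detach(t);
}

static void demo_pending_bios_fn(struct demo_work *work)
{
	(void)work;	/* a real callback would drain the device's bios */
}

int main(void)
{
	static struct demo_work work;	/* must outlive the worker */

	demo_init_work(&work, demo_pending_bios_fn);
	demo_queue_work(&work);
	sleep(1);			/* crude: let the worker finish */
	return 0;
}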
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 8b3cd142b373..80754f9dd3df 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -192,6 +192,7 @@ typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
192 192
193struct btrfs_bio { 193struct btrfs_bio {
194 atomic_t stripes_pending; 194 atomic_t stripes_pending;
195 struct btrfs_fs_info *fs_info;
195 bio_end_io_t *end_io; 196 bio_end_io_t *end_io;
196 struct bio *orig_bio; 197 struct bio *orig_bio;
197 void *private; 198 void *private;
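
The volumes.h change is the structural half of the counter: struct btrfs_bio gains a fs_info back-pointer, set in btrfs_map_bio(), because btrfs_end_bio() receives only the bbio and still has to find the counter it must decrement. An illustrative, non-kernel fragment of that back-pointer pattern:

#include <stdatomic.h>

struct demo_fs_info {
	atomic_long bio_counter;	/* in-flight bios, shared state */
};

struct demo_bbio {
	struct demo_fs_info *fs_info;	/* back-pointer, set at map time */
	/* ... stripes, pending count, original bio ... */
};

/* The completion handler gets only the bbio, so the shared counter
 * must be reachable through it -- the point of the new field. */
static void demo_end_io(struct demo_bbio *bbio)
{
	atomic_fetch_sub(&bbio->fs_info->bio_counter, 1);
}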