aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2009-10-02 19:11:56 -0400
committerChris Mason <chris.mason@oracle.com>2009-10-05 09:44:45 -0400
commit61d92c328c16419fc96dc50dd16f8b8c695409ec (patch)
treee9cd82eb56ff5f38f64d9f35229d15496e5d53de
parentfbf190874407f23d2891b53ffdf7d3c6be8d47ff (diff)
Btrfs: fix deadlock on async thread startup
The btrfs async worker threads are used for a wide variety of things, including processing bio end_io functions. This means that when the endio threads aren't running, the rest of the FS isn't able to do the final processing required to clear PageWriteback. The endio threads also try to exit as they become idle and start more as the work piles up. The problem is that starting more threads means kthreadd may need to allocate ram, and that allocation may wait until the global number of writeback pages on the system is below a certain limit. The result of that throttling is that end IO threads wait on kthreadd, who is waiting on IO to end, which will never happen. This commit fixes the deadlock by handing off thread startup to a dedicated thread. It also fixes a bug where the on-demand thread creation was creating far too many threads because it didn't take into account threads being started by other procs. Signed-off-by: Chris Mason <chris.mason@oracle.com>
-rw-r--r--fs/btrfs/async-thread.c81
-rw-r--r--fs/btrfs/async-thread.h10
-rw-r--r--fs/btrfs/ctree.h1
-rw-r--r--fs/btrfs/disk-io.c42
-rw-r--r--fs/btrfs/relocation.c4
5 files changed, 106 insertions, 32 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 282ca085c2fb..c0861e781cdb 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -64,6 +64,51 @@ struct btrfs_worker_thread {
64}; 64};
65 65
66/* 66/*
67 * btrfs_start_workers uses kthread_run, which can block waiting for memory
68 * for a very long time. It will actually throttle on page writeback,
69 * and so it may not make progress until after our btrfs worker threads
70 * process all of the pending work structs in their queue
71 *
72 * This means we can't use btrfs_start_workers from inside a btrfs worker
73 * thread that is used as part of cleaning dirty memory, which pretty much
74 * involves all of the worker threads.
75 *
76 * Instead we have a helper queue who never has more than one thread
77 * where we scheduler thread start operations. This worker_start struct
78 * is used to contain the work and hold a pointer to the queue that needs
79 * another worker.
80 */
81struct worker_start {
82 struct btrfs_work work;
83 struct btrfs_workers *queue;
84};
85
86static void start_new_worker_func(struct btrfs_work *work)
87{
88 struct worker_start *start;
89 start = container_of(work, struct worker_start, work);
90 btrfs_start_workers(start->queue, 1);
91 kfree(start);
92}
93
94static int start_new_worker(struct btrfs_workers *queue)
95{
96 struct worker_start *start;
97 int ret;
98
99 start = kzalloc(sizeof(*start), GFP_NOFS);
100 if (!start)
101 return -ENOMEM;
102
103 start->work.func = start_new_worker_func;
104 start->queue = queue;
105 ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
106 if (ret)
107 kfree(start);
108 return ret;
109}
110
111/*
67 * helper function to move a thread onto the idle list after it 112 * helper function to move a thread onto the idle list after it
68 * has finished some requests. 113 * has finished some requests.
69 */ 114 */
@@ -118,11 +163,13 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
118 goto out; 163 goto out;
119 164
120 workers->atomic_start_pending = 0; 165 workers->atomic_start_pending = 0;
121 if (workers->num_workers >= workers->max_workers) 166 if (workers->num_workers + workers->num_workers_starting >=
167 workers->max_workers)
122 goto out; 168 goto out;
123 169
170 workers->num_workers_starting += 1;
124 spin_unlock_irqrestore(&workers->lock, flags); 171 spin_unlock_irqrestore(&workers->lock, flags);
125 btrfs_start_workers(workers, 1); 172 start_new_worker(workers);
126 return; 173 return;
127 174
128out: 175out:
@@ -390,9 +437,11 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
390/* 437/*
391 * simple init on struct btrfs_workers 438 * simple init on struct btrfs_workers
392 */ 439 */
393void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) 440void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
441 struct btrfs_workers *async_helper)
394{ 442{
395 workers->num_workers = 0; 443 workers->num_workers = 0;
444 workers->num_workers_starting = 0;
396 INIT_LIST_HEAD(&workers->worker_list); 445 INIT_LIST_HEAD(&workers->worker_list);
397 INIT_LIST_HEAD(&workers->idle_list); 446 INIT_LIST_HEAD(&workers->idle_list);
398 INIT_LIST_HEAD(&workers->order_list); 447 INIT_LIST_HEAD(&workers->order_list);
@@ -404,14 +453,15 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
404 workers->name = name; 453 workers->name = name;
405 workers->ordered = 0; 454 workers->ordered = 0;
406 workers->atomic_start_pending = 0; 455 workers->atomic_start_pending = 0;
407 workers->atomic_worker_start = 0; 456 workers->atomic_worker_start = async_helper;
408} 457}
409 458
410/* 459/*
411 * starts new worker threads. This does not enforce the max worker 460 * starts new worker threads. This does not enforce the max worker
412 * count in case you need to temporarily go past it. 461 * count in case you need to temporarily go past it.
413 */ 462 */
414int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) 463static int __btrfs_start_workers(struct btrfs_workers *workers,
464 int num_workers)
415{ 465{
416 struct btrfs_worker_thread *worker; 466 struct btrfs_worker_thread *worker;
417 int ret = 0; 467 int ret = 0;
@@ -444,6 +494,8 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
444 list_add_tail(&worker->worker_list, &workers->idle_list); 494 list_add_tail(&worker->worker_list, &workers->idle_list);
445 worker->idle = 1; 495 worker->idle = 1;
446 workers->num_workers++; 496 workers->num_workers++;
497 workers->num_workers_starting--;
498 WARN_ON(workers->num_workers_starting < 0);
447 spin_unlock_irq(&workers->lock); 499 spin_unlock_irq(&workers->lock);
448 } 500 }
449 return 0; 501 return 0;
@@ -452,6 +504,14 @@ fail:
452 return ret; 504 return ret;
453} 505}
454 506
507int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
508{
509 spin_lock_irq(&workers->lock);
510 workers->num_workers_starting += num_workers;
511 spin_unlock_irq(&workers->lock);
512 return __btrfs_start_workers(workers, num_workers);
513}
514
455/* 515/*
456 * run through the list and find a worker thread that doesn't have a lot 516 * run through the list and find a worker thread that doesn't have a lot
457 * to do right now. This can return null if we aren't yet at the thread 517 * to do right now. This can return null if we aren't yet at the thread
@@ -461,7 +521,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
461{ 521{
462 struct btrfs_worker_thread *worker; 522 struct btrfs_worker_thread *worker;
463 struct list_head *next; 523 struct list_head *next;
464 int enforce_min = workers->num_workers < workers->max_workers; 524 int enforce_min;
525
526 enforce_min = (workers->num_workers + workers->num_workers_starting) <
527 workers->max_workers;
465 528
466 /* 529 /*
467 * if we find an idle thread, don't move it to the end of the 530 * if we find an idle thread, don't move it to the end of the
@@ -509,15 +572,17 @@ again:
509 worker = next_worker(workers); 572 worker = next_worker(workers);
510 573
511 if (!worker) { 574 if (!worker) {
512 if (workers->num_workers >= workers->max_workers) { 575 if (workers->num_workers + workers->num_workers_starting >=
576 workers->max_workers) {
513 goto fallback; 577 goto fallback;
514 } else if (workers->atomic_worker_start) { 578 } else if (workers->atomic_worker_start) {
515 workers->atomic_start_pending = 1; 579 workers->atomic_start_pending = 1;
516 goto fallback; 580 goto fallback;
517 } else { 581 } else {
582 workers->num_workers_starting++;
518 spin_unlock_irqrestore(&workers->lock, flags); 583 spin_unlock_irqrestore(&workers->lock, flags);
519 /* we're below the limit, start another worker */ 584 /* we're below the limit, start another worker */
520 btrfs_start_workers(workers, 1); 585 __btrfs_start_workers(workers, 1);
521 goto again; 586 goto again;
522 } 587 }
523 } 588 }
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index fc089b95ec14..5077746cf85e 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -64,6 +64,8 @@ struct btrfs_workers {
64 /* current number of running workers */ 64 /* current number of running workers */
65 int num_workers; 65 int num_workers;
66 66
67 int num_workers_starting;
68
67 /* max number of workers allowed. changed by btrfs_start_workers */ 69 /* max number of workers allowed. changed by btrfs_start_workers */
68 int max_workers; 70 int max_workers;
69 71
@@ -78,9 +80,10 @@ struct btrfs_workers {
78 80
79 /* 81 /*
80 * are we allowed to sleep while starting workers or are we required 82 * are we allowed to sleep while starting workers or are we required
81 * to start them at a later time? 83 * to start them at a later time? If we can't sleep, this indicates
84 * which queue we need to use to schedule thread creation.
82 */ 85 */
83 int atomic_worker_start; 86 struct btrfs_workers *atomic_worker_start;
84 87
85 /* list with all the work threads. The workers on the idle thread 88 /* list with all the work threads. The workers on the idle thread
86 * may be actively servicing jobs, but they haven't yet hit the 89 * may be actively servicing jobs, but they haven't yet hit the
@@ -109,7 +112,8 @@ struct btrfs_workers {
109int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); 112int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
110int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); 113int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
111int btrfs_stop_workers(struct btrfs_workers *workers); 114int btrfs_stop_workers(struct btrfs_workers *workers);
112void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); 115void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
116 struct btrfs_workers *async_starter);
113int btrfs_requeue_work(struct btrfs_work *work); 117int btrfs_requeue_work(struct btrfs_work *work);
114void btrfs_set_work_high_prio(struct btrfs_work *work); 118void btrfs_set_work_high_prio(struct btrfs_work *work);
115#endif 119#endif
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8184f2feb2f3..1b920ffc6a59 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -907,6 +907,7 @@ struct btrfs_fs_info {
907 * A third pool does submit_bio to avoid deadlocking with the other 907 * A third pool does submit_bio to avoid deadlocking with the other
908 * two 908 * two
909 */ 909 */
910 struct btrfs_workers generic_worker;
910 struct btrfs_workers workers; 911 struct btrfs_workers workers;
911 struct btrfs_workers delalloc_workers; 912 struct btrfs_workers delalloc_workers;
912 struct btrfs_workers endio_workers; 913 struct btrfs_workers endio_workers;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 69dce50aabd2..9903f042765d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1746,21 +1746,22 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1746 err = -EINVAL; 1746 err = -EINVAL;
1747 goto fail_iput; 1747 goto fail_iput;
1748 } 1748 }
1749printk("thread pool is %d\n", fs_info->thread_pool_size); 1749
1750 /* 1750 btrfs_init_workers(&fs_info->generic_worker,
1751 * we need to start all the end_io workers up front because the 1751 "genwork", 1, NULL);
1752 * queue work function gets called at interrupt time, and so it 1752
1753 * cannot dynamically grow.
1754 */
1755 btrfs_init_workers(&fs_info->workers, "worker", 1753 btrfs_init_workers(&fs_info->workers, "worker",
1756 fs_info->thread_pool_size); 1754 fs_info->thread_pool_size,
1755 &fs_info->generic_worker);
1757 1756
1758 btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", 1757 btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
1759 fs_info->thread_pool_size); 1758 fs_info->thread_pool_size,
1759 &fs_info->generic_worker);
1760 1760
1761 btrfs_init_workers(&fs_info->submit_workers, "submit", 1761 btrfs_init_workers(&fs_info->submit_workers, "submit",
1762 min_t(u64, fs_devices->num_devices, 1762 min_t(u64, fs_devices->num_devices,
1763 fs_info->thread_pool_size)); 1763 fs_info->thread_pool_size),
1764 &fs_info->generic_worker);
1764 1765
1765 /* a higher idle thresh on the submit workers makes it much more 1766 /* a higher idle thresh on the submit workers makes it much more
1766 * likely that bios will be send down in a sane order to the 1767 * likely that bios will be send down in a sane order to the
@@ -1774,15 +1775,20 @@ printk("thread pool is %d\n", fs_info->thread_pool_size);
1774 fs_info->delalloc_workers.idle_thresh = 2; 1775 fs_info->delalloc_workers.idle_thresh = 2;
1775 fs_info->delalloc_workers.ordered = 1; 1776 fs_info->delalloc_workers.ordered = 1;
1776 1777
1777 btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1); 1778 btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
1779 &fs_info->generic_worker);
1778 btrfs_init_workers(&fs_info->endio_workers, "endio", 1780 btrfs_init_workers(&fs_info->endio_workers, "endio",
1779 fs_info->thread_pool_size); 1781 fs_info->thread_pool_size,
1782 &fs_info->generic_worker);
1780 btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", 1783 btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
1781 fs_info->thread_pool_size); 1784 fs_info->thread_pool_size,
1785 &fs_info->generic_worker);
1782 btrfs_init_workers(&fs_info->endio_meta_write_workers, 1786 btrfs_init_workers(&fs_info->endio_meta_write_workers,
1783 "endio-meta-write", fs_info->thread_pool_size); 1787 "endio-meta-write", fs_info->thread_pool_size,
1788 &fs_info->generic_worker);
1784 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", 1789 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
1785 fs_info->thread_pool_size); 1790 fs_info->thread_pool_size,
1791 &fs_info->generic_worker);
1786 1792
1787 /* 1793 /*
1788 * endios are largely parallel and should have a very 1794 * endios are largely parallel and should have a very
@@ -1794,12 +1800,8 @@ printk("thread pool is %d\n", fs_info->thread_pool_size);
1794 fs_info->endio_write_workers.idle_thresh = 2; 1800 fs_info->endio_write_workers.idle_thresh = 2;
1795 fs_info->endio_meta_write_workers.idle_thresh = 2; 1801 fs_info->endio_meta_write_workers.idle_thresh = 2;
1796 1802
1797 fs_info->endio_workers.atomic_worker_start = 1;
1798 fs_info->endio_meta_workers.atomic_worker_start = 1;
1799 fs_info->endio_write_workers.atomic_worker_start = 1;
1800 fs_info->endio_meta_write_workers.atomic_worker_start = 1;
1801
1802 btrfs_start_workers(&fs_info->workers, 1); 1803 btrfs_start_workers(&fs_info->workers, 1);
1804 btrfs_start_workers(&fs_info->generic_worker, 1);
1803 btrfs_start_workers(&fs_info->submit_workers, 1); 1805 btrfs_start_workers(&fs_info->submit_workers, 1);
1804 btrfs_start_workers(&fs_info->delalloc_workers, 1); 1806 btrfs_start_workers(&fs_info->delalloc_workers, 1);
1805 btrfs_start_workers(&fs_info->fixup_workers, 1); 1807 btrfs_start_workers(&fs_info->fixup_workers, 1);
@@ -2012,6 +2014,7 @@ fail_chunk_root:
2012 free_extent_buffer(chunk_root->node); 2014 free_extent_buffer(chunk_root->node);
2013 free_extent_buffer(chunk_root->commit_root); 2015 free_extent_buffer(chunk_root->commit_root);
2014fail_sb_buffer: 2016fail_sb_buffer:
2017 btrfs_stop_workers(&fs_info->generic_worker);
2015 btrfs_stop_workers(&fs_info->fixup_workers); 2018 btrfs_stop_workers(&fs_info->fixup_workers);
2016 btrfs_stop_workers(&fs_info->delalloc_workers); 2019 btrfs_stop_workers(&fs_info->delalloc_workers);
2017 btrfs_stop_workers(&fs_info->workers); 2020 btrfs_stop_workers(&fs_info->workers);
@@ -2437,6 +2440,7 @@ int close_ctree(struct btrfs_root *root)
2437 2440
2438 iput(fs_info->btree_inode); 2441 iput(fs_info->btree_inode);
2439 2442
2443 btrfs_stop_workers(&fs_info->generic_worker);
2440 btrfs_stop_workers(&fs_info->fixup_workers); 2444 btrfs_stop_workers(&fs_info->fixup_workers);
2441 btrfs_stop_workers(&fs_info->delalloc_workers); 2445 btrfs_stop_workers(&fs_info->delalloc_workers);
2442 btrfs_stop_workers(&fs_info->workers); 2446 btrfs_stop_workers(&fs_info->workers);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 361ad323faac..cfcc93c93a7b 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3518,7 +3518,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3518 BUG_ON(!rc->block_group); 3518 BUG_ON(!rc->block_group);
3519 3519
3520 btrfs_init_workers(&rc->workers, "relocate", 3520 btrfs_init_workers(&rc->workers, "relocate",
3521 fs_info->thread_pool_size); 3521 fs_info->thread_pool_size, NULL);
3522 3522
3523 rc->extent_root = extent_root; 3523 rc->extent_root = extent_root;
3524 btrfs_prepare_block_group_relocation(extent_root, rc->block_group); 3524 btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
@@ -3701,7 +3701,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3701 mapping_tree_init(&rc->reloc_root_tree); 3701 mapping_tree_init(&rc->reloc_root_tree);
3702 INIT_LIST_HEAD(&rc->reloc_roots); 3702 INIT_LIST_HEAD(&rc->reloc_roots);
3703 btrfs_init_workers(&rc->workers, "relocate", 3703 btrfs_init_workers(&rc->workers, "relocate",
3704 root->fs_info->thread_pool_size); 3704 root->fs_info->thread_pool_size, NULL);
3705 rc->extent_root = root->fs_info->extent_root; 3705 rc->extent_root = root->fs_info->extent_root;
3706 3706
3707 set_reloc_control(rc); 3707 set_reloc_control(rc);