Btrfs: Add async worker threads for pre and post IO checksumming

Btrfs has been using workqueues to spread the checksumming load across other CPUs in the system. But, workqueues only schedule work on the same CPU that queued the work, giving them a limited benefit for systems with higher CPU counts. This code adds a generic facility to schedule work with pools of kthreads, and changes the bio submission code to queue bios up. The queueing is important to make sure large numbers of procs on the system don't turn streaming workloads into random workloads by sending IO down concurrently. The end result of all of this is much higher performance (and CPU usage) when doing checksumming on large machines. Two worker pools are created, one for writes and one for endio processing. The two could deadlock if we tried to service both from a single pool. Signed-off-by: Chris Mason <chris.mason@oracle.com>
author: Chris Mason <chris.mason@oracle.com> 2008-06-11 16:50:36 -0400
committer: Chris Mason <chris.mason@oracle.com> 2008-09-25 11:04:03 -0400
commit: 8b7128429235d9bd72cfd5ed20c77c4f3118f744 (patch)
tree: 982eda13094af1ccd46e8c3853559a0eb6e298f6 /fs/btrfs/async-thread.c
parent: 43e570b08a6c6b1d75f218566a6240542a386fd9 (diff)
1 files changed, 288 insertions, 0 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
new file mode 100644
index 000000000000..2911b67bd6f7
--- /dev/null
+++ b/fs/btrfs/async-thread.c
@@ -0,0 +1,288 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/freezer.h>
+#include "async-thread.h"
+/*
+ * container for the kthread task pointer and the list of pending work
+ * One of these is allocated per thread, and each is linked into the
+ * worker_list of the struct btrfs_workers pool that started it.
+ */
+struct btrfs_worker_thread {
+        /* list of struct btrfs_work that are waiting for service */
+        struct list_head pending;
+        /* list of worker threads from struct btrfs_workers */
+        struct list_head worker_list;
+        /* kthread running worker_loop() on this structure */
+        struct task_struct *task;
+        /*
+         * number of work items handed to this thread that have not finished
+         * yet; it is only decremented after the work function has returned
+         */
+        atomic_t num_pending;
+        /* protects the pending list and the working flag. */
+        spinlock_t lock;
+        /*
+         * set to non-zero when this thread is already awake and kicking;
+         * set by btrfs_queue_worker() and cleared by the thread itself
+         * once its pending list is empty
+         */
+        int working;
+};
+/*
+ * main loop for servicing work items
+ *
+ * Each pending item is unlinked under worker->lock, has bit 0 of
+ * work->flags (the "queued" bit) cleared and work->worker pointed at this
+ * thread, and is then run with the lock dropped so the work function may
+ * requeue itself.  When the list is empty the thread marks itself idle
+ * (working = 0) and sleeps until it is woken again.
+ *
+ * arg is the struct btrfs_worker_thread this kthread services.
+ * Returns 0 once kthread_should_stop() reports true.
+ */
+static int worker_loop(void *arg)
+{
+        struct btrfs_worker_thread *worker = arg;
+        struct list_head *cur;
+        struct btrfs_work *work;
+        do {
+                spin_lock_irq(&worker->lock);
+                while (!list_empty(&worker->pending)) {
+                        cur = worker->pending.next;
+                        work = list_entry(cur, struct btrfs_work, list);
+                        list_del(&work->list);
+                        clear_bit(0, &work->flags);
+                        work->worker = worker;
+                        spin_unlock_irq(&worker->lock);
+                        work->func(work);
+                        /* the item only stops counting once it has run */
+                        atomic_dec(&worker->num_pending);
+                        spin_lock_irq(&worker->lock);
+                }
+                worker->working = 0;
+                if (freezing(current)) {
+                        /*
+                         * drop the lock before freezing: the refrigerator
+                         * sleeps, and the next loop iteration takes
+                         * worker->lock again
+                         */
+                        spin_unlock_irq(&worker->lock);
+                        refrigerator();
+                } else {
+                        set_current_state(TASK_INTERRUPTIBLE);
+                        spin_unlock_irq(&worker->lock);
+                        /*
+                         * the state was changed before this check, so a
+                         * kthread_stop() that raced with us is either seen
+                         * here or its wakeup makes schedule() return
+                         */
+                        if (!kthread_should_stop())
+                                schedule();
+                        __set_current_state(TASK_RUNNING);
+                }
+        } while (!kthread_should_stop());
+        return 0;
+}
+/*
+ * this will wait for all the worker threads to shutdown
+ *
+ * Every thread on workers->worker_list is stopped with kthread_stop(),
+ * unlinked and freed.  Afterwards the pool is back in the state
+ * btrfs_init_workers() leaves it in (no threads, no "last" thread), apart
+ * from max_workers which is kept.
+ *
+ * No pool lock is taken here.
+ * NOTE(review): presumably callers guarantee nothing is queueing work
+ * concurrently -- confirm against the callers.
+ *
+ * Always returns 0.
+ */
+int btrfs_stop_workers(struct btrfs_workers *workers)
+{
+        struct list_head *cur;
+        struct btrfs_worker_thread *worker;
+        while (!list_empty(&workers->worker_list)) {
+                cur = workers->worker_list.next;
+                worker = list_entry(cur, struct btrfs_worker_thread,
+                                    worker_list);
+                kthread_stop(worker->task);
+                list_del(&worker->worker_list);
+                kfree(worker);
+        }
+        /*
+         * every worker has been freed: do not leave workers->last pointing
+         * at freed memory or num_workers claiming threads that are gone
+         */
+        workers->last = NULL;
+        workers->num_workers = 0;
+        return 0;
+}
+/*
+ * Put a struct btrfs_workers into its empty starting state: lock
+ * initialised, no threads on the list, no "last used" thread, and the
+ * thread-count limit set to max.  No threads are started here.
+ */
+void btrfs_init_workers(struct btrfs_workers *workers, int max)
+{
+        spin_lock_init(&workers->lock);
+        INIT_LIST_HEAD(&workers->worker_list);
+        workers->max_workers = max;
+        workers->num_workers = 0;
+        workers->last = NULL;
+}
+/*
+ * starts new worker threads.  This does not enforce the max worker
+ * count in case you need to temporarily go past it.
+ *
+ * Each new thread gets its own struct btrfs_worker_thread, is appended to
+ * workers->worker_list and becomes workers->last.
+ *
+ * Returns 0 when all num_workers threads were started.  On failure returns
+ * -ENOMEM or the error from kthread_run(); the failure path calls
+ * btrfs_stop_workers(), which stops every thread in the pool, including
+ * ones that were already running before this call.
+ */
+int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+{
+        struct btrfs_worker_thread *worker;
+        int ret = 0;
+        int i;
+        for (i = 0; i < num_workers; i++) {
+                worker = kzalloc(sizeof(*worker), GFP_NOFS);
+                if (!worker) {
+                        ret = -ENOMEM;
+                        goto fail;
+                }
+                INIT_LIST_HEAD(&worker->pending);
+                INIT_LIST_HEAD(&worker->worker_list);
+                spin_lock_init(&worker->lock);
+                atomic_set(&worker->num_pending, 0);
+                worker->task = kthread_run(worker_loop, worker, "btrfs");
+                if (IS_ERR(worker->task)) {
+                        ret = PTR_ERR(worker->task);
+                        /*
+                         * this worker is not on the pool list yet, so
+                         * btrfs_stop_workers() below cannot free it
+                         */
+                        kfree(worker);
+                        goto fail;
+                }
+                spin_lock_irq(&workers->lock);
+                list_add_tail(&worker->worker_list, &workers->worker_list);
+                workers->last = worker;
+                workers->num_workers++;
+                spin_unlock_irq(&workers->lock);
+        }
+        return 0;
+fail:
+        btrfs_stop_workers(workers);
+        return ret;
+}
+/*
+ * Pick the thread that should receive the next work item.  A thread
+ * counts as busy once it has 64 or more items pending.
+ *
+ * The most recently used thread is kept while it is not busy.  Otherwise
+ * the other threads are visited in list order, starting after it and
+ * wrapping around the list head:
+ *  - below the thread limit, the first thread that is not busy is chosen
+ *  - at or above the limit, the very next thread is chosen regardless of
+ *    its load
+ * The chosen thread becomes workers->last.
+ *
+ * Returns NULL when the pool is below its thread limit and every thread
+ * is busy, which tells the caller it may start another thread.
+ *
+ * Called with workers->lock held by find_worker().
+ * NOTE(review): workers->last is dereferenced unconditionally although
+ * btrfs_init_workers() sets it to NULL -- this presumably relies on at
+ * least one worker having been started first; confirm.
+ */
+static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
+{
+        struct list_head *head = &workers->worker_list;
+        struct btrfs_worker_thread *candidate = workers->last;
+        struct list_head *origin;
+        struct list_head *pos;
+        /* non-zero while more threads may still be started */
+        int below_limit = workers->num_workers < workers->max_workers;
+        /* stick with the last thread if it isn't busy */
+        if (atomic_read(&candidate->num_pending) < 64)
+                return candidate;
+        origin = &candidate->worker_list;
+        /*
+         * walk every other thread looking for someone that is bored.
+         * FIXME, do something smart here
+         */
+        for (pos = origin->next; pos != origin; ) {
+                if (pos == head) {
+                        /* the list head is not a thread, step over it */
+                        pos = head->next;
+                        continue;
+                }
+                candidate = list_entry(pos, struct btrfs_worker_thread,
+                                       worker_list);
+                if (atomic_read(&candidate->num_pending) < 64 ||
+                    !below_limit) {
+                        workers->last = candidate;
+                        return candidate;
+                }
+                pos = pos->next;
+        }
+        /*
+         * nobody was bored.  Below the limit with the last thread still
+         * busy, report that so a new thread can be started; otherwise
+         * fall back to the last thread
+         */
+        if (below_limit && atomic_read(&workers->last->num_pending) >= 64)
+                return NULL;
+        return workers->last;
+}
+/*
+ * Return the thread a new work item should be queued on.
+ *
+ * next_worker() is asked first.  If it has no candidate and the pool is
+ * already at its thread limit, the first thread on the list is used; if
+ * the pool is below the limit, one more thread is started and the search
+ * is repeated.
+ *
+ * NOTE(review): the return value of btrfs_start_workers() is ignored, so
+ * a failed start is simply retried -- confirm this is intended.
+ */
+static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
+{
+        struct btrfs_worker_thread *picked;
+        unsigned long irq_flags;
+        for (;;) {
+                spin_lock_irqsave(&workers->lock, irq_flags);
+                picked = next_worker(workers);
+                spin_unlock_irqrestore(&workers->lock, irq_flags);
+                if (picked)
+                        return picked;
+                spin_lock_irqsave(&workers->lock, irq_flags);
+                if (workers->num_workers >= workers->max_workers) {
+                        /*
+                         * no thread could be found and no more may be
+                         * started, so force the first one on the list
+                         */
+                        picked = list_entry(workers->worker_list.next,
+                                  struct btrfs_worker_thread, worker_list);
+                        spin_unlock_irqrestore(&workers->lock, irq_flags);
+                        return picked;
+                }
+                spin_unlock_irqrestore(&workers->lock, irq_flags);
+                /* still below the limit: start another worker and retry */
+                btrfs_start_workers(workers, 1);
+        }
+}
+/*
+ * Put a work item back on the tail of the pending list of the thread
+ * recorded in work->worker.  Meant for long running work functions that
+ * have made some progress and want to let other items run before they
+ * continue.
+ *
+ * Nothing happens if the item is already marked as queued (bit 0 of
+ * work->flags).  The thread is not woken up here.
+ *
+ * Always returns 0.
+ */
+int btrfs_requeue_work(struct btrfs_work *work)
+{
+        struct btrfs_worker_thread *owner = work->worker;
+        unsigned long irq_flags;
+        /* only queue the item if it was not already marked as queued */
+        if (!test_and_set_bit(0, &work->flags)) {
+                spin_lock_irqsave(&owner->lock, irq_flags);
+                atomic_inc(&owner->num_pending);
+                list_add_tail(&work->list, &owner->pending);
+                spin_unlock_irqrestore(&owner->lock, irq_flags);
+        }
+        return 0;
+}
+/*
+ * Queue a struct btrfs_work on one of the pool's kthreads, chosen by
+ * find_worker(), and wake that thread if it has not been kicked already.
+ *
+ * An item that is already marked as queued (bit 0 of work->flags) is
+ * left where it is.
+ *
+ * Always returns 0.
+ */
+int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
+{
+        struct btrfs_worker_thread *target;
+        unsigned long irq_flags;
+        int need_wake;
+        /* don't requeue something already on a list */
+        if (test_and_set_bit(0, &work->flags))
+                return 0;
+        target = find_worker(workers);
+        spin_lock_irqsave(&target->lock, irq_flags);
+        atomic_inc(&target->num_pending);
+        list_add_tail(&work->list, &target->pending);
+        /*
+         * a thread whose working flag is already set has been kicked
+         * before, so wake_up_process() can be skipped for it
+         */
+        need_wake = !target->working;
+        target->working = 1;
+        spin_unlock_irqrestore(&target->lock, irq_flags);
+        if (need_wake)
+                wake_up_process(target->task);
+        return 0;
+}
author	Chris Mason <chris.mason@oracle.com>	2008-06-11 16:50:36 -0400
committer	Chris Mason <chris.mason@oracle.com>	2008-09-25 11:04:03 -0400
commit	8b7128429235d9bd72cfd5ed20c77c4f3118f744 (patch)
tree	982eda13094af1ccd46e8c3853559a0eb6e298f6 /fs/btrfs/async-thread.c
parent	43e570b08a6c6b1d75f218566a6240542a386fd9 (diff)

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c new file mode 100644 index 000000000000..2911b67bd6f7 --- /dev/null +++ b/fs/btrfs/async-thread.c
@@ -0,0 +1,288 @@
	1	/*
	2	* Copyright (C) 2007 Oracle. All rights reserved.
	3	*
	4	* This program is free software; you can redistribute it and/or
	5	* modify it under the terms of the GNU General Public
	6	* License v2 as published by the Free Software Foundation.
	7	*
	8	* This program is distributed in the hope that it will be useful,
	9	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	11	* General Public License for more details.
	12	*
	13	* You should have received a copy of the GNU General Public
	14	* License along with this program; if not, write to the
	15	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
	16	* Boston, MA 02111-1307, USA.
	17	*/
	18
	19	#include <linux/kthread.h>
	20	#include <linux/list.h>
	21	#include <linux/spinlock.h>
	22	#include <linux/freezer.h>
	23	#include "async-thread.h"
	24
	25	/*
	26	* container for the kthread task pointer and the list of pending work
	27	* One of these is allocated per thread.
	28	*/
	29	struct btrfs_worker_thread {
	30	/* list of struct btrfs_work that are waiting for service */
	31	struct list_head pending;
	32
	33	/* list of worker threads from struct btrfs_workers */
	34	struct list_head worker_list;
	35
	36	/* kthread */
	37	struct task_struct *task;
	38
	39	/* number of things on the pending list */
	40	atomic_t num_pending;
	41
	42	/* protects the pending list. */
	43	spinlock_t lock;
	44
	45	/* set to non-zero when this thread is already awake and kicking */
	46	int working;
	47	};
	48
	49	/*
	50	* main loop for servicing work items
	51	*/
	52	static int worker_loop(void *arg)
	53	{
	54	struct btrfs_worker_thread *worker = arg;
	55	struct list_head *cur;
	56	struct btrfs_work *work;
	57	do {
	58	spin_lock_irq(&worker->lock);
	59	while(!list_empty(&worker->pending)) {
	60	cur = worker->pending.next;
	61	work = list_entry(cur, struct btrfs_work, list);
	62	list_del(&work->list);
	63	clear_bit(0, &work->flags);
	64
	65	work->worker = worker;
	66	spin_unlock_irq(&worker->lock);
	67
	68	work->func(work);
	69
	70	atomic_dec(&worker->num_pending);
	71	spin_lock_irq(&worker->lock);
	72	}
	73	worker->working = 0;
	74	if (freezing(current)) {
	75	refrigerator();
	76	} else {
	77	set_current_state(TASK_INTERRUPTIBLE);
	78	spin_unlock_irq(&worker->lock);
	79	schedule();
	80	__set_current_state(TASK_RUNNING);
	81	}
	82	} while (!kthread_should_stop());
	83	return 0;
	84	}
	85
	86	/*
	87	* this will wait for all the worker threads to shutdown
	88	*/
	89	int btrfs_stop_workers(struct btrfs_workers *workers)
	90	{
	91	struct list_head *cur;
	92	struct btrfs_worker_thread *worker;
	93
	94	while(!list_empty(&workers->worker_list)) {
	95	cur = workers->worker_list.next;
	96	worker = list_entry(cur, struct btrfs_worker_thread,
	97	worker_list);
	98	kthread_stop(worker->task);
	99	list_del(&worker->worker_list);
	100	kfree(worker);
	101	}
	102	return 0;
	103	}
	104
	105	/*
	106	* simple init on struct btrfs_workers
	107	*/
	108	void btrfs_init_workers(struct btrfs_workers *workers, int max)
	109	{
	110	workers->num_workers = 0;
	111	INIT_LIST_HEAD(&workers->worker_list);
	112	workers->last = NULL;
	113	spin_lock_init(&workers->lock);
	114	workers->max_workers = max;
	115	}
	116
	117	/*
	118	* starts new worker threads. This does not enforce the max worker
	119	* count in case you need to temporarily go past it.
	120	*/
	121	int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
	122	{
	123	struct btrfs_worker_thread *worker;
	124	int ret = 0;
	125	int i;
	126
	127	for (i = 0; i < num_workers; i++) {
	128	worker = kzalloc(sizeof(*worker), GFP_NOFS);
	129	if (!worker) {
	130	ret = -ENOMEM;
	131	goto fail;
	132	}
	133
	134	INIT_LIST_HEAD(&worker->pending);
	135	INIT_LIST_HEAD(&worker->worker_list);
	136	spin_lock_init(&worker->lock);
	137	atomic_set(&worker->num_pending, 0);
	138	worker->task = kthread_run(worker_loop, worker, "btrfs");
	139	if (IS_ERR(worker->task)) {
	140	ret = PTR_ERR(worker->task);
	141	goto fail;
	142	}
	143
	144	spin_lock_irq(&workers->lock);
	145	list_add_tail(&worker->worker_list, &workers->worker_list);
	146	workers->last = worker;
	147	workers->num_workers++;
	148	spin_unlock_irq(&workers->lock);
	149	}
	150	return 0;
	151	fail:
	152	btrfs_stop_workers(workers);
	153	return ret;
	154	}
	155
	156	/*
	157	* run through the list and find a worker thread that doesn't have a lot
	158	* to do right now. This can return null if we aren't yet at the thread
	159	* count limit and all of the threads are busy.
	160	*/
	161	static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
	162	{
	163	struct btrfs_worker_thread *worker;
	164	struct list_head *next;
	165	struct list_head *start;
	166	int enforce_min = workers->num_workers < workers->max_workers;
	167
	168	/* start with the last thread if it isn't busy */
	169	worker = workers->last;
	170	if (atomic_read(&worker->num_pending) < 64)
	171	goto done;
	172
	173	next = worker->worker_list.next;
	174	start = &worker->worker_list;
	175
	176	/*
	177	* check all the workers for someone that is bored. FIXME, do
	178	* something smart here
	179	*/
	180	while(next != start) {
	181	if (next == &workers->worker_list) {
	182	next = workers->worker_list.next;
	183	continue;
	184	}
	185	worker = list_entry(next, struct btrfs_worker_thread,
	186	worker_list);
	187	if (atomic_read(&worker->num_pending) < 64 || !enforce_min)
	188	goto done;
	189	next = next->next;
	190	}
	191	/*
	192	* nobody was bored, if we're already at the max thread count,
	193	* use the last thread
	194	*/
	195	if (!enforce_min || atomic_read(&workers->last->num_pending) < 64) {
	196	return workers->last;
	197	}
	198	return NULL;
	199	done:
	200	workers->last = worker;
	201	return worker;
	202	}
	203
	204	static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
	205	{
	206	struct btrfs_worker_thread *worker;
	207	unsigned long flags;
	208
	209	again:
	210	spin_lock_irqsave(&workers->lock, flags);
	211	worker = next_worker(workers);
	212	spin_unlock_irqrestore(&workers->lock, flags);
	213
	214	if (!worker) {
	215	spin_lock_irqsave(&workers->lock, flags);
	216	if (workers->num_workers >= workers->max_workers) {
	217	/*
	218	* we have failed to find any workers, just
	219	* return the force one
	220	*/
	221	worker = list_entry(workers->worker_list.next,
	222	struct btrfs_worker_thread, worker_list);
	223	spin_unlock_irqrestore(&workers->lock, flags);
	224	} else {
	225	spin_unlock_irqrestore(&workers->lock, flags);
	226	/* we're below the limit, start another worker */
	227	btrfs_start_workers(workers, 1);
	228	goto again;
	229	}
	230	}
	231	return worker;
	232	}
	233
	234	/*
	235	* btrfs_requeue_work just puts the work item back on the tail of the list
	236	* it was taken from. It is intended for use with long running work functions
	237	* that make some progress and want to give the cpu up for others.
	238	*/
	239	int btrfs_requeue_work(struct btrfs_work *work)
	240	{
	241	struct btrfs_worker_thread *worker = work->worker;
	242	unsigned long flags;
	243
	244	if (test_and_set_bit(0, &work->flags))
	245	goto out;
	246
	247	spin_lock_irqsave(&worker->lock, flags);
	248	atomic_inc(&worker->num_pending);
	249	list_add_tail(&work->list, &worker->pending);
	250	spin_unlock_irqrestore(&worker->lock, flags);
	251	out:
	252	return 0;
	253	}
	254
	255	/*
	256	* places a struct btrfs_work into the pending queue of one of the kthreads
	257	*/
	258	int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
	259	{
	260	struct btrfs_worker_thread *worker;
	261	unsigned long flags;
	262	int wake = 0;
	263
	264	/* don't requeue something already on a list */
	265	if (test_and_set_bit(0, &work->flags))
	266	goto out;
	267
	268	worker = find_worker(workers);
	269
	270	spin_lock_irqsave(&worker->lock, flags);
	271	atomic_inc(&worker->num_pending);
	272	list_add_tail(&work->list, &worker->pending);
	273
	274	/*
	275	* avoid calling into wake_up_process if this thread has already
	276	* been kicked
	277	*/
	278	if (!worker->working)
	279	wake = 1;
	280	worker->working = 1;
	281
	282	spin_unlock_irqrestore(&worker->lock, flags);
	283
	284	if (wake)
	285	wake_up_process(worker->task);
	286	out:
	287	return 0;
	288	}