aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorJens Axboe <axboe@suse.de>2005-06-27 04:55:12 -0400
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-06-27 17:33:29 -0400
commit22e2c507c301c3dbbcf91b4948b88f78842ee6c9 (patch)
tree9a97c91d1362e69703aa286021daffb8a5456f4c /fs
parent020f46a39eb7b99a575b9f4d105fce2b142acdf1 (diff)
[PATCH] Update cfq io scheduler to time sliced design
This updates the CFQ io scheduler to the new time sliced design (cfq v3). It provides full process fairness, while giving excellent aggregate system throughput even for many competing processes. It supports io priorities, either inherited from the cpu nice value or set directly with the ioprio_get/set syscalls. The latter closely mimic set/getpriority. This import is based on my latest from -mm. Signed-off-by: Jens Axboe <axboe@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'fs')
-rw-r--r--fs/Makefile1
-rw-r--r--fs/ioprio.c172
-rw-r--r--fs/reiserfs/journal.c12
3 files changed, 185 insertions, 0 deletions
diff --git a/fs/Makefile b/fs/Makefile
index fc92e59e9faf..20edcf28bfd2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -10,6 +10,7 @@ obj-y := open.o read_write.o file_table.o buffer.o bio.o super.o \
10 ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \ 10 ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
11 attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ 11 attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
12 seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \ 12 seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
13 ioprio.o
13 14
14obj-$(CONFIG_EPOLL) += eventpoll.o 15obj-$(CONFIG_EPOLL) += eventpoll.o
15obj-$(CONFIG_COMPAT) += compat.o 16obj-$(CONFIG_COMPAT) += compat.o
diff --git a/fs/ioprio.c b/fs/ioprio.c
new file mode 100644
index 000000000000..663e420636d6
--- /dev/null
+++ b/fs/ioprio.c
@@ -0,0 +1,172 @@
1/*
2 * fs/ioprio.c
3 *
4 * Copyright (C) 2004 Jens Axboe <axboe@suse.de>
5 *
6 * Helper functions for setting/querying io priorities of processes. The
 7 * system calls closely mimic getpriority/setpriority, see the man page for
8 * those. The prio argument is a composite of prio class and prio data, where
9 * the data argument has meaning within that class. The standard scheduling
10 * classes have 8 distinct prio levels, with 0 being the highest prio and 7
11 * being the lowest.
12 *
13 * IOW, setting BE scheduling class with prio 2 is done ala:
14 *
15 * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2;
16 *
17 * ioprio_set(PRIO_PROCESS, pid, prio);
18 *
19 * See also Documentation/block/ioprio.txt
20 *
21 */
22#include <linux/kernel.h>
23#include <linux/ioprio.h>
24#include <linux/blkdev.h>
25
/*
 * Set the io priority of a single task.
 *
 * Permission model mirrors setpriority(2): the caller may change a task
 * whose uid matches its own real or effective uid, or any task if it
 * holds CAP_SYS_NICE.
 *
 * Returns 0 on success, -EPERM if the caller may not modify @task.
 *
 * NOTE(review): callers in this file invoke this under read_lock_irq
 * (&tasklist_lock); task_lock() presumably serializes against concurrent
 * io_context changes — confirm against the io_context lifetime rules.
 */
static int set_task_ioprio(struct task_struct *task, int ioprio)
{
	struct io_context *ioc;

	if (task->uid != current->euid &&
	    task->uid != current->uid && !capable(CAP_SYS_NICE))
		return -EPERM;

	task_lock(task);

	task->ioprio = ioprio;

	/*
	 * If the task already has an io context and the io scheduler
	 * registered a hook, push the new priority down to it.
	 */
	ioc = task->io_context;
	if (ioc && ioc->set_ioprio)
		ioc->set_ioprio(ioc, ioprio);

	task_unlock(task);
	return 0;
}
45
46asmlinkage int sys_ioprio_set(int which, int who, int ioprio)
47{
48 int class = IOPRIO_PRIO_CLASS(ioprio);
49 int data = IOPRIO_PRIO_DATA(ioprio);
50 struct task_struct *p, *g;
51 struct user_struct *user;
52 int ret;
53
54 switch (class) {
55 case IOPRIO_CLASS_RT:
56 if (!capable(CAP_SYS_ADMIN))
57 return -EPERM;
58 /* fall through, rt has prio field too */
59 case IOPRIO_CLASS_BE:
60 if (data >= IOPRIO_BE_NR || data < 0)
61 return -EINVAL;
62
63 break;
64 case IOPRIO_CLASS_IDLE:
65 break;
66 default:
67 return -EINVAL;
68 }
69
70 ret = -ESRCH;
71 read_lock_irq(&tasklist_lock);
72 switch (which) {
73 case IOPRIO_WHO_PROCESS:
74 if (!who)
75 p = current;
76 else
77 p = find_task_by_pid(who);
78 if (p)
79 ret = set_task_ioprio(p, ioprio);
80 break;
81 case IOPRIO_WHO_PGRP:
82 if (!who)
83 who = process_group(current);
84 do_each_task_pid(who, PIDTYPE_PGID, p) {
85 ret = set_task_ioprio(p, ioprio);
86 if (ret)
87 break;
88 } while_each_task_pid(who, PIDTYPE_PGID, p);
89 break;
90 case IOPRIO_WHO_USER:
91 if (!who)
92 user = current->user;
93 else
94 user = find_user(who);
95
96 if (!user)
97 break;
98
99 do_each_thread(g, p) {
100 if (p->uid != who)
101 continue;
102 ret = set_task_ioprio(p, ioprio);
103 if (ret)
104 break;
105 } while_each_thread(g, p);
106
107 if (who)
108 free_uid(user);
109 break;
110 default:
111 ret = -EINVAL;
112 }
113
114 read_unlock_irq(&tasklist_lock);
115 return ret;
116}
117
/*
 * sys_ioprio_get - query io priority of one or more tasks
 * @which: IOPRIO_WHO_PROCESS, IOPRIO_WHO_PGRP or IOPRIO_WHO_USER
 * @who:   pid, pgrp or uid to match (0 means "current" in each case)
 *
 * For a single process the task's ioprio is returned directly.  For a
 * process group or user, the priorities of all matching tasks are
 * aggregated with ioprio_best(); the first match replaces the -ESRCH
 * sentinel.  Returns -ESRCH if nothing matched, -EINVAL for a bad
 * @which.
 */
asmlinkage int sys_ioprio_get(int which, int who)
{
	struct task_struct *g, *p;
	struct user_struct *user;
	int ret = -ESRCH;

	read_lock_irq(&tasklist_lock);
	switch (which) {
		case IOPRIO_WHO_PROCESS:
			if (!who)
				p = current;
			else
				p = find_task_by_pid(who);
			if (p)
				ret = p->ioprio;
			break;
		case IOPRIO_WHO_PGRP:
			if (!who)
				who = process_group(current);
			do_each_task_pid(who, PIDTYPE_PGID, p) {
				if (ret == -ESRCH)
					ret = p->ioprio;
				else
					ret = ioprio_best(ret, p->ioprio);
			} while_each_task_pid(who, PIDTYPE_PGID, p);
			break;
		case IOPRIO_WHO_USER:
			if (!who)
				user = current->user;
			else
				user = find_user(who);

			if (!user)
				break;

			do_each_thread(g, p) {
				if (p->uid != user->uid)
					continue;
				if (ret == -ESRCH)
					ret = p->ioprio;
				else
					ret = ioprio_best(ret, p->ioprio);
			} while_each_thread(g, p);

			/* find_user() took a reference; current->user did not */
			if (who)
				free_uid(user);
			break;
		default:
			ret = -EINVAL;
	}

	read_unlock_irq(&tasklist_lock);
	return ret;
}
172
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 7b87707acc36..d1bcf0da6728 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -645,18 +645,22 @@ struct buffer_chunk {
645 645
646static void write_chunk(struct buffer_chunk *chunk) { 646static void write_chunk(struct buffer_chunk *chunk) {
647 int i; 647 int i;
648 get_fs_excl();
648 for (i = 0; i < chunk->nr ; i++) { 649 for (i = 0; i < chunk->nr ; i++) {
649 submit_logged_buffer(chunk->bh[i]) ; 650 submit_logged_buffer(chunk->bh[i]) ;
650 } 651 }
651 chunk->nr = 0; 652 chunk->nr = 0;
653 put_fs_excl();
652} 654}
653 655
654static void write_ordered_chunk(struct buffer_chunk *chunk) { 656static void write_ordered_chunk(struct buffer_chunk *chunk) {
655 int i; 657 int i;
658 get_fs_excl();
656 for (i = 0; i < chunk->nr ; i++) { 659 for (i = 0; i < chunk->nr ; i++) {
657 submit_ordered_buffer(chunk->bh[i]) ; 660 submit_ordered_buffer(chunk->bh[i]) ;
658 } 661 }
659 chunk->nr = 0; 662 chunk->nr = 0;
663 put_fs_excl();
660} 664}
661 665
662static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh, 666static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
@@ -918,6 +922,8 @@ static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list
918 return 0 ; 922 return 0 ;
919 } 923 }
920 924
925 get_fs_excl();
926
921 /* before we can put our commit blocks on disk, we have to make sure everyone older than 927 /* before we can put our commit blocks on disk, we have to make sure everyone older than
922 ** us is on disk too 928 ** us is on disk too
923 */ 929 */
@@ -1055,6 +1061,7 @@ put_jl:
1055 1061
1056 if (retval) 1062 if (retval)
1057 reiserfs_abort (s, retval, "Journal write error in %s", __FUNCTION__); 1063 reiserfs_abort (s, retval, "Journal write error in %s", __FUNCTION__);
1064 put_fs_excl();
1058 return retval; 1065 return retval;
1059} 1066}
1060 1067
@@ -1251,6 +1258,8 @@ static int flush_journal_list(struct super_block *s,
1251 return 0 ; 1258 return 0 ;
1252 } 1259 }
1253 1260
1261 get_fs_excl();
1262
1254 /* if all the work is already done, get out of here */ 1263 /* if all the work is already done, get out of here */
1255 if (atomic_read(&(jl->j_nonzerolen)) <= 0 && 1264 if (atomic_read(&(jl->j_nonzerolen)) <= 0 &&
1256 atomic_read(&(jl->j_commit_left)) <= 0) { 1265 atomic_read(&(jl->j_commit_left)) <= 0) {
@@ -1450,6 +1459,7 @@ flush_older_and_return:
1450 put_journal_list(s, jl); 1459 put_journal_list(s, jl);
1451 if (flushall) 1460 if (flushall)
1452 up(&journal->j_flush_sem); 1461 up(&journal->j_flush_sem);
1462 put_fs_excl();
1453 return err ; 1463 return err ;
1454} 1464}
1455 1465
@@ -2719,6 +2729,7 @@ relock:
2719 th->t_trans_id = journal->j_trans_id ; 2729 th->t_trans_id = journal->j_trans_id ;
2720 unlock_journal(p_s_sb) ; 2730 unlock_journal(p_s_sb) ;
2721 INIT_LIST_HEAD (&th->t_list); 2731 INIT_LIST_HEAD (&th->t_list);
2732 get_fs_excl();
2722 return 0 ; 2733 return 0 ;
2723 2734
2724out_fail: 2735out_fail:
@@ -3526,6 +3537,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
3526 BUG_ON (th->t_refcount > 1); 3537 BUG_ON (th->t_refcount > 1);
3527 BUG_ON (!th->t_trans_id); 3538 BUG_ON (!th->t_trans_id);
3528 3539
3540 put_fs_excl();
3529 current->journal_info = th->t_handle_save; 3541 current->journal_info = th->t_handle_save;
3530 reiserfs_check_lock_depth(p_s_sb, "journal end"); 3542 reiserfs_check_lock_depth(p_s_sb, "journal end");
3531 if (journal->j_len == 0) { 3543 if (journal->j_len == 0) {