aboutsummaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
author Jens Axboe <axboe@suse.de> 2005-06-27 04:55:12 -0400
committer Linus Torvalds <torvalds@ppc970.osdl.org> 2005-06-27 17:33:29 -0400
commit 22e2c507c301c3dbbcf91b4948b88f78842ee6c9 (patch)
tree 9a97c91d1362e69703aa286021daffb8a5456f4c /include
parent 020f46a39eb7b99a575b9f4d105fce2b142acdf1 (diff)
[PATCH] Update cfq io scheduler to time sliced design
This updates the CFQ io scheduler to the new time sliced design (cfq v3). It provides full process fairness, while giving excellent aggregate system throughput even for many competing processes. It supports io priorities, either inherited from the cpu nice value or set directly with the ioprio_get/set syscalls. The latter closely mimic set/getpriority.

This import is based on my latest from -mm.

Signed-off-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'include')
-rw-r--r-- include/asm-i386/unistd.h 4
-rw-r--r-- include/asm-ia64/unistd.h 2
-rw-r--r-- include/asm-ppc/unistd.h 4
-rw-r--r-- include/asm-x86_64/unistd.h 6
-rw-r--r-- include/linux/bio.h 14
-rw-r--r-- include/linux/blkdev.h 25
-rw-r--r-- include/linux/elevator.h 8
-rw-r--r-- include/linux/fs.h 19
-rw-r--r-- include/linux/init_task.h 2
-rw-r--r-- include/linux/ioprio.h 87
-rw-r--r-- include/linux/sched.h 6
-rw-r--r-- include/linux/writeback.h 6
12 files changed, 165 insertions, 18 deletions
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index 176413fb9ae3..e25e4c71a879 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -294,8 +294,10 @@
294#define __NR_add_key 286 294#define __NR_add_key 286
295#define __NR_request_key 287 295#define __NR_request_key 287
296#define __NR_keyctl 288 296#define __NR_keyctl 288
297#define __NR_ioprio_set 289
298#define __NR_ioprio_get 290
297 299
298#define NR_syscalls 289 300#define NR_syscalls 291
299 301
300/* 302/*
301 * user-visible error numbers are in the range -1 - -128: see 303 * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
index f7f43ec2483a..517f1649ee64 100644
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@@ -263,6 +263,8 @@
263#define __NR_add_key 1271 263#define __NR_add_key 1271
264#define __NR_request_key 1272 264#define __NR_request_key 1272
265#define __NR_keyctl 1273 265#define __NR_keyctl 1273
266#define __NR_ioprio_set 1274
267#define __NR_ioprio_get 1275
266#define __NR_set_zone_reclaim 1276 268#define __NR_set_zone_reclaim 1276
267 269
268#ifdef __KERNEL__ 270#ifdef __KERNEL__
diff --git a/include/asm-ppc/unistd.h b/include/asm-ppc/unistd.h
index cc51e5c9acc2..e8b79220b29c 100644
--- a/include/asm-ppc/unistd.h
+++ b/include/asm-ppc/unistd.h
@@ -277,8 +277,10 @@
277#define __NR_request_key 270 277#define __NR_request_key 270
278#define __NR_keyctl 271 278#define __NR_keyctl 271
279#define __NR_waitid 272 279#define __NR_waitid 272
280#define __NR_ioprio_set 273
281#define __NR_ioprio_get 274
280 282
281#define __NR_syscalls 273 283#define __NR_syscalls 275
282 284
283#define __NR(n) #n 285#define __NR(n) #n
284 286
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index d767adcbf0ff..6560439a83e4 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -561,8 +561,12 @@ __SYSCALL(__NR_add_key, sys_add_key)
561__SYSCALL(__NR_request_key, sys_request_key) 561__SYSCALL(__NR_request_key, sys_request_key)
562#define __NR_keyctl 250 562#define __NR_keyctl 250
563__SYSCALL(__NR_keyctl, sys_keyctl) 563__SYSCALL(__NR_keyctl, sys_keyctl)
564#define __NR_ioprio_set 251
565__SYSCALL(__NR_ioprio_set, sys_ioprio_set)
566#define __NR_ioprio_get 252
567__SYSCALL(__NR_ioprio_get, sys_ioprio_get)
564 568
565#define __NR_syscall_max __NR_keyctl 569#define __NR_syscall_max __NR_ioprio_get
566#ifndef __NO_STUBS 570#ifndef __NO_STUBS
567 571
568/* user-visible error numbers are in the range -1 - -4095 */ 572/* user-visible error numbers are in the range -1 - -4095 */
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 038022763f09..36ef29fa0d8b 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -22,6 +22,7 @@
22 22
23#include <linux/highmem.h> 23#include <linux/highmem.h>
24#include <linux/mempool.h> 24#include <linux/mempool.h>
25#include <linux/ioprio.h>
25 26
26/* Platforms may set this to teach the BIO layer about IOMMU hardware. */ 27/* Platforms may set this to teach the BIO layer about IOMMU hardware. */
27#include <asm/io.h> 28#include <asm/io.h>
@@ -150,6 +151,19 @@ struct bio {
150#define BIO_RW_SYNC 4 151#define BIO_RW_SYNC 4
151 152
152/* 153/*
154 * upper 16 bits of bi_rw define the io priority of this bio
155 */
156#define BIO_PRIO_SHIFT (8 * sizeof(unsigned long) - IOPRIO_BITS)
157#define bio_prio(bio) ((bio)->bi_rw >> BIO_PRIO_SHIFT)
158#define bio_prio_valid(bio) ioprio_valid(bio_prio(bio))
159
160#define bio_set_prio(bio, prio) do { \
161 WARN_ON(prio >= (1 << IOPRIO_BITS)); \
162 (bio)->bi_rw &= ((1UL << BIO_PRIO_SHIFT) - 1); \
163 (bio)->bi_rw |= ((unsigned long) (prio) << BIO_PRIO_SHIFT); \
164} while (0)
165
166/*
153 * various member access, note that bio_data should of course not be used 167 * various member access, note that bio_data should of course not be used
154 * on highmem page vectors 168 * on highmem page vectors
155 */ 169 */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b54a0348a890..21a8674cd149 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -54,16 +54,23 @@ struct as_io_context {
54 54
55struct cfq_queue; 55struct cfq_queue;
56struct cfq_io_context { 56struct cfq_io_context {
57 void (*dtor)(struct cfq_io_context *);
58 void (*exit)(struct cfq_io_context *);
59
60 struct io_context *ioc;
61
62 /* 57 /*
63 * circular list of cfq_io_contexts belonging to a process io context 58 * circular list of cfq_io_contexts belonging to a process io context
64 */ 59 */
65 struct list_head list; 60 struct list_head list;
66 struct cfq_queue *cfqq; 61 struct cfq_queue *cfqq;
62 void *key;
63
64 struct io_context *ioc;
65
66 unsigned long last_end_request;
67 unsigned long last_queue;
68 unsigned long ttime_total;
69 unsigned long ttime_samples;
70 unsigned long ttime_mean;
71
72 void (*dtor)(struct cfq_io_context *);
73 void (*exit)(struct cfq_io_context *);
67}; 74};
68 75
69/* 76/*
@@ -73,7 +80,9 @@ struct cfq_io_context {
73 */ 80 */
74struct io_context { 81struct io_context {
75 atomic_t refcount; 82 atomic_t refcount;
76 pid_t pid; 83 struct task_struct *task;
84
85 int (*set_ioprio)(struct io_context *, unsigned int);
77 86
78 /* 87 /*
79 * For request batching 88 * For request batching
@@ -81,8 +90,6 @@ struct io_context {
81 unsigned long last_waited; /* Time last woken after wait for request */ 90 unsigned long last_waited; /* Time last woken after wait for request */
82 int nr_batch_requests; /* Number of requests left in the batch */ 91 int nr_batch_requests; /* Number of requests left in the batch */
83 92
84 spinlock_t lock;
85
86 struct as_io_context *aic; 93 struct as_io_context *aic;
87 struct cfq_io_context *cic; 94 struct cfq_io_context *cic;
88}; 95};
@@ -134,6 +141,8 @@ struct request {
134 141
135 void *elevator_private; 142 void *elevator_private;
136 143
144 unsigned short ioprio;
145
137 int rq_status; /* should split this into a few status bits */ 146 int rq_status; /* should split this into a few status bits */
138 struct gendisk *rq_disk; 147 struct gendisk *rq_disk;
139 int errors; 148 int errors;
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index ee54f81faad5..ea6bbc2d7407 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -16,9 +16,9 @@ typedef void (elevator_remove_req_fn) (request_queue_t *, struct request *);
16typedef void (elevator_requeue_req_fn) (request_queue_t *, struct request *); 16typedef void (elevator_requeue_req_fn) (request_queue_t *, struct request *);
17typedef struct request *(elevator_request_list_fn) (request_queue_t *, struct request *); 17typedef struct request *(elevator_request_list_fn) (request_queue_t *, struct request *);
18typedef void (elevator_completed_req_fn) (request_queue_t *, struct request *); 18typedef void (elevator_completed_req_fn) (request_queue_t *, struct request *);
19typedef int (elevator_may_queue_fn) (request_queue_t *, int); 19typedef int (elevator_may_queue_fn) (request_queue_t *, int, struct bio *);
20 20
21typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, int); 21typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, struct bio *, int);
22typedef void (elevator_put_req_fn) (request_queue_t *, struct request *); 22typedef void (elevator_put_req_fn) (request_queue_t *, struct request *);
23typedef void (elevator_deactivate_req_fn) (request_queue_t *, struct request *); 23typedef void (elevator_deactivate_req_fn) (request_queue_t *, struct request *);
24 24
@@ -96,9 +96,9 @@ extern struct request *elv_former_request(request_queue_t *, struct request *);
96extern struct request *elv_latter_request(request_queue_t *, struct request *); 96extern struct request *elv_latter_request(request_queue_t *, struct request *);
97extern int elv_register_queue(request_queue_t *q); 97extern int elv_register_queue(request_queue_t *q);
98extern void elv_unregister_queue(request_queue_t *q); 98extern void elv_unregister_queue(request_queue_t *q);
99extern int elv_may_queue(request_queue_t *, int); 99extern int elv_may_queue(request_queue_t *, int, struct bio *);
100extern void elv_completed_request(request_queue_t *, struct request *); 100extern void elv_completed_request(request_queue_t *, struct request *);
101extern int elv_set_request(request_queue_t *, struct request *, int); 101extern int elv_set_request(request_queue_t *, struct request *, struct bio *, int);
102extern void elv_put_request(request_queue_t *, struct request *); 102extern void elv_put_request(request_queue_t *, struct request *);
103 103
104/* 104/*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3ae8e37bdfc8..047bde30836a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -213,6 +213,7 @@ extern int dir_notify_enable;
213#include <linux/radix-tree.h> 213#include <linux/radix-tree.h>
214#include <linux/prio_tree.h> 214#include <linux/prio_tree.h>
215#include <linux/init.h> 215#include <linux/init.h>
216#include <linux/sched.h>
216 217
217#include <asm/atomic.h> 218#include <asm/atomic.h>
218#include <asm/semaphore.h> 219#include <asm/semaphore.h>
@@ -822,16 +823,34 @@ enum {
822#define vfs_check_frozen(sb, level) \ 823#define vfs_check_frozen(sb, level) \
823 wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level))) 824 wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level)))
824 825
826static inline void get_fs_excl(void)
827{
828 atomic_inc(&current->fs_excl);
829}
830
831static inline void put_fs_excl(void)
832{
833 atomic_dec(&current->fs_excl);
834}
835
836static inline int has_fs_excl(void)
837{
838 return atomic_read(&current->fs_excl);
839}
840
841
825/* 842/*
826 * Superblock locking. 843 * Superblock locking.
827 */ 844 */
828static inline void lock_super(struct super_block * sb) 845static inline void lock_super(struct super_block * sb)
829{ 846{
847 get_fs_excl();
830 down(&sb->s_lock); 848 down(&sb->s_lock);
831} 849}
832 850
833static inline void unlock_super(struct super_block * sb) 851static inline void unlock_super(struct super_block * sb)
834{ 852{
853 put_fs_excl();
835 up(&sb->s_lock); 854 up(&sb->s_lock);
836} 855}
837 856
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 03206a425d7a..c727c195a91a 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -81,6 +81,7 @@ extern struct group_info init_groups;
81 .mm = NULL, \ 81 .mm = NULL, \
82 .active_mm = &init_mm, \ 82 .active_mm = &init_mm, \
83 .run_list = LIST_HEAD_INIT(tsk.run_list), \ 83 .run_list = LIST_HEAD_INIT(tsk.run_list), \
84 .ioprio = 0, \
84 .time_slice = HZ, \ 85 .time_slice = HZ, \
85 .tasks = LIST_HEAD_INIT(tsk.tasks), \ 86 .tasks = LIST_HEAD_INIT(tsk.tasks), \
86 .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ 87 .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \
@@ -110,6 +111,7 @@ extern struct group_info init_groups;
110 .proc_lock = SPIN_LOCK_UNLOCKED, \ 111 .proc_lock = SPIN_LOCK_UNLOCKED, \
111 .journal_info = NULL, \ 112 .journal_info = NULL, \
112 .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ 113 .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \
114 .fs_excl = ATOMIC_INIT(0), \
113} 115}
114 116
115 117
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
new file mode 100644
index 000000000000..7811300d88ee
--- /dev/null
+++ b/include/linux/ioprio.h
@@ -0,0 +1,87 @@
1#ifndef IOPRIO_H
2#define IOPRIO_H
3
4#include <linux/sched.h>
5
6/*
7 * Gives us 8 prio classes with 13-bits of data for each class
8 */
9#define IOPRIO_BITS (16)
10#define IOPRIO_CLASS_SHIFT (13)
11#define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1)
12
13#define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT)
14#define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK)
15
16#define ioprio_valid(mask) (IOPRIO_PRIO_CLASS((mask)) != IOPRIO_CLASS_NONE)
17
18/*
19 * These are the io priority groups as implemented by CFQ. RT is the realtime
20 * class, it always gets premium service. BE is the best-effort scheduling
21 * class, the default for any process. IDLE is the idle scheduling class, it
22 * is only served when no one else is using the disk.
23 */
24enum {
25 IOPRIO_CLASS_NONE,
26 IOPRIO_CLASS_RT,
27 IOPRIO_CLASS_BE,
28 IOPRIO_CLASS_IDLE,
29};
30
31/*
32 * 8 best effort priority levels are supported
33 */
34#define IOPRIO_BE_NR (8)
35
36asmlinkage int sys_ioprio_set(int, int, int);
37asmlinkage int sys_ioprio_get(int, int);
38
39enum {
40 IOPRIO_WHO_PROCESS = 1,
41 IOPRIO_WHO_PGRP,
42 IOPRIO_WHO_USER,
43};
44
45/*
46 * if process has set io priority explicitly, use that. if not, convert
47 * the cpu scheduler nice value to an io priority
48 */
49#define IOPRIO_NORM (4)
50static inline int task_ioprio(struct task_struct *task)
51{
52 WARN_ON(!ioprio_valid(task->ioprio));
53 return IOPRIO_PRIO_DATA(task->ioprio);
54}
55
56static inline int task_nice_ioprio(struct task_struct *task)
57{
58 return (task_nice(task) + 20) / 5;
59}
60
61/*
62 * For inheritance, return the highest of the two given priorities
63 */
64static inline int ioprio_best(unsigned short aprio, unsigned short bprio)
65{
66 unsigned short aclass = IOPRIO_PRIO_CLASS(aprio);
67 unsigned short bclass = IOPRIO_PRIO_CLASS(bprio);
68
69 if (!ioprio_valid(aprio))
70 return bprio;
71 if (!ioprio_valid(bprio))
72 return aprio;
73
74 if (aclass == IOPRIO_CLASS_NONE)
75 aclass = IOPRIO_CLASS_BE;
76 if (bclass == IOPRIO_CLASS_NONE)
77 bclass = IOPRIO_CLASS_BE;
78
79 if (aclass == bclass)
80 return min(aprio, bprio);
81 if (aclass > bclass)
82 return bprio;
83 else
84 return aprio;
85}
86
87#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9530b1903160..ff48815bd3a2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -608,6 +608,8 @@ struct task_struct {
608 struct list_head run_list; 608 struct list_head run_list;
609 prio_array_t *array; 609 prio_array_t *array;
610 610
611 unsigned short ioprio;
612
611 unsigned long sleep_avg; 613 unsigned long sleep_avg;
612 unsigned long long timestamp, last_ran; 614 unsigned long long timestamp, last_ran;
613 unsigned long long sched_time; /* sched_clock time spent running */ 615 unsigned long long sched_time; /* sched_clock time spent running */
@@ -763,6 +765,7 @@ struct task_struct {
763 nodemask_t mems_allowed; 765 nodemask_t mems_allowed;
764 int cpuset_mems_generation; 766 int cpuset_mems_generation;
765#endif 767#endif
768 atomic_t fs_excl; /* holding fs exclusive resources */
766}; 769};
767 770
768static inline pid_t process_group(struct task_struct *tsk) 771static inline pid_t process_group(struct task_struct *tsk)
@@ -1112,7 +1115,8 @@ extern void unhash_process(struct task_struct *p);
1112 1115
1113/* 1116/*
1114 * Protects ->fs, ->files, ->mm, ->ptrace, ->group_info, ->comm, keyring 1117 * Protects ->fs, ->files, ->mm, ->ptrace, ->group_info, ->comm, keyring
1115 * subscriptions and synchronises with wait4(). Also used in procfs. 1118 * subscriptions and synchronises with wait4(). Also used in procfs. Also
1119 * pins the final release of task.io_context.
1116 * 1120 *
1117 * Nests both inside and outside of read_lock(&tasklist_lock). 1121 * Nests both inside and outside of read_lock(&tasklist_lock).
1118 * It must not be nested with write_lock_irq(&tasklist_lock), 1122 * It must not be nested with write_lock_irq(&tasklist_lock),
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 1262cb43c3ab..d5c3fe1bf33d 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -14,11 +14,13 @@ extern struct list_head inode_unused;
14 * Yes, writeback.h requires sched.h 14 * Yes, writeback.h requires sched.h
15 * No, sched.h is not included from here. 15 * No, sched.h is not included from here.
16 */ 16 */
17static inline int current_is_pdflush(void) 17static inline int task_is_pdflush(struct task_struct *task)
18{ 18{
19 return current->flags & PF_FLUSHER; 19 return task->flags & PF_FLUSHER;
20} 20}
21 21
22#define current_is_pdflush() task_is_pdflush(current)
23
22/* 24/*
23 * fs/fs-writeback.c 25 * fs/fs-writeback.c
24 */ 26 */