author    Steven Whitehouse <swhiteho@redhat.com>  2006-03-31 15:34:58 -0500
committer Steven Whitehouse <swhiteho@redhat.com>  2006-03-31 15:34:58 -0500
commit    86579dd06deecfa6ac88d5e84e4d63c397cd6f6d (patch)
tree      b4475d3ccde53015ad84a06e4e55e64591171b75 /block
parent    7ea9ea832212c4a755650f7c7cc1ff0b63292a41 (diff)
parent    a0f067802576d4eb4c65d40b8ee7d6ea3c81dd61 (diff)
Merge branch 'master'
Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig            |  22
-rw-r--r--  block/Makefile           |   2
-rw-r--r--  block/as-iosched.c       | 144
-rw-r--r--  block/blktrace.c         | 538
-rw-r--r--  block/cfq-iosched.c      | 625
-rw-r--r--  block/deadline-iosched.c | 116
-rw-r--r--  block/elevator.c         | 178
-rw-r--r--  block/genhd.c            |  37
-rw-r--r--  block/ioctl.c            |  28
-rw-r--r--  block/ll_rw_blk.c        | 188
10 files changed, 1260 insertions, 618 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 377f6dd20e..5536839886 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -11,4 +11,26 @@ config LBD
11 your machine, or if you want to have a raid or loopback device 11 your machine, or if you want to have a raid or loopback device
12 bigger than 2TB. Otherwise say N. 12 bigger than 2TB. Otherwise say N.
13 13
14config BLK_DEV_IO_TRACE
15 bool "Support for tracing block io actions"
16 depends on SYSFS
17 select RELAY
18 select DEBUG_FS
19 help
20 Say Y here, if you want to be able to trace the block layer actions
21 on a given queue. Tracing allows you to see any traffic happening
22 on a block device queue. For more information (and the user space
23 support tools needed), fetch the blktrace app from:
24
25 git://brick.kernel.dk/data/git/blktrace.git
26
27config LSF
28 bool "Support for Large Single Files"
29 depends on X86 || (MIPS && 32BIT) || PPC32 || ARCH_S390_31 || SUPERH || UML
30 default n
31 help
32 When CONFIG_LBD is disabled, say Y here if you want to
33 handle large file(bigger than 2TB), otherwise say N.
34 When CONFIG_LBD is enabled, Y is set automatically.
35
14source block/Kconfig.iosched 36source block/Kconfig.iosched
diff --git a/block/Makefile b/block/Makefile
index 7e4f93e2b4..c05de0e003 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -8,3 +8,5 @@ obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
8obj-$(CONFIG_IOSCHED_AS) += as-iosched.o 8obj-$(CONFIG_IOSCHED_AS) += as-iosched.o
9obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o 9obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
10obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o 10obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
11
12obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
diff --git a/block/as-iosched.c b/block/as-iosched.c
index 8da3cf6689..296708cece 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -182,6 +182,9 @@ struct as_rq {
182 182
183static kmem_cache_t *arq_pool; 183static kmem_cache_t *arq_pool;
184 184
185static atomic_t ioc_count = ATOMIC_INIT(0);
186static struct completion *ioc_gone;
187
185static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq); 188static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq);
186static void as_antic_stop(struct as_data *ad); 189static void as_antic_stop(struct as_data *ad);
187 190
@@ -193,6 +196,15 @@ static void as_antic_stop(struct as_data *ad);
193static void free_as_io_context(struct as_io_context *aic) 196static void free_as_io_context(struct as_io_context *aic)
194{ 197{
195 kfree(aic); 198 kfree(aic);
199 if (atomic_dec_and_test(&ioc_count) && ioc_gone)
200 complete(ioc_gone);
201}
202
203static void as_trim(struct io_context *ioc)
204{
205 if (ioc->aic)
206 free_as_io_context(ioc->aic);
207 ioc->aic = NULL;
196} 208}
197 209
198/* Called when the task exits */ 210/* Called when the task exits */
@@ -220,6 +232,7 @@ static struct as_io_context *alloc_as_io_context(void)
220 ret->seek_total = 0; 232 ret->seek_total = 0;
221 ret->seek_samples = 0; 233 ret->seek_samples = 0;
222 ret->seek_mean = 0; 234 ret->seek_mean = 0;
235 atomic_inc(&ioc_count);
223 } 236 }
224 237
225 return ret; 238 return ret;
@@ -1696,11 +1709,6 @@ static int as_init_queue(request_queue_t *q, elevator_t *e)
1696/* 1709/*
1697 * sysfs parts below 1710 * sysfs parts below
1698 */ 1711 */
1699struct as_fs_entry {
1700 struct attribute attr;
1701 ssize_t (*show)(struct as_data *, char *);
1702 ssize_t (*store)(struct as_data *, const char *, size_t);
1703};
1704 1712
1705static ssize_t 1713static ssize_t
1706as_var_show(unsigned int var, char *page) 1714as_var_show(unsigned int var, char *page)
@@ -1717,8 +1725,9 @@ as_var_store(unsigned long *var, const char *page, size_t count)
1717 return count; 1725 return count;
1718} 1726}
1719 1727
1720static ssize_t as_est_show(struct as_data *ad, char *page) 1728static ssize_t est_time_show(elevator_t *e, char *page)
1721{ 1729{
1730 struct as_data *ad = e->elevator_data;
1722 int pos = 0; 1731 int pos = 0;
1723 1732
1724 pos += sprintf(page+pos, "%lu %% exit probability\n", 1733 pos += sprintf(page+pos, "%lu %% exit probability\n",
@@ -1734,21 +1743,23 @@ static ssize_t as_est_show(struct as_data *ad, char *page)
1734} 1743}
1735 1744
1736#define SHOW_FUNCTION(__FUNC, __VAR) \ 1745#define SHOW_FUNCTION(__FUNC, __VAR) \
1737static ssize_t __FUNC(struct as_data *ad, char *page) \ 1746static ssize_t __FUNC(elevator_t *e, char *page) \
1738{ \ 1747{ \
1748 struct as_data *ad = e->elevator_data; \
1739 return as_var_show(jiffies_to_msecs((__VAR)), (page)); \ 1749 return as_var_show(jiffies_to_msecs((__VAR)), (page)); \
1740} 1750}
1741SHOW_FUNCTION(as_readexpire_show, ad->fifo_expire[REQ_SYNC]); 1751SHOW_FUNCTION(as_read_expire_show, ad->fifo_expire[REQ_SYNC]);
1742SHOW_FUNCTION(as_writeexpire_show, ad->fifo_expire[REQ_ASYNC]); 1752SHOW_FUNCTION(as_write_expire_show, ad->fifo_expire[REQ_ASYNC]);
1743SHOW_FUNCTION(as_anticexpire_show, ad->antic_expire); 1753SHOW_FUNCTION(as_antic_expire_show, ad->antic_expire);
1744SHOW_FUNCTION(as_read_batchexpire_show, ad->batch_expire[REQ_SYNC]); 1754SHOW_FUNCTION(as_read_batch_expire_show, ad->batch_expire[REQ_SYNC]);
1745SHOW_FUNCTION(as_write_batchexpire_show, ad->batch_expire[REQ_ASYNC]); 1755SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[REQ_ASYNC]);
1746#undef SHOW_FUNCTION 1756#undef SHOW_FUNCTION
1747 1757
1748#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ 1758#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
1749static ssize_t __FUNC(struct as_data *ad, const char *page, size_t count) \ 1759static ssize_t __FUNC(elevator_t *e, const char *page, size_t count) \
1750{ \ 1760{ \
1751 int ret = as_var_store(__PTR, (page), count); \ 1761 struct as_data *ad = e->elevator_data; \
1762 int ret = as_var_store(__PTR, (page), count); \
1752 if (*(__PTR) < (MIN)) \ 1763 if (*(__PTR) < (MIN)) \
1753 *(__PTR) = (MIN); \ 1764 *(__PTR) = (MIN); \
1754 else if (*(__PTR) > (MAX)) \ 1765 else if (*(__PTR) > (MAX)) \
@@ -1756,90 +1767,26 @@ static ssize_t __FUNC(struct as_data *ad, const char *page, size_t count) \
1756 *(__PTR) = msecs_to_jiffies(*(__PTR)); \ 1767 *(__PTR) = msecs_to_jiffies(*(__PTR)); \
1757 return ret; \ 1768 return ret; \
1758} 1769}
1759STORE_FUNCTION(as_readexpire_store, &ad->fifo_expire[REQ_SYNC], 0, INT_MAX); 1770STORE_FUNCTION(as_read_expire_store, &ad->fifo_expire[REQ_SYNC], 0, INT_MAX);
1760STORE_FUNCTION(as_writeexpire_store, &ad->fifo_expire[REQ_ASYNC], 0, INT_MAX); 1771STORE_FUNCTION(as_write_expire_store, &ad->fifo_expire[REQ_ASYNC], 0, INT_MAX);
1761STORE_FUNCTION(as_anticexpire_store, &ad->antic_expire, 0, INT_MAX); 1772STORE_FUNCTION(as_antic_expire_store, &ad->antic_expire, 0, INT_MAX);
1762STORE_FUNCTION(as_read_batchexpire_store, 1773STORE_FUNCTION(as_read_batch_expire_store,
1763 &ad->batch_expire[REQ_SYNC], 0, INT_MAX); 1774 &ad->batch_expire[REQ_SYNC], 0, INT_MAX);
1764STORE_FUNCTION(as_write_batchexpire_store, 1775STORE_FUNCTION(as_write_batch_expire_store,
1765 &ad->batch_expire[REQ_ASYNC], 0, INT_MAX); 1776 &ad->batch_expire[REQ_ASYNC], 0, INT_MAX);
1766#undef STORE_FUNCTION 1777#undef STORE_FUNCTION
1767 1778
1768static struct as_fs_entry as_est_entry = { 1779#define AS_ATTR(name) \
1769 .attr = {.name = "est_time", .mode = S_IRUGO }, 1780 __ATTR(name, S_IRUGO|S_IWUSR, as_##name##_show, as_##name##_store)
1770 .show = as_est_show, 1781
1771}; 1782static struct elv_fs_entry as_attrs[] = {
1772static struct as_fs_entry as_readexpire_entry = { 1783 __ATTR_RO(est_time),
1773 .attr = {.name = "read_expire", .mode = S_IRUGO | S_IWUSR }, 1784 AS_ATTR(read_expire),
1774 .show = as_readexpire_show, 1785 AS_ATTR(write_expire),
1775 .store = as_readexpire_store, 1786 AS_ATTR(antic_expire),
1776}; 1787 AS_ATTR(read_batch_expire),
1777static struct as_fs_entry as_writeexpire_entry = { 1788 AS_ATTR(write_batch_expire),
1778 .attr = {.name = "write_expire", .mode = S_IRUGO | S_IWUSR }, 1789 __ATTR_NULL
1779 .show = as_writeexpire_show,
1780 .store = as_writeexpire_store,
1781};
1782static struct as_fs_entry as_anticexpire_entry = {
1783 .attr = {.name = "antic_expire", .mode = S_IRUGO | S_IWUSR },
1784 .show = as_anticexpire_show,
1785 .store = as_anticexpire_store,
1786};
1787static struct as_fs_entry as_read_batchexpire_entry = {
1788 .attr = {.name = "read_batch_expire", .mode = S_IRUGO | S_IWUSR },
1789 .show = as_read_batchexpire_show,
1790 .store = as_read_batchexpire_store,
1791};
1792static struct as_fs_entry as_write_batchexpire_entry = {
1793 .attr = {.name = "write_batch_expire", .mode = S_IRUGO | S_IWUSR },
1794 .show = as_write_batchexpire_show,
1795 .store = as_write_batchexpire_store,
1796};
1797
1798static struct attribute *default_attrs[] = {
1799 &as_est_entry.attr,
1800 &as_readexpire_entry.attr,
1801 &as_writeexpire_entry.attr,
1802 &as_anticexpire_entry.attr,
1803 &as_read_batchexpire_entry.attr,
1804 &as_write_batchexpire_entry.attr,
1805 NULL,
1806};
1807
1808#define to_as(atr) container_of((atr), struct as_fs_entry, attr)
1809
1810static ssize_t
1811as_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
1812{
1813 elevator_t *e = container_of(kobj, elevator_t, kobj);
1814 struct as_fs_entry *entry = to_as(attr);
1815
1816 if (!entry->show)
1817 return -EIO;
1818
1819 return entry->show(e->elevator_data, page);
1820}
1821
1822static ssize_t
1823as_attr_store(struct kobject *kobj, struct attribute *attr,
1824 const char *page, size_t length)
1825{
1826 elevator_t *e = container_of(kobj, elevator_t, kobj);
1827 struct as_fs_entry *entry = to_as(attr);
1828
1829 if (!entry->store)
1830 return -EIO;
1831
1832 return entry->store(e->elevator_data, page, length);
1833}
1834
1835static struct sysfs_ops as_sysfs_ops = {
1836 .show = as_attr_show,
1837 .store = as_attr_store,
1838};
1839
1840static struct kobj_type as_ktype = {
1841 .sysfs_ops = &as_sysfs_ops,
1842 .default_attrs = default_attrs,
1843}; 1790};
1844 1791
1845static struct elevator_type iosched_as = { 1792static struct elevator_type iosched_as = {
@@ -1860,9 +1807,10 @@ static struct elevator_type iosched_as = {
1860 .elevator_may_queue_fn = as_may_queue, 1807 .elevator_may_queue_fn = as_may_queue,
1861 .elevator_init_fn = as_init_queue, 1808 .elevator_init_fn = as_init_queue,
1862 .elevator_exit_fn = as_exit_queue, 1809 .elevator_exit_fn = as_exit_queue,
1810 .trim = as_trim,
1863 }, 1811 },
1864 1812
1865 .elevator_ktype = &as_ktype, 1813 .elevator_attrs = as_attrs,
1866 .elevator_name = "anticipatory", 1814 .elevator_name = "anticipatory",
1867 .elevator_owner = THIS_MODULE, 1815 .elevator_owner = THIS_MODULE,
1868}; 1816};
@@ -1893,7 +1841,13 @@ static int __init as_init(void)
1893 1841
1894static void __exit as_exit(void) 1842static void __exit as_exit(void)
1895{ 1843{
1844 DECLARE_COMPLETION(all_gone);
1896 elv_unregister(&iosched_as); 1845 elv_unregister(&iosched_as);
1846 ioc_gone = &all_gone;
1847 barrier();
1848 if (atomic_read(&ioc_count))
1849 complete(ioc_gone);
1850 synchronize_rcu();
1897 kmem_cache_destroy(arq_pool); 1851 kmem_cache_destroy(arq_pool);
1898} 1852}
1899 1853
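
Note: the as-iosched.c portion of this merge replaces the scheduler's private sysfs plumbing (struct as_fs_entry, its sysfs_ops and kobj_type) with a flat, NULL-terminated table of elevator attributes (as_attrs[], built with the AS_ATTR() macro) that the elevator core walks itself, and adds the ioc_count/ioc_gone pair so the module exit path can be told when the last anticipatory io context is freed. The stand-alone C sketch below only illustrates the attribute-table pattern being adopted; the struct, macro, and field names are invented for illustration and are not the kernel's elv_fs_entry API.

/* Minimal sketch of an attribute table with show/store callbacks,
 * the pattern as_attrs[] above switches to.  All names are made up. */
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>

struct tunables {
	unsigned long read_expire_ms;
	unsigned long write_expire_ms;
};

struct attr_entry {
	const char *name;
	ssize_t (*show)(struct tunables *t, char *page);
	ssize_t (*store)(struct tunables *t, const char *page, size_t count);
};

#define DEF_ATTR(field)							\
static ssize_t field##_show(struct tunables *t, char *page)		\
{									\
	return sprintf(page, "%lu\n", t->field);			\
}									\
static ssize_t field##_store(struct tunables *t, const char *page,	\
			     size_t count)				\
{									\
	t->field = strtoul(page, NULL, 10);				\
	return (ssize_t)count;						\
}

DEF_ATTR(read_expire_ms)
DEF_ATTR(write_expire_ms)

#define ATTR(field)	{ #field, field##_show, field##_store }

static struct attr_entry attrs[] = {
	ATTR(read_expire_ms),
	ATTR(write_expire_ms),
	{ NULL, NULL, NULL },		/* terminator, like __ATTR_NULL */
};

int main(void)
{
	struct tunables t = { 125, 250 };
	char page[64];
	struct attr_entry *e;

	/* generic code only needs the table, not per-scheduler sysfs glue */
	for (e = attrs; e->name; e++) {
		e->show(&t, page);
		printf("%s = %s", e->name, page);
	}
	attrs[0].store(&t, "200", 3);
	attrs[0].show(&t, page);
	printf("after store: %s = %s", attrs[0].name, page);
	return 0;
}

The design win mirrored here is that each scheduler now only declares data (a table) and the common elevator code provides the one show/store dispatcher, instead of every scheduler duplicating its own kobject boilerplate.
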
diff --git a/block/blktrace.c b/block/blktrace.c
new file mode 100644
index 0000000000..36f3a17227
--- /dev/null
+++ b/block/blktrace.c
@@ -0,0 +1,538 @@
1/*
2 * Copyright (C) 2006 Jens Axboe <axboe@suse.de>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 *
17 */
18#include <linux/config.h>
19#include <linux/kernel.h>
20#include <linux/blkdev.h>
21#include <linux/blktrace_api.h>
22#include <linux/percpu.h>
23#include <linux/init.h>
24#include <linux/mutex.h>
25#include <linux/debugfs.h>
26#include <asm/uaccess.h>
27
28static DEFINE_PER_CPU(unsigned long long, blk_trace_cpu_offset) = { 0, };
29static unsigned int blktrace_seq __read_mostly = 1;
30
31/*
32 * Send out a notify for this process, if we haven't done so since a trace
33 * started
34 */
35static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
36{
37 struct blk_io_trace *t;
38
39 t = relay_reserve(bt->rchan, sizeof(*t) + sizeof(tsk->comm));
40 if (t) {
41 t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
42 t->device = bt->dev;
43 t->action = BLK_TC_ACT(BLK_TC_NOTIFY);
44 t->pid = tsk->pid;
45 t->cpu = smp_processor_id();
46 t->pdu_len = sizeof(tsk->comm);
47 memcpy((void *) t + sizeof(*t), tsk->comm, t->pdu_len);
48 tsk->btrace_seq = blktrace_seq;
49 }
50}
51
52static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
53 pid_t pid)
54{
55 if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
56 return 1;
57 if (sector < bt->start_lba || sector > bt->end_lba)
58 return 1;
59 if (bt->pid && pid != bt->pid)
60 return 1;
61
62 return 0;
63}
64
65/*
66 * Data direction bit lookup
67 */
68static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) };
69
70/*
71 * Bio action bits of interest
72 */
73static u32 bio_act[3] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC) };
74
75/*
76 * More could be added as needed, taking care to increment the decrementer
77 * to get correct indexing
78 */
79#define trace_barrier_bit(rw) \
80 (((rw) & (1 << BIO_RW_BARRIER)) >> (BIO_RW_BARRIER - 0))
81#define trace_sync_bit(rw) \
82 (((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1))
83
84/*
85 * The worker for the various blk_add_trace*() types. Fills out a
86 * blk_io_trace structure and places it in a per-cpu subbuffer.
87 */
88void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
89 int rw, u32 what, int error, int pdu_len, void *pdu_data)
90{
91 struct task_struct *tsk = current;
92 struct blk_io_trace *t;
93 unsigned long flags;
94 unsigned long *sequence;
95 pid_t pid;
96 int cpu;
97
98 if (unlikely(bt->trace_state != Blktrace_running))
99 return;
100
101 what |= ddir_act[rw & WRITE];
102 what |= bio_act[trace_barrier_bit(rw)];
103 what |= bio_act[trace_sync_bit(rw)];
104
105 pid = tsk->pid;
106 if (unlikely(act_log_check(bt, what, sector, pid)))
107 return;
108
109 /*
110 * A word about the locking here - we disable interrupts to reserve
111 * some space in the relay per-cpu buffer, to prevent an irq
112 * from coming in and stepping on our toes. Once reserved, it's
113 * enough to get preemption disabled to prevent read of this data
114 * before we are through filling it. get_cpu()/put_cpu() does this
115 * for us
116 */
117 local_irq_save(flags);
118
119 if (unlikely(tsk->btrace_seq != blktrace_seq))
120 trace_note_tsk(bt, tsk);
121
122 t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
123 if (t) {
124 cpu = smp_processor_id();
125 sequence = per_cpu_ptr(bt->sequence, cpu);
126
127 t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
128 t->sequence = ++(*sequence);
129 t->time = sched_clock() - per_cpu(blk_trace_cpu_offset, cpu);
130 t->sector = sector;
131 t->bytes = bytes;
132 t->action = what;
133 t->pid = pid;
134 t->device = bt->dev;
135 t->cpu = cpu;
136 t->error = error;
137 t->pdu_len = pdu_len;
138
139 if (pdu_len)
140 memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
141 }
142
143 local_irq_restore(flags);
144}
145
146EXPORT_SYMBOL_GPL(__blk_add_trace);
147
148static struct dentry *blk_tree_root;
149static struct mutex blk_tree_mutex;
150static unsigned int root_users;
151
152static inline void blk_remove_root(void)
153{
154 if (blk_tree_root) {
155 debugfs_remove(blk_tree_root);
156 blk_tree_root = NULL;
157 }
158}
159
160static void blk_remove_tree(struct dentry *dir)
161{
162 mutex_lock(&blk_tree_mutex);
163 debugfs_remove(dir);
164 if (--root_users == 0)
165 blk_remove_root();
166 mutex_unlock(&blk_tree_mutex);
167}
168
169static struct dentry *blk_create_tree(const char *blk_name)
170{
171 struct dentry *dir = NULL;
172
173 mutex_lock(&blk_tree_mutex);
174
175 if (!blk_tree_root) {
176 blk_tree_root = debugfs_create_dir("block", NULL);
177 if (!blk_tree_root)
178 goto err;
179 }
180
181 dir = debugfs_create_dir(blk_name, blk_tree_root);
182 if (dir)
183 root_users++;
184 else
185 blk_remove_root();
186
187err:
188 mutex_unlock(&blk_tree_mutex);
189 return dir;
190}
191
192static void blk_trace_cleanup(struct blk_trace *bt)
193{
194 relay_close(bt->rchan);
195 debugfs_remove(bt->dropped_file);
196 blk_remove_tree(bt->dir);
197 free_percpu(bt->sequence);
198 kfree(bt);
199}
200
201static int blk_trace_remove(request_queue_t *q)
202{
203 struct blk_trace *bt;
204
205 bt = xchg(&q->blk_trace, NULL);
206 if (!bt)
207 return -EINVAL;
208
209 if (bt->trace_state == Blktrace_setup ||
210 bt->trace_state == Blktrace_stopped)
211 blk_trace_cleanup(bt);
212
213 return 0;
214}
215
216static int blk_dropped_open(struct inode *inode, struct file *filp)
217{
218 filp->private_data = inode->u.generic_ip;
219
220 return 0;
221}
222
223static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
224 size_t count, loff_t *ppos)
225{
226 struct blk_trace *bt = filp->private_data;
227 char buf[16];
228
229 snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
230
231 return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
232}
233
234static struct file_operations blk_dropped_fops = {
235 .owner = THIS_MODULE,
236 .open = blk_dropped_open,
237 .read = blk_dropped_read,
238};
239
240/*
241 * Keep track of how many times we encountered a full subbuffer, to aid
242 * the user space app in telling how many lost events there were.
243 */
244static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
245 void *prev_subbuf, size_t prev_padding)
246{
247 struct blk_trace *bt;
248
249 if (!relay_buf_full(buf))
250 return 1;
251
252 bt = buf->chan->private_data;
253 atomic_inc(&bt->dropped);
254 return 0;
255}
256
257static int blk_remove_buf_file_callback(struct dentry *dentry)
258{
259 debugfs_remove(dentry);
260 return 0;
261}
262
263static struct dentry *blk_create_buf_file_callback(const char *filename,
264 struct dentry *parent,
265 int mode,
266 struct rchan_buf *buf,
267 int *is_global)
268{
269 return debugfs_create_file(filename, mode, parent, buf,
270 &relay_file_operations);
271}
272
273static struct rchan_callbacks blk_relay_callbacks = {
274 .subbuf_start = blk_subbuf_start_callback,
275 .create_buf_file = blk_create_buf_file_callback,
276 .remove_buf_file = blk_remove_buf_file_callback,
277};
278
279/*
280 * Setup everything required to start tracing
281 */
282static int blk_trace_setup(request_queue_t *q, struct block_device *bdev,
283 char __user *arg)
284{
285 struct blk_user_trace_setup buts;
286 struct blk_trace *old_bt, *bt = NULL;
287 struct dentry *dir = NULL;
288 char b[BDEVNAME_SIZE];
289 int ret, i;
290
291 if (copy_from_user(&buts, arg, sizeof(buts)))
292 return -EFAULT;
293
294 if (!buts.buf_size || !buts.buf_nr)
295 return -EINVAL;
296
297 strcpy(buts.name, bdevname(bdev, b));
298
299 /*
300 * some device names have larger paths - convert the slashes
301 * to underscores for this to work as expected
302 */
303 for (i = 0; i < strlen(buts.name); i++)
304 if (buts.name[i] == '/')
305 buts.name[i] = '_';
306
307 if (copy_to_user(arg, &buts, sizeof(buts)))
308 return -EFAULT;
309
310 ret = -ENOMEM;
311 bt = kzalloc(sizeof(*bt), GFP_KERNEL);
312 if (!bt)
313 goto err;
314
315 bt->sequence = alloc_percpu(unsigned long);
316 if (!bt->sequence)
317 goto err;
318
319 ret = -ENOENT;
320 dir = blk_create_tree(buts.name);
321 if (!dir)
322 goto err;
323
324 bt->dir = dir;
325 bt->dev = bdev->bd_dev;
326 atomic_set(&bt->dropped, 0);
327
328 ret = -EIO;
329 bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
330 if (!bt->dropped_file)
331 goto err;
332
333 bt->rchan = relay_open("trace", dir, buts.buf_size, buts.buf_nr, &blk_relay_callbacks);
334 if (!bt->rchan)
335 goto err;
336 bt->rchan->private_data = bt;
337
338 bt->act_mask = buts.act_mask;
339 if (!bt->act_mask)
340 bt->act_mask = (u16) -1;
341
342 bt->start_lba = buts.start_lba;
343 bt->end_lba = buts.end_lba;
344 if (!bt->end_lba)
345 bt->end_lba = -1ULL;
346
347 bt->pid = buts.pid;
348 bt->trace_state = Blktrace_setup;
349
350 ret = -EBUSY;
351 old_bt = xchg(&q->blk_trace, bt);
352 if (old_bt) {
353 (void) xchg(&q->blk_trace, old_bt);
354 goto err;
355 }
356
357 return 0;
358err:
359 if (dir)
360 blk_remove_tree(dir);
361 if (bt) {
362 if (bt->dropped_file)
363 debugfs_remove(bt->dropped_file);
364 if (bt->sequence)
365 free_percpu(bt->sequence);
366 if (bt->rchan)
367 relay_close(bt->rchan);
368 kfree(bt);
369 }
370 return ret;
371}
372
373static int blk_trace_startstop(request_queue_t *q, int start)
374{
375 struct blk_trace *bt;
376 int ret;
377
378 if ((bt = q->blk_trace) == NULL)
379 return -EINVAL;
380
381 /*
382 * For starting a trace, we can transition from a setup or stopped
383 * trace. For stopping a trace, the state must be running
384 */
385 ret = -EINVAL;
386 if (start) {
387 if (bt->trace_state == Blktrace_setup ||
388 bt->trace_state == Blktrace_stopped) {
389 blktrace_seq++;
390 smp_mb();
391 bt->trace_state = Blktrace_running;
392 ret = 0;
393 }
394 } else {
395 if (bt->trace_state == Blktrace_running) {
396 bt->trace_state = Blktrace_stopped;
397 relay_flush(bt->rchan);
398 ret = 0;
399 }
400 }
401
402 return ret;
403}
404
405/**
406 * blk_trace_ioctl: - handle the ioctls associated with tracing
407 * @bdev: the block device
408 * @cmd: the ioctl cmd
409 * @arg: the argument data, if any
410 *
411 **/
412int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
413{
414 request_queue_t *q;
415 int ret, start = 0;
416
417 q = bdev_get_queue(bdev);
418 if (!q)
419 return -ENXIO;
420
421 mutex_lock(&bdev->bd_mutex);
422
423 switch (cmd) {
424 case BLKTRACESETUP:
425 ret = blk_trace_setup(q, bdev, arg);
426 break;
427 case BLKTRACESTART:
428 start = 1;
429 case BLKTRACESTOP:
430 ret = blk_trace_startstop(q, start);
431 break;
432 case BLKTRACETEARDOWN:
433 ret = blk_trace_remove(q);
434 break;
435 default:
436 ret = -ENOTTY;
437 break;
438 }
439
440 mutex_unlock(&bdev->bd_mutex);
441 return ret;
442}
443
444/**
445 * blk_trace_shutdown: - stop and cleanup trace structures
446 * @q: the request queue associated with the device
447 *
448 **/
449void blk_trace_shutdown(request_queue_t *q)
450{
451 blk_trace_startstop(q, 0);
452 blk_trace_remove(q);
453}
454
455/*
456 * Average offset over two calls to sched_clock() with a gettimeofday()
457 * in the middle
458 */
459static void blk_check_time(unsigned long long *t)
460{
461 unsigned long long a, b;
462 struct timeval tv;
463
464 a = sched_clock();
465 do_gettimeofday(&tv);
466 b = sched_clock();
467
468 *t = tv.tv_sec * 1000000000 + tv.tv_usec * 1000;
469 *t -= (a + b) / 2;
470}
471
472static void blk_trace_check_cpu_time(void *data)
473{
474 unsigned long long *t;
475 int cpu = get_cpu();
476
477 t = &per_cpu(blk_trace_cpu_offset, cpu);
478
479 /*
480 * Just call it twice, hopefully the second call will be cache hot
481 * and a little more precise
482 */
483 blk_check_time(t);
484 blk_check_time(t);
485
486 put_cpu();
487}
488
489/*
490 * Call blk_trace_check_cpu_time() on each CPU to calibrate our inter-CPU
491 * timings
492 */
493static void blk_trace_calibrate_offsets(void)
494{
495 unsigned long flags;
496
497 smp_call_function(blk_trace_check_cpu_time, NULL, 1, 1);
498 local_irq_save(flags);
499 blk_trace_check_cpu_time(NULL);
500 local_irq_restore(flags);
501}
502
503static void blk_trace_set_ht_offsets(void)
504{
505#if defined(CONFIG_SCHED_SMT)
506 int cpu, i;
507
508 /*
509 * now make sure HT siblings have the same time offset
510 */
511 preempt_disable();
512 for_each_online_cpu(cpu) {
513 unsigned long long *cpu_off, *sibling_off;
514
515 for_each_cpu_mask(i, cpu_sibling_map[cpu]) {
516 if (i == cpu)
517 continue;
518
519 cpu_off = &per_cpu(blk_trace_cpu_offset, cpu);
520 sibling_off = &per_cpu(blk_trace_cpu_offset, i);
521 *sibling_off = *cpu_off;
522 }
523 }
524 preempt_enable();
525#endif
526}
527
528static __init int blk_trace_init(void)
529{
530 mutex_init(&blk_tree_mutex);
531 blk_trace_calibrate_offsets();
532 blk_trace_set_ht_offsets();
533
534 return 0;
535}
536
537module_init(blk_trace_init);
538
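
Note: blktrace.c above is driven entirely through the BLKTRACESETUP/BLKTRACESTART/BLKTRACESTOP/BLKTRACETEARDOWN ioctls handled in blk_trace_ioctl(), with the relay output appearing under the debugfs "block/<name>" directory it creates. A rough user-space sketch of that control flow follows; it assumes the struct blk_user_trace_setup definition and the BLKTRACE* ioctl numbers from the header side of this patch set (not part of this block/ diff), and the buffer sizes chosen are arbitrary.

/* Hypothetical user-space driver for the ioctls above (run as root). */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/blktrace_api.h>	/* struct blk_user_trace_setup (assumed location) */
#include <linux/fs.h>		/* BLKTRACE* ioctls (assumed location) */

int main(int argc, char **argv)
{
	struct blk_user_trace_setup buts;
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <blockdev>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&buts, 0, sizeof(buts));
	buts.buf_size = 512 * 1024;	/* size of each relay sub-buffer */
	buts.buf_nr = 4;		/* sub-buffers per CPU */
	buts.act_mask = 0;		/* 0: kernel widens this to trace everything */

	/* creates the debugfs block/<name>/ directory with relay trace files */
	if (ioctl(fd, BLKTRACESETUP, &buts) < 0) {
		perror("BLKTRACESETUP");
		return 1;
	}
	printf("tracing %s, relay output under debugfs block/%s\n", argv[1], buts.name);

	ioctl(fd, BLKTRACESTART, 0);
	sleep(5);			/* let events accumulate */
	ioctl(fd, BLKTRACESTOP, 0);
	ioctl(fd, BLKTRACETEARDOWN, 0);

	close(fd);
	return 0;
}

In practice the blktrace utility referenced in the Kconfig help text does exactly this and additionally reads the per-CPU relay files while tracing runs.
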
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index c8dbe38c81..67d446de02 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -6,21 +6,13 @@
6 * 6 *
7 * Copyright (C) 2003 Jens Axboe <axboe@suse.de> 7 * Copyright (C) 2003 Jens Axboe <axboe@suse.de>
8 */ 8 */
9#include <linux/kernel.h>
10#include <linux/fs.h>
11#include <linux/blkdev.h>
12#include <linux/elevator.h>
13#include <linux/bio.h>
14#include <linux/config.h> 9#include <linux/config.h>
15#include <linux/module.h> 10#include <linux/module.h>
16#include <linux/slab.h> 11#include <linux/blkdev.h>
17#include <linux/init.h> 12#include <linux/elevator.h>
18#include <linux/compiler.h>
19#include <linux/hash.h> 13#include <linux/hash.h>
20#include <linux/rbtree.h> 14#include <linux/rbtree.h>
21#include <linux/mempool.h>
22#include <linux/ioprio.h> 15#include <linux/ioprio.h>
23#include <linux/writeback.h>
24 16
25/* 17/*
26 * tunables 18 * tunables
@@ -34,18 +26,14 @@ static const int cfq_back_penalty = 2; /* penalty of a backwards seek */
34static const int cfq_slice_sync = HZ / 10; 26static const int cfq_slice_sync = HZ / 10;
35static int cfq_slice_async = HZ / 25; 27static int cfq_slice_async = HZ / 25;
36static const int cfq_slice_async_rq = 2; 28static const int cfq_slice_async_rq = 2;
37static int cfq_slice_idle = HZ / 100; 29static int cfq_slice_idle = HZ / 70;
38 30
39#define CFQ_IDLE_GRACE (HZ / 10) 31#define CFQ_IDLE_GRACE (HZ / 10)
40#define CFQ_SLICE_SCALE (5) 32#define CFQ_SLICE_SCALE (5)
41 33
42#define CFQ_KEY_ASYNC (0) 34#define CFQ_KEY_ASYNC (0)
43#define CFQ_KEY_ANY (0xffff)
44 35
45/* 36static DEFINE_RWLOCK(cfq_exit_lock);
46 * disable queueing at the driver/hardware level
47 */
48static const int cfq_max_depth = 2;
49 37
50/* 38/*
51 * for the hash of cfqq inside the cfqd 39 * for the hash of cfqq inside the cfqd
@@ -89,6 +77,9 @@ static kmem_cache_t *crq_pool;
89static kmem_cache_t *cfq_pool; 77static kmem_cache_t *cfq_pool;
90static kmem_cache_t *cfq_ioc_pool; 78static kmem_cache_t *cfq_ioc_pool;
91 79
80static atomic_t ioc_count = ATOMIC_INIT(0);
81static struct completion *ioc_gone;
82
92#define CFQ_PRIO_LISTS IOPRIO_BE_NR 83#define CFQ_PRIO_LISTS IOPRIO_BE_NR
93#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) 84#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
94#define cfq_class_be(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_BE) 85#define cfq_class_be(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_BE)
@@ -105,11 +96,12 @@ static kmem_cache_t *cfq_ioc_pool;
105#define cfq_cfqq_sync(cfqq) \ 96#define cfq_cfqq_sync(cfqq) \
106 (cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC]) 97 (cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC])
107 98
99#define sample_valid(samples) ((samples) > 80)
100
108/* 101/*
109 * Per block device queue structure 102 * Per block device queue structure
110 */ 103 */
111struct cfq_data { 104struct cfq_data {
112 atomic_t ref;
113 request_queue_t *queue; 105 request_queue_t *queue;
114 106
115 /* 107 /*
@@ -174,7 +166,8 @@ struct cfq_data {
174 unsigned int cfq_slice[2]; 166 unsigned int cfq_slice[2];
175 unsigned int cfq_slice_async_rq; 167 unsigned int cfq_slice_async_rq;
176 unsigned int cfq_slice_idle; 168 unsigned int cfq_slice_idle;
177 unsigned int cfq_max_depth; 169
170 struct list_head cic_list;
178}; 171};
179 172
180/* 173/*
@@ -288,7 +281,7 @@ CFQ_CRQ_FNS(is_sync);
288 281
289static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short); 282static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short);
290static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *); 283static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *);
291static void cfq_put_cfqd(struct cfq_data *cfqd); 284static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask);
292 285
293#define process_sync(tsk) ((tsk)->flags & PF_SYNCWRITE) 286#define process_sync(tsk) ((tsk)->flags & PF_SYNCWRITE)
294 287
@@ -345,17 +338,27 @@ static int cfq_queue_empty(request_queue_t *q)
345 return !cfqd->busy_queues; 338 return !cfqd->busy_queues;
346} 339}
347 340
341static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
342{
343 if (rw == READ || process_sync(task))
344 return task->pid;
345
346 return CFQ_KEY_ASYNC;
347}
348
348/* 349/*
349 * Lifted from AS - choose which of crq1 and crq2 that is best served now. 350 * Lifted from AS - choose which of crq1 and crq2 that is best served now.
350 * We choose the request that is closest to the head right now. Distance 351 * We choose the request that is closest to the head right now. Distance
351 * behind the head are penalized and only allowed to a certain extent. 352 * behind the head is penalized and only allowed to a certain extent.
352 */ 353 */
353static struct cfq_rq * 354static struct cfq_rq *
354cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2) 355cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2)
355{ 356{
356 sector_t last, s1, s2, d1 = 0, d2 = 0; 357 sector_t last, s1, s2, d1 = 0, d2 = 0;
357 int r1_wrap = 0, r2_wrap = 0; /* requests are behind the disk head */
358 unsigned long back_max; 358 unsigned long back_max;
359#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */
360#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */
361 unsigned wrap = 0; /* bit mask: requests behind the disk head? */
359 362
360 if (crq1 == NULL || crq1 == crq2) 363 if (crq1 == NULL || crq1 == crq2)
361 return crq2; 364 return crq2;
@@ -387,35 +390,47 @@ cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2)
387 else if (s1 + back_max >= last) 390 else if (s1 + back_max >= last)
388 d1 = (last - s1) * cfqd->cfq_back_penalty; 391 d1 = (last - s1) * cfqd->cfq_back_penalty;
389 else 392 else
390 r1_wrap = 1; 393 wrap |= CFQ_RQ1_WRAP;
391 394
392 if (s2 >= last) 395 if (s2 >= last)
393 d2 = s2 - last; 396 d2 = s2 - last;
394 else if (s2 + back_max >= last) 397 else if (s2 + back_max >= last)
395 d2 = (last - s2) * cfqd->cfq_back_penalty; 398 d2 = (last - s2) * cfqd->cfq_back_penalty;
396 else 399 else
397 r2_wrap = 1; 400 wrap |= CFQ_RQ2_WRAP;
398 401
399 /* Found required data */ 402 /* Found required data */
400 if (!r1_wrap && r2_wrap) 403
401 return crq1; 404 /*
402 else if (!r2_wrap && r1_wrap) 405 * By doing switch() on the bit mask "wrap" we avoid having to
403 return crq2; 406 * check two variables for all permutations: --> faster!
404 else if (r1_wrap && r2_wrap) { 407 */
405 /* both behind the head */ 408 switch (wrap) {
406 if (s1 <= s2) 409 case 0: /* common case for CFQ: crq1 and crq2 not wrapped */
410 if (d1 < d2)
407 return crq1; 411 return crq1;
408 else 412 else if (d2 < d1)
409 return crq2; 413 return crq2;
410 } 414 else {
415 if (s1 >= s2)
416 return crq1;
417 else
418 return crq2;
419 }
411 420
412 /* Both requests in front of the head */ 421 case CFQ_RQ2_WRAP:
413 if (d1 < d2)
414 return crq1; 422 return crq1;
415 else if (d2 < d1) 423 case CFQ_RQ1_WRAP:
416 return crq2; 424 return crq2;
417 else { 425 case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both crqs wrapped */
418 if (s1 >= s2) 426 default:
427 /*
428 * Since both rqs are wrapped,
429 * start with the one that's further behind head
430 * (--> only *one* back seek required),
431 * since back seek takes more time than forward.
432 */
433 if (s1 <= s2)
419 return crq1; 434 return crq1;
420 else 435 else
421 return crq2; 436 return crq2;
@@ -614,15 +629,20 @@ cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
614 cfq_add_crq_rb(crq); 629 cfq_add_crq_rb(crq);
615} 630}
616 631
617static struct request *cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector) 632static struct request *
618 633cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
619{ 634{
620 struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->pid, CFQ_KEY_ANY); 635 struct task_struct *tsk = current;
636 pid_t key = cfq_queue_pid(tsk, bio_data_dir(bio));
637 struct cfq_queue *cfqq;
621 struct rb_node *n; 638 struct rb_node *n;
639 sector_t sector;
622 640
641 cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio);
623 if (!cfqq) 642 if (!cfqq)
624 goto out; 643 goto out;
625 644
645 sector = bio->bi_sector + bio_sectors(bio);
626 n = cfqq->sort_list.rb_node; 646 n = cfqq->sort_list.rb_node;
627 while (n) { 647 while (n) {
628 struct cfq_rq *crq = rb_entry_crq(n); 648 struct cfq_rq *crq = rb_entry_crq(n);
@@ -676,7 +696,7 @@ cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
676 goto out; 696 goto out;
677 } 697 }
678 698
679 __rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio)); 699 __rq = cfq_find_rq_fmerge(cfqd, bio);
680 if (__rq && elv_rq_merge_ok(__rq, bio)) { 700 if (__rq && elv_rq_merge_ok(__rq, bio)) {
681 ret = ELEVATOR_FRONT_MERGE; 701 ret = ELEVATOR_FRONT_MERGE;
682 goto out; 702 goto out;
@@ -879,6 +899,7 @@ static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
879static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) 899static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
880 900
881{ 901{
902 struct cfq_io_context *cic;
882 unsigned long sl; 903 unsigned long sl;
883 904
884 WARN_ON(!RB_EMPTY(&cfqq->sort_list)); 905 WARN_ON(!RB_EMPTY(&cfqq->sort_list));
@@ -894,13 +915,23 @@ static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
894 /* 915 /*
895 * task has exited, don't wait 916 * task has exited, don't wait
896 */ 917 */
897 if (cfqd->active_cic && !cfqd->active_cic->ioc->task) 918 cic = cfqd->active_cic;
919 if (!cic || !cic->ioc->task)
898 return 0; 920 return 0;
899 921
900 cfq_mark_cfqq_must_dispatch(cfqq); 922 cfq_mark_cfqq_must_dispatch(cfqq);
901 cfq_mark_cfqq_wait_request(cfqq); 923 cfq_mark_cfqq_wait_request(cfqq);
902 924
903 sl = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle); 925 sl = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle);
926
927 /*
928 * we don't want to idle for seeks, but we do want to allow
929 * fair distribution of slice time for a process doing back-to-back
930 * seeks. so allow a little bit of time for him to submit a new rq
931 */
932 if (sample_valid(cic->seek_samples) && cic->seek_mean > 131072)
933 sl = 2;
934
904 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 935 mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
905 return 1; 936 return 1;
906} 937}
@@ -1117,13 +1148,6 @@ cfq_dispatch_requests(request_queue_t *q, int force)
1117 if (cfqq) { 1148 if (cfqq) {
1118 int max_dispatch; 1149 int max_dispatch;
1119 1150
1120 /*
1121 * if idle window is disabled, allow queue buildup
1122 */
1123 if (!cfq_cfqq_idle_window(cfqq) &&
1124 cfqd->rq_in_driver >= cfqd->cfq_max_depth)
1125 return 0;
1126
1127 cfq_clear_cfqq_must_dispatch(cfqq); 1151 cfq_clear_cfqq_must_dispatch(cfqq);
1128 cfq_clear_cfqq_wait_request(cfqq); 1152 cfq_clear_cfqq_wait_request(cfqq);
1129 del_timer(&cfqd->idle_slice_timer); 1153 del_timer(&cfqd->idle_slice_timer);
@@ -1160,8 +1184,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
1160 if (unlikely(cfqd->active_queue == cfqq)) 1184 if (unlikely(cfqd->active_queue == cfqq))
1161 __cfq_slice_expired(cfqd, cfqq, 0); 1185 __cfq_slice_expired(cfqd, cfqq, 0);
1162 1186
1163 cfq_put_cfqd(cfqq->cfqd);
1164
1165 /* 1187 /*
1166 * it's on the empty list and still hashed 1188 * it's on the empty list and still hashed
1167 */ 1189 */
@@ -1175,13 +1197,13 @@ __cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio,
1175 const int hashval) 1197 const int hashval)
1176{ 1198{
1177 struct hlist_head *hash_list = &cfqd->cfq_hash[hashval]; 1199 struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
1178 struct hlist_node *entry, *next; 1200 struct hlist_node *entry;
1201 struct cfq_queue *__cfqq;
1179 1202
1180 hlist_for_each_safe(entry, next, hash_list) { 1203 hlist_for_each_entry(__cfqq, entry, hash_list, cfq_hash) {
1181 struct cfq_queue *__cfqq = list_entry_qhash(entry); 1204 const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->org_ioprio_class, __cfqq->org_ioprio);
1182 const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->ioprio_class, __cfqq->ioprio);
1183 1205
1184 if (__cfqq->key == key && (__p == prio || prio == CFQ_KEY_ANY)) 1206 if (__cfqq->key == key && (__p == prio || !prio))
1185 return __cfqq; 1207 return __cfqq;
1186 } 1208 }
1187 1209
@@ -1194,17 +1216,27 @@ cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio)
1194 return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT)); 1216 return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT));
1195} 1217}
1196 1218
1197static void cfq_free_io_context(struct cfq_io_context *cic) 1219static void cfq_free_io_context(struct io_context *ioc)
1198{ 1220{
1199 struct cfq_io_context *__cic; 1221 struct cfq_io_context *__cic;
1200 struct list_head *entry, *next; 1222 struct rb_node *n;
1223 int freed = 0;
1201 1224
1202 list_for_each_safe(entry, next, &cic->list) { 1225 while ((n = rb_first(&ioc->cic_root)) != NULL) {
1203 __cic = list_entry(entry, struct cfq_io_context, list); 1226 __cic = rb_entry(n, struct cfq_io_context, rb_node);
1227 rb_erase(&__cic->rb_node, &ioc->cic_root);
1204 kmem_cache_free(cfq_ioc_pool, __cic); 1228 kmem_cache_free(cfq_ioc_pool, __cic);
1229 freed++;
1205 } 1230 }
1206 1231
1207 kmem_cache_free(cfq_ioc_pool, cic); 1232 if (atomic_sub_and_test(freed, &ioc_count) && ioc_gone)
1233 complete(ioc_gone);
1234}
1235
1236static void cfq_trim(struct io_context *ioc)
1237{
1238 ioc->set_ioprio = NULL;
1239 cfq_free_io_context(ioc);
1208} 1240}
1209 1241
1210/* 1242/*
@@ -1212,43 +1244,57 @@ static void cfq_free_io_context(struct cfq_io_context *cic)
1212 */ 1244 */
1213static void cfq_exit_single_io_context(struct cfq_io_context *cic) 1245static void cfq_exit_single_io_context(struct cfq_io_context *cic)
1214{ 1246{
1215 struct cfq_data *cfqd = cic->cfqq->cfqd; 1247 struct cfq_data *cfqd = cic->key;
1216 request_queue_t *q = cfqd->queue; 1248 request_queue_t *q;
1249
1250 if (!cfqd)
1251 return;
1252
1253 q = cfqd->queue;
1217 1254
1218 WARN_ON(!irqs_disabled()); 1255 WARN_ON(!irqs_disabled());
1219 1256
1220 spin_lock(q->queue_lock); 1257 spin_lock(q->queue_lock);
1221 1258
1222 if (unlikely(cic->cfqq == cfqd->active_queue)) 1259 if (cic->cfqq[ASYNC]) {
1223 __cfq_slice_expired(cfqd, cic->cfqq, 0); 1260 if (unlikely(cic->cfqq[ASYNC] == cfqd->active_queue))
1261 __cfq_slice_expired(cfqd, cic->cfqq[ASYNC], 0);
1262 cfq_put_queue(cic->cfqq[ASYNC]);
1263 cic->cfqq[ASYNC] = NULL;
1264 }
1224 1265
1225 cfq_put_queue(cic->cfqq); 1266 if (cic->cfqq[SYNC]) {
1226 cic->cfqq = NULL; 1267 if (unlikely(cic->cfqq[SYNC] == cfqd->active_queue))
1268 __cfq_slice_expired(cfqd, cic->cfqq[SYNC], 0);
1269 cfq_put_queue(cic->cfqq[SYNC]);
1270 cic->cfqq[SYNC] = NULL;
1271 }
1272
1273 cic->key = NULL;
1274 list_del_init(&cic->queue_list);
1227 spin_unlock(q->queue_lock); 1275 spin_unlock(q->queue_lock);
1228} 1276}
1229 1277
1230/* 1278static void cfq_exit_io_context(struct io_context *ioc)
1231 * Another task may update the task cic list, if it is doing a queue lookup
1232 * on its behalf. cfq_cic_lock excludes such concurrent updates
1233 */
1234static void cfq_exit_io_context(struct cfq_io_context *cic)
1235{ 1279{
1236 struct cfq_io_context *__cic; 1280 struct cfq_io_context *__cic;
1237 struct list_head *entry;
1238 unsigned long flags; 1281 unsigned long flags;
1239 1282 struct rb_node *n;
1240 local_irq_save(flags);
1241 1283
1242 /* 1284 /*
1243 * put the reference this task is holding to the various queues 1285 * put the reference this task is holding to the various queues
1244 */ 1286 */
1245 list_for_each(entry, &cic->list) { 1287 read_lock_irqsave(&cfq_exit_lock, flags);
1246 __cic = list_entry(entry, struct cfq_io_context, list); 1288
1289 n = rb_first(&ioc->cic_root);
1290 while (n != NULL) {
1291 __cic = rb_entry(n, struct cfq_io_context, rb_node);
1292
1247 cfq_exit_single_io_context(__cic); 1293 cfq_exit_single_io_context(__cic);
1294 n = rb_next(n);
1248 } 1295 }
1249 1296
1250 cfq_exit_single_io_context(cic); 1297 read_unlock_irqrestore(&cfq_exit_lock, flags);
1251 local_irq_restore(flags);
1252} 1298}
1253 1299
1254static struct cfq_io_context * 1300static struct cfq_io_context *
@@ -1257,15 +1303,18 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1257 struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_mask); 1303 struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_mask);
1258 1304
1259 if (cic) { 1305 if (cic) {
1260 INIT_LIST_HEAD(&cic->list); 1306 RB_CLEAR(&cic->rb_node);
1261 cic->cfqq = NULL;
1262 cic->key = NULL; 1307 cic->key = NULL;
1308 cic->cfqq[ASYNC] = NULL;
1309 cic->cfqq[SYNC] = NULL;
1263 cic->last_end_request = jiffies; 1310 cic->last_end_request = jiffies;
1264 cic->ttime_total = 0; 1311 cic->ttime_total = 0;
1265 cic->ttime_samples = 0; 1312 cic->ttime_samples = 0;
1266 cic->ttime_mean = 0; 1313 cic->ttime_mean = 0;
1267 cic->dtor = cfq_free_io_context; 1314 cic->dtor = cfq_free_io_context;
1268 cic->exit = cfq_exit_io_context; 1315 cic->exit = cfq_exit_io_context;
1316 INIT_LIST_HEAD(&cic->queue_list);
1317 atomic_inc(&ioc_count);
1269 } 1318 }
1270 1319
1271 return cic; 1320 return cic;
@@ -1318,14 +1367,27 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq)
1318 cfq_clear_cfqq_prio_changed(cfqq); 1367 cfq_clear_cfqq_prio_changed(cfqq);
1319} 1368}
1320 1369
1321static inline void changed_ioprio(struct cfq_queue *cfqq) 1370static inline void changed_ioprio(struct cfq_io_context *cic)
1322{ 1371{
1323 if (cfqq) { 1372 struct cfq_data *cfqd = cic->key;
1324 struct cfq_data *cfqd = cfqq->cfqd; 1373 struct cfq_queue *cfqq;
1325 1374 if (cfqd) {
1326 spin_lock(cfqd->queue->queue_lock); 1375 spin_lock(cfqd->queue->queue_lock);
1327 cfq_mark_cfqq_prio_changed(cfqq); 1376 cfqq = cic->cfqq[ASYNC];
1328 cfq_init_prio_data(cfqq); 1377 if (cfqq) {
1378 struct cfq_queue *new_cfqq;
1379 new_cfqq = cfq_get_queue(cfqd, CFQ_KEY_ASYNC,
1380 cic->ioc->task, GFP_ATOMIC);
1381 if (new_cfqq) {
1382 cic->cfqq[ASYNC] = new_cfqq;
1383 cfq_put_queue(cfqq);
1384 }
1385 }
1386 cfqq = cic->cfqq[SYNC];
1387 if (cfqq) {
1388 cfq_mark_cfqq_prio_changed(cfqq);
1389 cfq_init_prio_data(cfqq);
1390 }
1329 spin_unlock(cfqd->queue->queue_lock); 1391 spin_unlock(cfqd->queue->queue_lock);
1330 } 1392 }
1331} 1393}
@@ -1335,24 +1397,34 @@ static inline void changed_ioprio(struct cfq_queue *cfqq)
1335 */ 1397 */
1336static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio) 1398static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio)
1337{ 1399{
1338 struct cfq_io_context *cic = ioc->cic; 1400 struct cfq_io_context *cic;
1401 struct rb_node *n;
1339 1402
1340 changed_ioprio(cic->cfqq); 1403 write_lock(&cfq_exit_lock);
1341 1404
1342 list_for_each_entry(cic, &cic->list, list) 1405 n = rb_first(&ioc->cic_root);
1343 changed_ioprio(cic->cfqq); 1406 while (n != NULL) {
1407 cic = rb_entry(n, struct cfq_io_context, rb_node);
1408
1409 changed_ioprio(cic);
1410 n = rb_next(n);
1411 }
1412
1413 write_unlock(&cfq_exit_lock);
1344 1414
1345 return 0; 1415 return 0;
1346} 1416}
1347 1417
1348static struct cfq_queue * 1418static struct cfq_queue *
1349cfq_get_queue(struct cfq_data *cfqd, unsigned int key, unsigned short ioprio, 1419cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk,
1350 gfp_t gfp_mask) 1420 gfp_t gfp_mask)
1351{ 1421{
1352 const int hashval = hash_long(key, CFQ_QHASH_SHIFT); 1422 const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
1353 struct cfq_queue *cfqq, *new_cfqq = NULL; 1423 struct cfq_queue *cfqq, *new_cfqq = NULL;
1424 unsigned short ioprio;
1354 1425
1355retry: 1426retry:
1427 ioprio = tsk->ioprio;
1356 cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval); 1428 cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval);
1357 1429
1358 if (!cfqq) { 1430 if (!cfqq) {
@@ -1381,7 +1453,6 @@ retry:
1381 hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); 1453 hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
1382 atomic_set(&cfqq->ref, 0); 1454 atomic_set(&cfqq->ref, 0);
1383 cfqq->cfqd = cfqd; 1455 cfqq->cfqd = cfqd;
1384 atomic_inc(&cfqd->ref);
1385 cfqq->service_last = 0; 1456 cfqq->service_last = 0;
1386 /* 1457 /*
1387 * set ->slice_left to allow preemption for a new process 1458 * set ->slice_left to allow preemption for a new process
@@ -1401,14 +1472,67 @@ out:
1401 return cfqq; 1472 return cfqq;
1402} 1473}
1403 1474
1475static struct cfq_io_context *
1476cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc)
1477{
1478 struct rb_node *n = ioc->cic_root.rb_node;
1479 struct cfq_io_context *cic;
1480 void *key = cfqd;
1481
1482 while (n) {
1483 cic = rb_entry(n, struct cfq_io_context, rb_node);
1484
1485 if (key < cic->key)
1486 n = n->rb_left;
1487 else if (key > cic->key)
1488 n = n->rb_right;
1489 else
1490 return cic;
1491 }
1492
1493 return NULL;
1494}
1495
1496static inline void
1497cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
1498 struct cfq_io_context *cic)
1499{
1500 struct rb_node **p = &ioc->cic_root.rb_node;
1501 struct rb_node *parent = NULL;
1502 struct cfq_io_context *__cic;
1503
1504 read_lock(&cfq_exit_lock);
1505
1506 cic->ioc = ioc;
1507 cic->key = cfqd;
1508
1509 ioc->set_ioprio = cfq_ioc_set_ioprio;
1510
1511 while (*p) {
1512 parent = *p;
1513 __cic = rb_entry(parent, struct cfq_io_context, rb_node);
1514
1515 if (cic->key < __cic->key)
1516 p = &(*p)->rb_left;
1517 else if (cic->key > __cic->key)
1518 p = &(*p)->rb_right;
1519 else
1520 BUG();
1521 }
1522
1523 rb_link_node(&cic->rb_node, parent, p);
1524 rb_insert_color(&cic->rb_node, &ioc->cic_root);
1525 list_add(&cic->queue_list, &cfqd->cic_list);
1526 read_unlock(&cfq_exit_lock);
1527}
1528
1404/* 1529/*
1405 * Setup general io context and cfq io context. There can be several cfq 1530 * Setup general io context and cfq io context. There can be several cfq
1406 * io contexts per general io context, if this process is doing io to more 1531 * io contexts per general io context, if this process is doing io to more
1407 * than one device managed by cfq. Note that caller is holding a reference to 1532 * than one device managed by cfq.
1408 * cfqq, so we don't need to worry about it disappearing
1409 */ 1533 */
1410static struct cfq_io_context * 1534static struct cfq_io_context *
1411cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, gfp_t gfp_mask) 1535cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1412{ 1536{
1413 struct io_context *ioc = NULL; 1537 struct io_context *ioc = NULL;
1414 struct cfq_io_context *cic; 1538 struct cfq_io_context *cic;
@@ -1419,61 +1543,15 @@ cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, gfp_t gfp_mask)
1419 if (!ioc) 1543 if (!ioc)
1420 return NULL; 1544 return NULL;
1421 1545
1422 if ((cic = ioc->cic) == NULL) { 1546 cic = cfq_cic_rb_lookup(cfqd, ioc);
1423 cic = cfq_alloc_io_context(cfqd, gfp_mask); 1547 if (cic)
1424 1548 goto out;
1425 if (cic == NULL)
1426 goto err;
1427
1428 /*
1429 * manually increment generic io_context usage count, it
1430 * cannot go away since we are already holding one ref to it
1431 */
1432 ioc->cic = cic;
1433 ioc->set_ioprio = cfq_ioc_set_ioprio;
1434 cic->ioc = ioc;
1435 cic->key = cfqd;
1436 atomic_inc(&cfqd->ref);
1437 } else {
1438 struct cfq_io_context *__cic;
1439
1440 /*
1441 * the first cic on the list is actually the head itself
1442 */
1443 if (cic->key == cfqd)
1444 goto out;
1445
1446 /*
1447 * cic exists, check if we already are there. linear search
1448 * should be ok here, the list will usually not be more than
1449 * 1 or a few entries long
1450 */
1451 list_for_each_entry(__cic, &cic->list, list) {
1452 /*
1453 * this process is already holding a reference to
1454 * this queue, so no need to get one more
1455 */
1456 if (__cic->key == cfqd) {
1457 cic = __cic;
1458 goto out;
1459 }
1460 }
1461 1549
1462 /* 1550 cic = cfq_alloc_io_context(cfqd, gfp_mask);
1463 * nope, process doesn't have a cic assoicated with this 1551 if (cic == NULL)
1464 * cfqq yet. get a new one and add to list 1552 goto err;
1465 */
1466 __cic = cfq_alloc_io_context(cfqd, gfp_mask);
1467 if (__cic == NULL)
1468 goto err;
1469
1470 __cic->ioc = ioc;
1471 __cic->key = cfqd;
1472 atomic_inc(&cfqd->ref);
1473 list_add(&__cic->list, &cic->list);
1474 cic = __cic;
1475 }
1476 1553
1554 cfq_cic_link(cfqd, ioc, cic);
1477out: 1555out:
1478 return cic; 1556 return cic;
1479err: 1557err:
@@ -1506,7 +1584,33 @@ cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
1506 cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; 1584 cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
1507} 1585}
1508 1586
1509#define sample_valid(samples) ((samples) > 80) 1587static void
1588cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
1589 struct cfq_rq *crq)
1590{
1591 sector_t sdist;
1592 u64 total;
1593
1594 if (cic->last_request_pos < crq->request->sector)
1595 sdist = crq->request->sector - cic->last_request_pos;
1596 else
1597 sdist = cic->last_request_pos - crq->request->sector;
1598
1599 /*
1600 * Don't allow the seek distance to get too large from the
1601 * odd fragment, pagein, etc
1602 */
1603 if (cic->seek_samples <= 60) /* second&third seek */
1604 sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024);
1605 else
1606 sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64);
1607
1608 cic->seek_samples = (7*cic->seek_samples + 256) / 8;
1609 cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
1610 total = cic->seek_total + (cic->seek_samples/2);
1611 do_div(total, cic->seek_samples);
1612 cic->seek_mean = (sector_t)total;
1613}
1510 1614
1511/* 1615/*
1512 * Disable idle window if the process thinks too long or seeks so much that 1616 * Disable idle window if the process thinks too long or seeks so much that
@@ -1619,9 +1723,11 @@ cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1619 cic = crq->io_context; 1723 cic = crq->io_context;
1620 1724
1621 cfq_update_io_thinktime(cfqd, cic); 1725 cfq_update_io_thinktime(cfqd, cic);
1726 cfq_update_io_seektime(cfqd, cic, crq);
1622 cfq_update_idle_window(cfqd, cfqq, cic); 1727 cfq_update_idle_window(cfqd, cfqq, cic);
1623 1728
1624 cic->last_queue = jiffies; 1729 cic->last_queue = jiffies;
1730 cic->last_request_pos = crq->request->sector + crq->request->nr_sectors;
1625 1731
1626 if (cfqq == cfqd->active_queue) { 1732 if (cfqq == cfqd->active_queue) {
1627 /* 1733 /*
@@ -1754,14 +1860,6 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
1754 cfq_resort_rr_list(cfqq, 0); 1860 cfq_resort_rr_list(cfqq, 0);
1755} 1861}
1756 1862
1757static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
1758{
1759 if (rw == READ || process_sync(task))
1760 return task->pid;
1761
1762 return CFQ_KEY_ASYNC;
1763}
1764
1765static inline int 1863static inline int
1766__cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1864__cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1767 struct task_struct *task, int rw) 1865 struct task_struct *task, int rw)
@@ -1890,24 +1988,25 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
1890 struct cfq_queue *cfqq; 1988 struct cfq_queue *cfqq;
1891 struct cfq_rq *crq; 1989 struct cfq_rq *crq;
1892 unsigned long flags; 1990 unsigned long flags;
1991 int is_sync = key != CFQ_KEY_ASYNC;
1893 1992
1894 might_sleep_if(gfp_mask & __GFP_WAIT); 1993 might_sleep_if(gfp_mask & __GFP_WAIT);
1895 1994
1896 cic = cfq_get_io_context(cfqd, key, gfp_mask); 1995 cic = cfq_get_io_context(cfqd, gfp_mask);
1897 1996
1898 spin_lock_irqsave(q->queue_lock, flags); 1997 spin_lock_irqsave(q->queue_lock, flags);
1899 1998
1900 if (!cic) 1999 if (!cic)
1901 goto queue_fail; 2000 goto queue_fail;
1902 2001
1903 if (!cic->cfqq) { 2002 if (!cic->cfqq[is_sync]) {
1904 cfqq = cfq_get_queue(cfqd, key, tsk->ioprio, gfp_mask); 2003 cfqq = cfq_get_queue(cfqd, key, tsk, gfp_mask);
1905 if (!cfqq) 2004 if (!cfqq)
1906 goto queue_fail; 2005 goto queue_fail;
1907 2006
1908 cic->cfqq = cfqq; 2007 cic->cfqq[is_sync] = cfqq;
1909 } else 2008 } else
1910 cfqq = cic->cfqq; 2009 cfqq = cic->cfqq[is_sync];
1911 2010
1912 cfqq->allocated[rw]++; 2011 cfqq->allocated[rw]++;
1913 cfq_clear_cfqq_must_alloc(cfqq); 2012 cfq_clear_cfqq_must_alloc(cfqq);
@@ -1924,7 +2023,7 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
1924 crq->cfq_queue = cfqq; 2023 crq->cfq_queue = cfqq;
1925 crq->io_context = cic; 2024 crq->io_context = cic;
1926 2025
1927 if (rw == READ || process_sync(tsk)) 2026 if (is_sync)
1928 cfq_mark_crq_is_sync(crq); 2027 cfq_mark_crq_is_sync(crq);
1929 else 2028 else
1930 cfq_clear_crq_is_sync(crq); 2029 cfq_clear_crq_is_sync(crq);
@@ -2055,15 +2154,39 @@ static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
2055 blk_sync_queue(cfqd->queue); 2154 blk_sync_queue(cfqd->queue);
2056} 2155}
2057 2156
2058static void cfq_put_cfqd(struct cfq_data *cfqd) 2157static void cfq_exit_queue(elevator_t *e)
2059{ 2158{
2159 struct cfq_data *cfqd = e->elevator_data;
2060 request_queue_t *q = cfqd->queue; 2160 request_queue_t *q = cfqd->queue;
2061 2161
2062 if (!atomic_dec_and_test(&cfqd->ref)) 2162 cfq_shutdown_timer_wq(cfqd);
2063 return; 2163
2164 write_lock(&cfq_exit_lock);
2165 spin_lock_irq(q->queue_lock);
2166
2167 if (cfqd->active_queue)
2168 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
2169
2170 while (!list_empty(&cfqd->cic_list)) {
2171 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
2172 struct cfq_io_context,
2173 queue_list);
2174 if (cic->cfqq[ASYNC]) {
2175 cfq_put_queue(cic->cfqq[ASYNC]);
2176 cic->cfqq[ASYNC] = NULL;
2177 }
2178 if (cic->cfqq[SYNC]) {
2179 cfq_put_queue(cic->cfqq[SYNC]);
2180 cic->cfqq[SYNC] = NULL;
2181 }
2182 cic->key = NULL;
2183 list_del_init(&cic->queue_list);
2184 }
2185
2186 spin_unlock_irq(q->queue_lock);
2187 write_unlock(&cfq_exit_lock);
2064 2188
2065 cfq_shutdown_timer_wq(cfqd); 2189 cfq_shutdown_timer_wq(cfqd);
2066 blk_put_queue(q);
2067 2190
2068 mempool_destroy(cfqd->crq_pool); 2191 mempool_destroy(cfqd->crq_pool);
2069 kfree(cfqd->crq_hash); 2192 kfree(cfqd->crq_hash);
@@ -2071,14 +2194,6 @@ static void cfq_put_cfqd(struct cfq_data *cfqd)
2071 kfree(cfqd); 2194 kfree(cfqd);
2072} 2195}
2073 2196
2074static void cfq_exit_queue(elevator_t *e)
2075{
2076 struct cfq_data *cfqd = e->elevator_data;
2077
2078 cfq_shutdown_timer_wq(cfqd);
2079 cfq_put_cfqd(cfqd);
2080}
2081
2082static int cfq_init_queue(request_queue_t *q, elevator_t *e) 2197static int cfq_init_queue(request_queue_t *q, elevator_t *e)
2083{ 2198{
2084 struct cfq_data *cfqd; 2199 struct cfq_data *cfqd;
@@ -2097,6 +2212,7 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e)
2097 INIT_LIST_HEAD(&cfqd->cur_rr); 2212 INIT_LIST_HEAD(&cfqd->cur_rr);
2098 INIT_LIST_HEAD(&cfqd->idle_rr); 2213 INIT_LIST_HEAD(&cfqd->idle_rr);
2099 INIT_LIST_HEAD(&cfqd->empty_list); 2214 INIT_LIST_HEAD(&cfqd->empty_list);
2215 INIT_LIST_HEAD(&cfqd->cic_list);
2100 2216
2101 cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL); 2217 cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
2102 if (!cfqd->crq_hash) 2218 if (!cfqd->crq_hash)
@@ -2106,7 +2222,7 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e)
2106 if (!cfqd->cfq_hash) 2222 if (!cfqd->cfq_hash)
2107 goto out_cfqhash; 2223 goto out_cfqhash;
2108 2224
2109 cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, crq_pool); 2225 cfqd->crq_pool = mempool_create_slab_pool(BLKDEV_MIN_RQ, crq_pool);
2110 if (!cfqd->crq_pool) 2226 if (!cfqd->crq_pool)
2111 goto out_crqpool; 2227 goto out_crqpool;
2112 2228
@@ -2118,7 +2234,6 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e)
2118 e->elevator_data = cfqd; 2234 e->elevator_data = cfqd;
2119 2235
2120 cfqd->queue = q; 2236 cfqd->queue = q;
2121 atomic_inc(&q->refcnt);
2122 2237
2123 cfqd->max_queued = q->nr_requests / 4; 2238 cfqd->max_queued = q->nr_requests / 4;
2124 q->nr_batching = cfq_queued; 2239 q->nr_batching = cfq_queued;
@@ -2133,8 +2248,6 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e)
2133 2248
2134 INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q); 2249 INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q);
2135 2250
2136 atomic_set(&cfqd->ref, 1);
2137
2138 cfqd->cfq_queued = cfq_queued; 2251 cfqd->cfq_queued = cfq_queued;
2139 cfqd->cfq_quantum = cfq_quantum; 2252 cfqd->cfq_quantum = cfq_quantum;
2140 cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; 2253 cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
@@ -2145,7 +2258,6 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e)
2145 cfqd->cfq_slice[1] = cfq_slice_sync; 2258 cfqd->cfq_slice[1] = cfq_slice_sync;
2146 cfqd->cfq_slice_async_rq = cfq_slice_async_rq; 2259 cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
2147 cfqd->cfq_slice_idle = cfq_slice_idle; 2260 cfqd->cfq_slice_idle = cfq_slice_idle;
2148 cfqd->cfq_max_depth = cfq_max_depth;
2149 2261
2150 return 0; 2262 return 0;
2151out_crqpool: 2263out_crqpool:
@@ -2193,11 +2305,6 @@ fail:
2193/* 2305/*
2194 * sysfs parts below --> 2306 * sysfs parts below -->
2195 */ 2307 */
2196struct cfq_fs_entry {
2197 struct attribute attr;
2198 ssize_t (*show)(struct cfq_data *, char *);
2199 ssize_t (*store)(struct cfq_data *, const char *, size_t);
2200};
2201 2308
2202static ssize_t 2309static ssize_t
2203cfq_var_show(unsigned int var, char *page) 2310cfq_var_show(unsigned int var, char *page)
@@ -2215,8 +2322,9 @@ cfq_var_store(unsigned int *var, const char *page, size_t count)
2215} 2322}
2216 2323
2217#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ 2324#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
2218static ssize_t __FUNC(struct cfq_data *cfqd, char *page) \ 2325static ssize_t __FUNC(elevator_t *e, char *page) \
2219{ \ 2326{ \
2327 struct cfq_data *cfqd = e->elevator_data; \
2220 unsigned int __data = __VAR; \ 2328 unsigned int __data = __VAR; \
2221 if (__CONV) \ 2329 if (__CONV) \
2222 __data = jiffies_to_msecs(__data); \ 2330 __data = jiffies_to_msecs(__data); \
@@ -2226,18 +2334,18 @@ SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
2226SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0); 2334SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0);
2227SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1); 2335SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
2228SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); 2336SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
2229SHOW_FUNCTION(cfq_back_max_show, cfqd->cfq_back_max, 0); 2337SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
2230SHOW_FUNCTION(cfq_back_penalty_show, cfqd->cfq_back_penalty, 0); 2338SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
2231SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); 2339SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
2232SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); 2340SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
2233SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); 2341SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
2234SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); 2342SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
2235SHOW_FUNCTION(cfq_max_depth_show, cfqd->cfq_max_depth, 0);
2236#undef SHOW_FUNCTION 2343#undef SHOW_FUNCTION
2237 2344
2238#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ 2345#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
2239static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count) \ 2346static ssize_t __FUNC(elevator_t *e, const char *page, size_t count) \
2240{ \ 2347{ \
2348 struct cfq_data *cfqd = e->elevator_data; \
2241 unsigned int __data; \ 2349 unsigned int __data; \
2242 int ret = cfq_var_store(&__data, (page), count); \ 2350 int ret = cfq_var_store(&__data, (page), count); \
2243 if (__data < (MIN)) \ 2351 if (__data < (MIN)) \
@@ -2254,121 +2362,29 @@ STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
2254STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0); 2362STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0);
2255STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1); 2363STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1);
2256STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1); 2364STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1);
2257STORE_FUNCTION(cfq_back_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); 2365STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
2258STORE_FUNCTION(cfq_back_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0); 2366STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
2259STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); 2367STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
2260STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); 2368STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
2261STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); 2369STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
2262STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0); 2370STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0);
2263STORE_FUNCTION(cfq_max_depth_store, &cfqd->cfq_max_depth, 1, UINT_MAX, 0);
2264#undef STORE_FUNCTION 2371#undef STORE_FUNCTION
2265 2372
2266static struct cfq_fs_entry cfq_quantum_entry = { 2373#define CFQ_ATTR(name) \
2267 .attr = {.name = "quantum", .mode = S_IRUGO | S_IWUSR }, 2374 __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
2268 .show = cfq_quantum_show, 2375
2269 .store = cfq_quantum_store, 2376static struct elv_fs_entry cfq_attrs[] = {
2270}; 2377 CFQ_ATTR(quantum),
2271static struct cfq_fs_entry cfq_queued_entry = { 2378 CFQ_ATTR(queued),
2272 .attr = {.name = "queued", .mode = S_IRUGO | S_IWUSR }, 2379 CFQ_ATTR(fifo_expire_sync),
2273 .show = cfq_queued_show, 2380 CFQ_ATTR(fifo_expire_async),
2274 .store = cfq_queued_store, 2381 CFQ_ATTR(back_seek_max),
2275}; 2382 CFQ_ATTR(back_seek_penalty),
2276static struct cfq_fs_entry cfq_fifo_expire_sync_entry = { 2383 CFQ_ATTR(slice_sync),
2277 .attr = {.name = "fifo_expire_sync", .mode = S_IRUGO | S_IWUSR }, 2384 CFQ_ATTR(slice_async),
2278 .show = cfq_fifo_expire_sync_show, 2385 CFQ_ATTR(slice_async_rq),
2279 .store = cfq_fifo_expire_sync_store, 2386 CFQ_ATTR(slice_idle),
2280}; 2387 __ATTR_NULL
2281static struct cfq_fs_entry cfq_fifo_expire_async_entry = {
2282 .attr = {.name = "fifo_expire_async", .mode = S_IRUGO | S_IWUSR },
2283 .show = cfq_fifo_expire_async_show,
2284 .store = cfq_fifo_expire_async_store,
2285};
2286static struct cfq_fs_entry cfq_back_max_entry = {
2287 .attr = {.name = "back_seek_max", .mode = S_IRUGO | S_IWUSR },
2288 .show = cfq_back_max_show,
2289 .store = cfq_back_max_store,
2290};
2291static struct cfq_fs_entry cfq_back_penalty_entry = {
2292 .attr = {.name = "back_seek_penalty", .mode = S_IRUGO | S_IWUSR },
2293 .show = cfq_back_penalty_show,
2294 .store = cfq_back_penalty_store,
2295};
2296static struct cfq_fs_entry cfq_slice_sync_entry = {
2297 .attr = {.name = "slice_sync", .mode = S_IRUGO | S_IWUSR },
2298 .show = cfq_slice_sync_show,
2299 .store = cfq_slice_sync_store,
2300};
2301static struct cfq_fs_entry cfq_slice_async_entry = {
2302 .attr = {.name = "slice_async", .mode = S_IRUGO | S_IWUSR },
2303 .show = cfq_slice_async_show,
2304 .store = cfq_slice_async_store,
2305};
2306static struct cfq_fs_entry cfq_slice_async_rq_entry = {
2307 .attr = {.name = "slice_async_rq", .mode = S_IRUGO | S_IWUSR },
2308 .show = cfq_slice_async_rq_show,
2309 .store = cfq_slice_async_rq_store,
2310};
2311static struct cfq_fs_entry cfq_slice_idle_entry = {
2312 .attr = {.name = "slice_idle", .mode = S_IRUGO | S_IWUSR },
2313 .show = cfq_slice_idle_show,
2314 .store = cfq_slice_idle_store,
2315};
2316static struct cfq_fs_entry cfq_max_depth_entry = {
2317 .attr = {.name = "max_depth", .mode = S_IRUGO | S_IWUSR },
2318 .show = cfq_max_depth_show,
2319 .store = cfq_max_depth_store,
2320};
2321
2322static struct attribute *default_attrs[] = {
2323 &cfq_quantum_entry.attr,
2324 &cfq_queued_entry.attr,
2325 &cfq_fifo_expire_sync_entry.attr,
2326 &cfq_fifo_expire_async_entry.attr,
2327 &cfq_back_max_entry.attr,
2328 &cfq_back_penalty_entry.attr,
2329 &cfq_slice_sync_entry.attr,
2330 &cfq_slice_async_entry.attr,
2331 &cfq_slice_async_rq_entry.attr,
2332 &cfq_slice_idle_entry.attr,
2333 &cfq_max_depth_entry.attr,
2334 NULL,
2335};
2336
2337#define to_cfq(atr) container_of((atr), struct cfq_fs_entry, attr)
2338
2339static ssize_t
2340cfq_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2341{
2342 elevator_t *e = container_of(kobj, elevator_t, kobj);
2343 struct cfq_fs_entry *entry = to_cfq(attr);
2344
2345 if (!entry->show)
2346 return -EIO;
2347
2348 return entry->show(e->elevator_data, page);
2349}
2350
2351static ssize_t
2352cfq_attr_store(struct kobject *kobj, struct attribute *attr,
2353 const char *page, size_t length)
2354{
2355 elevator_t *e = container_of(kobj, elevator_t, kobj);
2356 struct cfq_fs_entry *entry = to_cfq(attr);
2357
2358 if (!entry->store)
2359 return -EIO;
2360
2361 return entry->store(e->elevator_data, page, length);
2362}
2363
2364static struct sysfs_ops cfq_sysfs_ops = {
2365 .show = cfq_attr_show,
2366 .store = cfq_attr_store,
2367};
2368
2369static struct kobj_type cfq_ktype = {
2370 .sysfs_ops = &cfq_sysfs_ops,
2371 .default_attrs = default_attrs,
2372}; 2388};
2373 2389
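
The per-attribute cfq_fs_entry definitions, private sysfs_ops and kobj_type above collapse into a single cfq_attrs[] table built with CFQ_ATTR() and terminated by __ATTR_NULL; the generic show/store dispatch moves into elevator.c. A standalone sketch of the table-plus-macro pattern (show side only; the tunables and helper names here are made up, this is not the sysfs API):

#include <stdio.h>
#include <string.h>

static unsigned int quantum = 4, slice_idle = 8;	/* example tunables */

struct fs_entry {
	const char *name;
	int (*show)(char *buf, size_t len);
};

#define SHOW_FN(var)							\
static int var##_show(char *buf, size_t len)				\
{									\
	return snprintf(buf, len, "%u\n", var);				\
}
SHOW_FN(quantum)
SHOW_FN(slice_idle)

#define ATTR(name)	{ #name, name##_show }
#define ATTR_NULL	{ NULL, NULL }

static struct fs_entry attrs[] = {
	ATTR(quantum),
	ATTR(slice_idle),
	ATTR_NULL				/* sentinel, like __ATTR_NULL */
};

/* One generic dispatcher replaces a hand-written wrapper per attribute. */
static int attr_show(const char *name, char *buf, size_t len)
{
	for (struct fs_entry *e = attrs; e->name; e++)
		if (!strcmp(e->name, name))
			return e->show(buf, len);
	return -1;				/* -EIO in the real code */
}

int main(void)
{
	char buf[32];
	if (attr_show("slice_idle", buf, sizeof(buf)) > 0)
		fputs(buf, stdout);
	return 0;
}
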
2374static struct elevator_type iosched_cfq = { 2390static struct elevator_type iosched_cfq = {
@@ -2389,8 +2405,9 @@ static struct elevator_type iosched_cfq = {
2389 .elevator_may_queue_fn = cfq_may_queue, 2405 .elevator_may_queue_fn = cfq_may_queue,
2390 .elevator_init_fn = cfq_init_queue, 2406 .elevator_init_fn = cfq_init_queue,
2391 .elevator_exit_fn = cfq_exit_queue, 2407 .elevator_exit_fn = cfq_exit_queue,
2408 .trim = cfq_trim,
2392 }, 2409 },
2393 .elevator_ktype = &cfq_ktype, 2410 .elevator_attrs = cfq_attrs,
2394 .elevator_name = "cfq", 2411 .elevator_name = "cfq",
2395 .elevator_owner = THIS_MODULE, 2412 .elevator_owner = THIS_MODULE,
2396}; 2413};
@@ -2419,7 +2436,13 @@ static int __init cfq_init(void)
2419 2436
2420static void __exit cfq_exit(void) 2437static void __exit cfq_exit(void)
2421{ 2438{
2439 DECLARE_COMPLETION(all_gone);
2422 elv_unregister(&iosched_cfq); 2440 elv_unregister(&iosched_cfq);
2441 ioc_gone = &all_gone;
2442 barrier();
2443 if (atomic_read(&ioc_count))
2444 complete(ioc_gone);
2445 synchronize_rcu();
2423 cfq_slab_kill(); 2446 cfq_slab_kill();
2424} 2447}
2425 2448
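
cfq_exit() above coordinates module unload with cfq_io_contexts that are still in flight through a global counter and a completion (ioc_count/ioc_gone). A loose pthread analogue of the underlying idea, waiting until the last outstanding context is dropped; the names are invented and the kernel's RCU and barrier details are left out:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static int ioc_count = 2;			/* contexts still alive */
static int exiting;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t all_gone = PTHREAD_COND_INITIALIZER;

static void put_io_ctx(void)
{
	pthread_mutex_lock(&lock);
	if (--ioc_count == 0 && exiting)
		pthread_cond_signal(&all_gone);	/* last drop wakes the exit path */
	pthread_mutex_unlock(&lock);
}

static void *ctx_user(void *arg)
{
	(void)arg;
	usleep(1000);				/* context released "later" */
	put_io_ctx();
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;
	pthread_create(&t1, NULL, ctx_user, NULL);
	pthread_create(&t2, NULL, ctx_user, NULL);

	/* module exit: announce teardown, then wait for stragglers */
	pthread_mutex_lock(&lock);
	exiting = 1;
	while (ioc_count > 0)
		pthread_cond_wait(&all_gone, &lock);
	pthread_mutex_unlock(&lock);

	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	printf("all io contexts gone, caches can be destroyed\n");
	return 0;
}
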
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 27e494b1bf..399fa1e60e 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -694,11 +694,6 @@ deadline_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
694/* 694/*
695 * sysfs parts below 695 * sysfs parts below
696 */ 696 */
697struct deadline_fs_entry {
698 struct attribute attr;
699 ssize_t (*show)(struct deadline_data *, char *);
700 ssize_t (*store)(struct deadline_data *, const char *, size_t);
701};
702 697
703static ssize_t 698static ssize_t
704deadline_var_show(int var, char *page) 699deadline_var_show(int var, char *page)
@@ -716,23 +711,25 @@ deadline_var_store(int *var, const char *page, size_t count)
716} 711}
717 712
718#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ 713#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
719static ssize_t __FUNC(struct deadline_data *dd, char *page) \ 714static ssize_t __FUNC(elevator_t *e, char *page) \
720{ \ 715{ \
721 int __data = __VAR; \ 716 struct deadline_data *dd = e->elevator_data; \
717 int __data = __VAR; \
722 if (__CONV) \ 718 if (__CONV) \
723 __data = jiffies_to_msecs(__data); \ 719 __data = jiffies_to_msecs(__data); \
724 return deadline_var_show(__data, (page)); \ 720 return deadline_var_show(__data, (page)); \
725} 721}
726SHOW_FUNCTION(deadline_readexpire_show, dd->fifo_expire[READ], 1); 722SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
727SHOW_FUNCTION(deadline_writeexpire_show, dd->fifo_expire[WRITE], 1); 723SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
728SHOW_FUNCTION(deadline_writesstarved_show, dd->writes_starved, 0); 724SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
729SHOW_FUNCTION(deadline_frontmerges_show, dd->front_merges, 0); 725SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
730SHOW_FUNCTION(deadline_fifobatch_show, dd->fifo_batch, 0); 726SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
731#undef SHOW_FUNCTION 727#undef SHOW_FUNCTION
732 728
733#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ 729#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
734static ssize_t __FUNC(struct deadline_data *dd, const char *page, size_t count) \ 730static ssize_t __FUNC(elevator_t *e, const char *page, size_t count) \
735{ \ 731{ \
732 struct deadline_data *dd = e->elevator_data; \
736 int __data; \ 733 int __data; \
737 int ret = deadline_var_store(&__data, (page), count); \ 734 int ret = deadline_var_store(&__data, (page), count); \
738 if (__data < (MIN)) \ 735 if (__data < (MIN)) \
@@ -745,83 +742,24 @@ static ssize_t __FUNC(struct deadline_data *dd, const char *page, size_t count)
745 *(__PTR) = __data; \ 742 *(__PTR) = __data; \
746 return ret; \ 743 return ret; \
747} 744}
748STORE_FUNCTION(deadline_readexpire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1); 745STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
749STORE_FUNCTION(deadline_writeexpire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1); 746STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
750STORE_FUNCTION(deadline_writesstarved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0); 747STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
751STORE_FUNCTION(deadline_frontmerges_store, &dd->front_merges, 0, 1, 0); 748STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
752STORE_FUNCTION(deadline_fifobatch_store, &dd->fifo_batch, 0, INT_MAX, 0); 749STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
753#undef STORE_FUNCTION 750#undef STORE_FUNCTION
754 751
755static struct deadline_fs_entry deadline_readexpire_entry = { 752#define DD_ATTR(name) \
756 .attr = {.name = "read_expire", .mode = S_IRUGO | S_IWUSR }, 753 __ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \
757 .show = deadline_readexpire_show, 754 deadline_##name##_store)
758 .store = deadline_readexpire_store, 755
759}; 756static struct elv_fs_entry deadline_attrs[] = {
760static struct deadline_fs_entry deadline_writeexpire_entry = { 757 DD_ATTR(read_expire),
761 .attr = {.name = "write_expire", .mode = S_IRUGO | S_IWUSR }, 758 DD_ATTR(write_expire),
762 .show = deadline_writeexpire_show, 759 DD_ATTR(writes_starved),
763 .store = deadline_writeexpire_store, 760 DD_ATTR(front_merges),
764}; 761 DD_ATTR(fifo_batch),
765static struct deadline_fs_entry deadline_writesstarved_entry = { 762 __ATTR_NULL
766 .attr = {.name = "writes_starved", .mode = S_IRUGO | S_IWUSR },
767 .show = deadline_writesstarved_show,
768 .store = deadline_writesstarved_store,
769};
770static struct deadline_fs_entry deadline_frontmerges_entry = {
771 .attr = {.name = "front_merges", .mode = S_IRUGO | S_IWUSR },
772 .show = deadline_frontmerges_show,
773 .store = deadline_frontmerges_store,
774};
775static struct deadline_fs_entry deadline_fifobatch_entry = {
776 .attr = {.name = "fifo_batch", .mode = S_IRUGO | S_IWUSR },
777 .show = deadline_fifobatch_show,
778 .store = deadline_fifobatch_store,
779};
780
781static struct attribute *default_attrs[] = {
782 &deadline_readexpire_entry.attr,
783 &deadline_writeexpire_entry.attr,
784 &deadline_writesstarved_entry.attr,
785 &deadline_frontmerges_entry.attr,
786 &deadline_fifobatch_entry.attr,
787 NULL,
788};
789
790#define to_deadline(atr) container_of((atr), struct deadline_fs_entry, attr)
791
792static ssize_t
793deadline_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
794{
795 elevator_t *e = container_of(kobj, elevator_t, kobj);
796 struct deadline_fs_entry *entry = to_deadline(attr);
797
798 if (!entry->show)
799 return -EIO;
800
801 return entry->show(e->elevator_data, page);
802}
803
804static ssize_t
805deadline_attr_store(struct kobject *kobj, struct attribute *attr,
806 const char *page, size_t length)
807{
808 elevator_t *e = container_of(kobj, elevator_t, kobj);
809 struct deadline_fs_entry *entry = to_deadline(attr);
810
811 if (!entry->store)
812 return -EIO;
813
814 return entry->store(e->elevator_data, page, length);
815}
816
817static struct sysfs_ops deadline_sysfs_ops = {
818 .show = deadline_attr_show,
819 .store = deadline_attr_store,
820};
821
822static struct kobj_type deadline_ktype = {
823 .sysfs_ops = &deadline_sysfs_ops,
824 .default_attrs = default_attrs,
825}; 763};
826 764
827static struct elevator_type iosched_deadline = { 765static struct elevator_type iosched_deadline = {
@@ -840,7 +778,7 @@ static struct elevator_type iosched_deadline = {
840 .elevator_exit_fn = deadline_exit_queue, 778 .elevator_exit_fn = deadline_exit_queue,
841 }, 779 },
842 780
843 .elevator_ktype = &deadline_ktype, 781 .elevator_attrs = deadline_attrs,
844 .elevator_name = "deadline", 782 .elevator_name = "deadline",
845 .elevator_owner = THIS_MODULE, 783 .elevator_owner = THIS_MODULE,
846}; 784};
diff --git a/block/elevator.c b/block/elevator.c
index 24b702d649..56c2ed06a9 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -33,6 +33,7 @@
33#include <linux/init.h> 33#include <linux/init.h>
34#include <linux/compiler.h> 34#include <linux/compiler.h>
35#include <linux/delay.h> 35#include <linux/delay.h>
36#include <linux/blktrace_api.h>
36 37
37#include <asm/uaccess.h> 38#include <asm/uaccess.h>
38 39
@@ -120,15 +121,10 @@ static struct elevator_type *elevator_get(const char *name)
120 return e; 121 return e;
121} 122}
122 123
123static int elevator_attach(request_queue_t *q, struct elevator_type *e, 124static int elevator_attach(request_queue_t *q, struct elevator_queue *eq)
124 struct elevator_queue *eq)
125{ 125{
126 int ret = 0; 126 int ret = 0;
127 127
128 memset(eq, 0, sizeof(*eq));
129 eq->ops = &e->ops;
130 eq->elevator_type = e;
131
132 q->elevator = eq; 128 q->elevator = eq;
133 129
134 if (eq->ops->elevator_init_fn) 130 if (eq->ops->elevator_init_fn)
@@ -154,6 +150,32 @@ static int __init elevator_setup(char *str)
154 150
155__setup("elevator=", elevator_setup); 151__setup("elevator=", elevator_setup);
156 152
153static struct kobj_type elv_ktype;
154
155static elevator_t *elevator_alloc(struct elevator_type *e)
156{
157 elevator_t *eq = kmalloc(sizeof(elevator_t), GFP_KERNEL);
158 if (eq) {
159 memset(eq, 0, sizeof(*eq));
160 eq->ops = &e->ops;
161 eq->elevator_type = e;
162 kobject_init(&eq->kobj);
163 snprintf(eq->kobj.name, KOBJ_NAME_LEN, "%s", "iosched");
164 eq->kobj.ktype = &elv_ktype;
165 mutex_init(&eq->sysfs_lock);
166 } else {
167 elevator_put(e);
168 }
169 return eq;
170}
171
172static void elevator_release(struct kobject *kobj)
173{
174 elevator_t *e = container_of(kobj, elevator_t, kobj);
175 elevator_put(e->elevator_type);
176 kfree(e);
177}
178
157int elevator_init(request_queue_t *q, char *name) 179int elevator_init(request_queue_t *q, char *name)
158{ 180{
159 struct elevator_type *e = NULL; 181 struct elevator_type *e = NULL;
@@ -176,29 +198,26 @@ int elevator_init(request_queue_t *q, char *name)
176 e = elevator_get("noop"); 198 e = elevator_get("noop");
177 } 199 }
178 200
179 eq = kmalloc(sizeof(struct elevator_queue), GFP_KERNEL); 201 eq = elevator_alloc(e);
180 if (!eq) { 202 if (!eq)
181 elevator_put(e);
182 return -ENOMEM; 203 return -ENOMEM;
183 }
184 204
185 ret = elevator_attach(q, e, eq); 205 ret = elevator_attach(q, eq);
186 if (ret) { 206 if (ret)
187 kfree(eq); 207 kobject_put(&eq->kobj);
188 elevator_put(e);
189 }
190 208
191 return ret; 209 return ret;
192} 210}
193 211
194void elevator_exit(elevator_t *e) 212void elevator_exit(elevator_t *e)
195{ 213{
214 mutex_lock(&e->sysfs_lock);
196 if (e->ops->elevator_exit_fn) 215 if (e->ops->elevator_exit_fn)
197 e->ops->elevator_exit_fn(e); 216 e->ops->elevator_exit_fn(e);
217 e->ops = NULL;
218 mutex_unlock(&e->sysfs_lock);
198 219
199 elevator_put(e->elevator_type); 220 kobject_put(&e->kobj);
200 e->elevator_type = NULL;
201 kfree(e);
202} 221}
203 222
204/* 223/*
@@ -315,6 +334,8 @@ void elv_insert(request_queue_t *q, struct request *rq, int where)
315 struct list_head *pos; 334 struct list_head *pos;
316 unsigned ordseq; 335 unsigned ordseq;
317 336
337 blk_add_trace_rq(q, rq, BLK_TA_INSERT);
338
318 rq->q = q; 339 rq->q = q;
319 340
320 switch (where) { 341 switch (where) {
@@ -481,6 +502,7 @@ struct request *elv_next_request(request_queue_t *q)
481 * not be passed by new incoming requests 502 * not be passed by new incoming requests
482 */ 503 */
483 rq->flags |= REQ_STARTED; 504 rq->flags |= REQ_STARTED;
505 blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
484 } 506 }
485 507
486 if (!q->boundary_rq || q->boundary_rq == rq) { 508 if (!q->boundary_rq || q->boundary_rq == rq) {
@@ -627,34 +649,86 @@ void elv_completed_request(request_queue_t *q, struct request *rq)
627 } 649 }
628} 650}
629 651
630int elv_register_queue(struct request_queue *q) 652#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
653
654static ssize_t
655elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
631{ 656{
632 elevator_t *e = q->elevator; 657 elevator_t *e = container_of(kobj, elevator_t, kobj);
658 struct elv_fs_entry *entry = to_elv(attr);
659 ssize_t error;
633 660
634 e->kobj.parent = kobject_get(&q->kobj); 661 if (!entry->show)
635 if (!e->kobj.parent) 662 return -EIO;
636 return -EBUSY;
637 663
638 snprintf(e->kobj.name, KOBJ_NAME_LEN, "%s", "iosched"); 664 mutex_lock(&e->sysfs_lock);
639 e->kobj.ktype = e->elevator_type->elevator_ktype; 665 error = e->ops ? entry->show(e, page) : -ENOENT;
666 mutex_unlock(&e->sysfs_lock);
667 return error;
668}
669
670static ssize_t
671elv_attr_store(struct kobject *kobj, struct attribute *attr,
672 const char *page, size_t length)
673{
674 elevator_t *e = container_of(kobj, elevator_t, kobj);
675 struct elv_fs_entry *entry = to_elv(attr);
676 ssize_t error;
640 677
641 return kobject_register(&e->kobj); 678 if (!entry->store)
679 return -EIO;
680
681 mutex_lock(&e->sysfs_lock);
682 error = e->ops ? entry->store(e, page, length) : -ENOENT;
683 mutex_unlock(&e->sysfs_lock);
684 return error;
685}
686
687static struct sysfs_ops elv_sysfs_ops = {
688 .show = elv_attr_show,
689 .store = elv_attr_store,
690};
691
692static struct kobj_type elv_ktype = {
693 .sysfs_ops = &elv_sysfs_ops,
694 .release = elevator_release,
695};
696
697int elv_register_queue(struct request_queue *q)
698{
699 elevator_t *e = q->elevator;
700 int error;
701
702 e->kobj.parent = &q->kobj;
703
704 error = kobject_add(&e->kobj);
705 if (!error) {
706 struct elv_fs_entry *attr = e->elevator_type->elevator_attrs;
707 if (attr) {
708 while (attr->attr.name) {
709 if (sysfs_create_file(&e->kobj, &attr->attr))
710 break;
711 attr++;
712 }
713 }
714 kobject_uevent(&e->kobj, KOBJ_ADD);
715 }
716 return error;
642} 717}
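
elv_attr_show()/elv_attr_store() above take e->sysfs_lock and return -ENOENT once elevator_exit() has cleared e->ops, so a sysfs file that is still open can no longer call into a torn-down scheduler. A minimal sketch of checking liveness under the same lock that teardown takes, with a plain flag standing in for the ops pointer:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct dev {
	pthread_mutex_t lock;
	int alive;			/* stands in for e->ops != NULL */
	int value;
};

static int attr_show(struct dev *d, char *buf, size_t len)
{
	int ret;

	pthread_mutex_lock(&d->lock);
	if (!d->alive)
		ret = -ENOENT;		/* backend already torn down */
	else
		ret = snprintf(buf, len, "%d\n", d->value);
	pthread_mutex_unlock(&d->lock);
	return ret;
}

static void dev_exit(struct dev *d)
{
	pthread_mutex_lock(&d->lock);
	d->alive = 0;			/* after this, shows/stores fail cleanly */
	pthread_mutex_unlock(&d->lock);
}

int main(void)
{
	struct dev d = { PTHREAD_MUTEX_INITIALIZER, 1, 42 };
	char buf[16];

	printf("before exit: %d\n", attr_show(&d, buf, sizeof(buf)));
	dev_exit(&d);
	printf("after exit:  %d\n", attr_show(&d, buf, sizeof(buf)));
	return 0;
}
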
643 718
644void elv_unregister_queue(struct request_queue *q) 719void elv_unregister_queue(struct request_queue *q)
645{ 720{
646 if (q) { 721 if (q) {
647 elevator_t *e = q->elevator; 722 elevator_t *e = q->elevator;
648 kobject_unregister(&e->kobj); 723 kobject_uevent(&e->kobj, KOBJ_REMOVE);
649 kobject_put(&q->kobj); 724 kobject_del(&e->kobj);
650 } 725 }
651} 726}
652 727
653int elv_register(struct elevator_type *e) 728int elv_register(struct elevator_type *e)
654{ 729{
655 spin_lock_irq(&elv_list_lock); 730 spin_lock_irq(&elv_list_lock);
656 if (elevator_find(e->elevator_name)) 731 BUG_ON(elevator_find(e->elevator_name));
657 BUG();
658 list_add_tail(&e->list, &elv_list); 732 list_add_tail(&e->list, &elv_list);
659 spin_unlock_irq(&elv_list_lock); 733 spin_unlock_irq(&elv_list_lock);
660 734
@@ -675,21 +749,15 @@ void elv_unregister(struct elevator_type *e)
675 /* 749 /*
676 * Iterate every thread in the process to remove the io contexts. 750 * Iterate every thread in the process to remove the io contexts.
677 */ 751 */
678 read_lock(&tasklist_lock); 752 if (e->ops.trim) {
679 do_each_thread(g, p) { 753 read_lock(&tasklist_lock);
680 struct io_context *ioc = p->io_context; 754 do_each_thread(g, p) {
681 if (ioc && ioc->cic) { 755 task_lock(p);
682 ioc->cic->exit(ioc->cic); 756 e->ops.trim(p->io_context);
683 ioc->cic->dtor(ioc->cic); 757 task_unlock(p);
684 ioc->cic = NULL; 758 } while_each_thread(g, p);
685 } 759 read_unlock(&tasklist_lock);
686 if (ioc && ioc->aic) { 760 }
687 ioc->aic->exit(ioc->aic);
688 ioc->aic->dtor(ioc->aic);
689 ioc->aic = NULL;
690 }
691 } while_each_thread(g, p);
692 read_unlock(&tasklist_lock);
693 761
694 spin_lock_irq(&elv_list_lock); 762 spin_lock_irq(&elv_list_lock);
695 list_del_init(&e->list); 763 list_del_init(&e->list);
@@ -703,16 +771,16 @@ EXPORT_SYMBOL_GPL(elv_unregister);
703 * need for the new one. this way we have a chance of going back to the old 771 * need for the new one. this way we have a chance of going back to the old
704 * one, if the new one fails init for some reason. 772 * one, if the new one fails init for some reason.
705 */ 773 */
706static void elevator_switch(request_queue_t *q, struct elevator_type *new_e) 774static int elevator_switch(request_queue_t *q, struct elevator_type *new_e)
707{ 775{
708 elevator_t *old_elevator, *e; 776 elevator_t *old_elevator, *e;
709 777
710 /* 778 /*
711 * Allocate new elevator 779 * Allocate new elevator
712 */ 780 */
713 e = kmalloc(sizeof(elevator_t), GFP_KERNEL); 781 e = elevator_alloc(new_e);
714 if (!e) 782 if (!e)
715 goto error; 783 return 0;
716 784
717 /* 785 /*
718 * Turn on BYPASS and drain all requests w/ elevator private data 786 * Turn on BYPASS and drain all requests w/ elevator private data
@@ -743,7 +811,7 @@ static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
743 /* 811 /*
744 * attach and start new elevator 812 * attach and start new elevator
745 */ 813 */
746 if (elevator_attach(q, new_e, e)) 814 if (elevator_attach(q, e))
747 goto fail; 815 goto fail;
748 816
749 if (elv_register_queue(q)) 817 if (elv_register_queue(q))
@@ -754,7 +822,7 @@ static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
754 */ 822 */
755 elevator_exit(old_elevator); 823 elevator_exit(old_elevator);
756 clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 824 clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
757 return; 825 return 1;
758 826
759fail_register: 827fail_register:
760 /* 828 /*
@@ -767,10 +835,9 @@ fail:
767 q->elevator = old_elevator; 835 q->elevator = old_elevator;
768 elv_register_queue(q); 836 elv_register_queue(q);
769 clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 837 clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
770 kfree(e); 838 if (e)
771error: 839 kobject_put(&e->kobj);
772 elevator_put(new_e); 840 return 0;
773 printk(KERN_ERR "elevator: switch to %s failed\n",new_e->elevator_name);
774} 841}
775 842
776ssize_t elv_iosched_store(request_queue_t *q, const char *name, size_t count) 843ssize_t elv_iosched_store(request_queue_t *q, const char *name, size_t count)
@@ -797,7 +864,8 @@ ssize_t elv_iosched_store(request_queue_t *q, const char *name, size_t count)
797 return count; 864 return count;
798 } 865 }
799 866
800 elevator_switch(q, e); 867 if (!elevator_switch(q, e))
868 printk(KERN_ERR "elevator: switch to %s failed\n",elevator_name);
801 return count; 869 return count;
802} 870}
803 871
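
elevator_switch() above now returns success or failure instead of printing from its error path, and elv_iosched_store() logs "switch to %s failed" only when the fallback to the old elevator was actually taken. A compact sketch of that try-the-new-one, fall-back-to-the-old shape (generic names, nothing kernel-specific):

#include <stdio.h>

struct sched { const char *name; int broken; };

static int sched_init(struct sched *s)
{
	return s->broken ? -1 : 0;	/* pretend init can fail */
}

/* Returns 1 on success, 0 if we had to keep the old scheduler. */
static int sched_switch(struct sched **cur, struct sched *new)
{
	struct sched *old = *cur;

	if (sched_init(new) == 0) {
		*cur = new;		/* commit */
		return 1;
	}
	*cur = old;			/* roll back to the known-good one */
	return 0;
}

int main(void)
{
	struct sched noop = { "noop", 0 }, bad = { "bad", 1 };
	struct sched *cur = &noop;

	if (!sched_switch(&cur, &bad))
		fprintf(stderr, "elevator: switch to %s failed\n", bad.name);
	printf("current: %s\n", cur->name);
	return 0;
}
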
diff --git a/block/genhd.c b/block/genhd.c
index db57546a70..db4c60c802 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -15,12 +15,13 @@
15#include <linux/kmod.h> 15#include <linux/kmod.h>
16#include <linux/kobj_map.h> 16#include <linux/kobj_map.h>
17#include <linux/buffer_head.h> 17#include <linux/buffer_head.h>
18#include <linux/mutex.h>
18 19
19#define MAX_PROBE_HASH 255 /* random */ 20#define MAX_PROBE_HASH 255 /* random */
20 21
21static struct subsystem block_subsys; 22static struct subsystem block_subsys;
22 23
23static DECLARE_MUTEX(block_subsys_sem); 24static DEFINE_MUTEX(block_subsys_lock);
24 25
25/* 26/*
26 * Can be deleted altogether. Later. 27 * Can be deleted altogether. Later.
@@ -46,7 +47,7 @@ struct blkdev_info {
46/* 47/*
47 * iterate over a list of blkdev_info structures. allows 48 * iterate over a list of blkdev_info structures. allows
48 * the major_names array to be iterated over from outside this file 49 * the major_names array to be iterated over from outside this file
49 * must be called with the block_subsys_sem held 50 * must be called with the block_subsys_lock held
50 */ 51 */
51void *get_next_blkdev(void *dev) 52void *get_next_blkdev(void *dev)
52{ 53{
@@ -85,20 +86,20 @@ out:
85 86
86void *acquire_blkdev_list(void) 87void *acquire_blkdev_list(void)
87{ 88{
88 down(&block_subsys_sem); 89 mutex_lock(&block_subsys_lock);
89 return get_next_blkdev(NULL); 90 return get_next_blkdev(NULL);
90} 91}
91 92
92void release_blkdev_list(void *dev) 93void release_blkdev_list(void *dev)
93{ 94{
94 up(&block_subsys_sem); 95 mutex_unlock(&block_subsys_lock);
95 kfree(dev); 96 kfree(dev);
96} 97}
97 98
98 99
99/* 100/*
100 * Count the number of records in the blkdev_list. 101 * Count the number of records in the blkdev_list.
101 * must be called with the block_subsys_sem held 102 * must be called with the block_subsys_lock held
102 */ 103 */
103int count_blkdev_list(void) 104int count_blkdev_list(void)
104{ 105{
@@ -118,7 +119,7 @@ int count_blkdev_list(void)
118/* 119/*
119 * extract the major and name values from a blkdev_info struct 120 * extract the major and name values from a blkdev_info struct
120 * passed in as a void to *dev. Must be called with 121 * passed in as a void to *dev. Must be called with
121 * block_subsys_sem held 122 * block_subsys_lock held
122 */ 123 */
123int get_blkdev_info(void *dev, int *major, char **name) 124int get_blkdev_info(void *dev, int *major, char **name)
124{ 125{
@@ -138,7 +139,7 @@ int register_blkdev(unsigned int major, const char *name)
138 struct blk_major_name **n, *p; 139 struct blk_major_name **n, *p;
139 int index, ret = 0; 140 int index, ret = 0;
140 141
141 down(&block_subsys_sem); 142 mutex_lock(&block_subsys_lock);
142 143
143 /* temporary */ 144 /* temporary */
144 if (major == 0) { 145 if (major == 0) {
@@ -183,7 +184,7 @@ int register_blkdev(unsigned int major, const char *name)
183 kfree(p); 184 kfree(p);
184 } 185 }
185out: 186out:
186 up(&block_subsys_sem); 187 mutex_unlock(&block_subsys_lock);
187 return ret; 188 return ret;
188} 189}
189 190
@@ -197,7 +198,7 @@ int unregister_blkdev(unsigned int major, const char *name)
197 int index = major_to_index(major); 198 int index = major_to_index(major);
198 int ret = 0; 199 int ret = 0;
199 200
200 down(&block_subsys_sem); 201 mutex_lock(&block_subsys_lock);
201 for (n = &major_names[index]; *n; n = &(*n)->next) 202 for (n = &major_names[index]; *n; n = &(*n)->next)
202 if ((*n)->major == major) 203 if ((*n)->major == major)
203 break; 204 break;
@@ -207,7 +208,7 @@ int unregister_blkdev(unsigned int major, const char *name)
207 p = *n; 208 p = *n;
208 *n = p->next; 209 *n = p->next;
209 } 210 }
210 up(&block_subsys_sem); 211 mutex_unlock(&block_subsys_lock);
211 kfree(p); 212 kfree(p);
212 213
213 return ret; 214 return ret;
@@ -301,7 +302,7 @@ static void *part_start(struct seq_file *part, loff_t *pos)
301 struct list_head *p; 302 struct list_head *p;
302 loff_t l = *pos; 303 loff_t l = *pos;
303 304
304 down(&block_subsys_sem); 305 mutex_lock(&block_subsys_lock);
305 list_for_each(p, &block_subsys.kset.list) 306 list_for_each(p, &block_subsys.kset.list)
306 if (!l--) 307 if (!l--)
307 return list_entry(p, struct gendisk, kobj.entry); 308 return list_entry(p, struct gendisk, kobj.entry);
@@ -318,7 +319,7 @@ static void *part_next(struct seq_file *part, void *v, loff_t *pos)
318 319
319static void part_stop(struct seq_file *part, void *v) 320static void part_stop(struct seq_file *part, void *v)
320{ 321{
321 up(&block_subsys_sem); 322 mutex_unlock(&block_subsys_lock);
322} 323}
323 324
324static int show_partition(struct seq_file *part, void *v) 325static int show_partition(struct seq_file *part, void *v)
@@ -377,7 +378,7 @@ static struct kobject *base_probe(dev_t dev, int *part, void *data)
377 378
378static int __init genhd_device_init(void) 379static int __init genhd_device_init(void)
379{ 380{
380 bdev_map = kobj_map_init(base_probe, &block_subsys_sem); 381 bdev_map = kobj_map_init(base_probe, &block_subsys_lock);
381 blk_dev_init(); 382 blk_dev_init();
382 subsystem_register(&block_subsys); 383 subsystem_register(&block_subsys);
383 return 0; 384 return 0;
@@ -453,8 +454,8 @@ static ssize_t disk_stats_read(struct gendisk * disk, char *page)
453 disk_round_stats(disk); 454 disk_round_stats(disk);
454 preempt_enable(); 455 preempt_enable();
455 return sprintf(page, 456 return sprintf(page,
456 "%8u %8u %8llu %8u " 457 "%8lu %8lu %8llu %8u "
457 "%8u %8u %8llu %8u " 458 "%8lu %8lu %8llu %8u "
458 "%8u %8u %8u" 459 "%8u %8u %8u"
459 "\n", 460 "\n",
460 disk_stat_read(disk, ios[READ]), 461 disk_stat_read(disk, ios[READ]),
@@ -611,7 +612,7 @@ static void *diskstats_start(struct seq_file *part, loff_t *pos)
611 loff_t k = *pos; 612 loff_t k = *pos;
612 struct list_head *p; 613 struct list_head *p;
613 614
614 down(&block_subsys_sem); 615 mutex_lock(&block_subsys_lock);
615 list_for_each(p, &block_subsys.kset.list) 616 list_for_each(p, &block_subsys.kset.list)
616 if (!k--) 617 if (!k--)
617 return list_entry(p, struct gendisk, kobj.entry); 618 return list_entry(p, struct gendisk, kobj.entry);
@@ -628,7 +629,7 @@ static void *diskstats_next(struct seq_file *part, void *v, loff_t *pos)
628 629
629static void diskstats_stop(struct seq_file *part, void *v) 630static void diskstats_stop(struct seq_file *part, void *v)
630{ 631{
631 up(&block_subsys_sem); 632 mutex_unlock(&block_subsys_lock);
632} 633}
633 634
634static int diskstats_show(struct seq_file *s, void *v) 635static int diskstats_show(struct seq_file *s, void *v)
@@ -648,7 +649,7 @@ static int diskstats_show(struct seq_file *s, void *v)
648 preempt_disable(); 649 preempt_disable();
649 disk_round_stats(gp); 650 disk_round_stats(gp);
650 preempt_enable(); 651 preempt_enable();
651 seq_printf(s, "%4d %4d %s %u %u %llu %u %u %u %llu %u %u %u %u\n", 652 seq_printf(s, "%4d %4d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n",
652 gp->major, n + gp->first_minor, disk_name(gp, n, buf), 653 gp->major, n + gp->first_minor, disk_name(gp, n, buf),
653 disk_stat_read(gp, ios[0]), disk_stat_read(gp, merges[0]), 654 disk_stat_read(gp, ios[0]), disk_stat_read(gp, merges[0]),
654 (unsigned long long)disk_stat_read(gp, sectors[0]), 655 (unsigned long long)disk_stat_read(gp, sectors[0]),
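
In genhd.c the block_subsys_sem semaphore, used purely as a mutex, becomes the block_subsys_lock mutex, still held from the seq_file start callback through stop while /proc/partitions and /proc/diskstats walk the subsystem list; the diskstats format strings also switch to %lu to match the unsigned long counters. A small sketch of the hold-across-iteration pattern with a pthread mutex (the iterator shape is simplified, not the seq_file API):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t subsys_lock = PTHREAD_MUTEX_INITIALIZER;
static const char *disks[] = { "sda", "sdb", "sdc" };

/* The lock is taken in start and only dropped in stop, so the list
 * cannot change underneath the walk. */
static const char **part_start(void)
{
	pthread_mutex_lock(&subsys_lock);
	return &disks[0];
}

static const char **part_next(const char **pos)
{
	pos++;
	return pos < disks + sizeof(disks) / sizeof(disks[0]) ? pos : NULL;
}

static void part_stop(void)
{
	pthread_mutex_unlock(&subsys_lock);
}

int main(void)
{
	for (const char **p = part_start(); p; p = part_next(p))
		printf("%s\n", *p);
	part_stop();
	return 0;
}
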
diff --git a/block/ioctl.c b/block/ioctl.c
index e1109491c2..9cfa2e1ecb 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -5,6 +5,7 @@
5#include <linux/backing-dev.h> 5#include <linux/backing-dev.h>
6#include <linux/buffer_head.h> 6#include <linux/buffer_head.h>
7#include <linux/smp_lock.h> 7#include <linux/smp_lock.h>
8#include <linux/blktrace_api.h>
8#include <asm/uaccess.h> 9#include <asm/uaccess.h>
9 10
10static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg) 11static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg)
@@ -42,9 +43,9 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
42 return -EINVAL; 43 return -EINVAL;
43 } 44 }
44 /* partition number in use? */ 45 /* partition number in use? */
45 down(&bdev->bd_sem); 46 mutex_lock(&bdev->bd_mutex);
46 if (disk->part[part - 1]) { 47 if (disk->part[part - 1]) {
47 up(&bdev->bd_sem); 48 mutex_unlock(&bdev->bd_mutex);
48 return -EBUSY; 49 return -EBUSY;
49 } 50 }
50 /* overlap? */ 51 /* overlap? */
@@ -55,13 +56,13 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
55 continue; 56 continue;
56 if (!(start+length <= s->start_sect || 57 if (!(start+length <= s->start_sect ||
57 start >= s->start_sect + s->nr_sects)) { 58 start >= s->start_sect + s->nr_sects)) {
58 up(&bdev->bd_sem); 59 mutex_unlock(&bdev->bd_mutex);
59 return -EBUSY; 60 return -EBUSY;
60 } 61 }
61 } 62 }
62 /* all seems OK */ 63 /* all seems OK */
63 add_partition(disk, part, start, length); 64 add_partition(disk, part, start, length);
64 up(&bdev->bd_sem); 65 mutex_unlock(&bdev->bd_mutex);
65 return 0; 66 return 0;
66 case BLKPG_DEL_PARTITION: 67 case BLKPG_DEL_PARTITION:
67 if (!disk->part[part-1]) 68 if (!disk->part[part-1])
@@ -71,9 +72,9 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
71 bdevp = bdget_disk(disk, part); 72 bdevp = bdget_disk(disk, part);
72 if (!bdevp) 73 if (!bdevp)
73 return -ENOMEM; 74 return -ENOMEM;
74 down(&bdevp->bd_sem); 75 mutex_lock(&bdevp->bd_mutex);
75 if (bdevp->bd_openers) { 76 if (bdevp->bd_openers) {
76 up(&bdevp->bd_sem); 77 mutex_unlock(&bdevp->bd_mutex);
77 bdput(bdevp); 78 bdput(bdevp);
78 return -EBUSY; 79 return -EBUSY;
79 } 80 }
@@ -81,10 +82,10 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
81 fsync_bdev(bdevp); 82 fsync_bdev(bdevp);
82 invalidate_bdev(bdevp, 0); 83 invalidate_bdev(bdevp, 0);
83 84
84 down(&bdev->bd_sem); 85 mutex_lock(&bdev->bd_mutex);
85 delete_partition(disk, part); 86 delete_partition(disk, part);
86 up(&bdev->bd_sem); 87 mutex_unlock(&bdev->bd_mutex);
87 up(&bdevp->bd_sem); 88 mutex_unlock(&bdevp->bd_mutex);
88 bdput(bdevp); 89 bdput(bdevp);
89 90
90 return 0; 91 return 0;
@@ -102,10 +103,10 @@ static int blkdev_reread_part(struct block_device *bdev)
102 return -EINVAL; 103 return -EINVAL;
103 if (!capable(CAP_SYS_ADMIN)) 104 if (!capable(CAP_SYS_ADMIN))
104 return -EACCES; 105 return -EACCES;
105 if (down_trylock(&bdev->bd_sem)) 106 if (!mutex_trylock(&bdev->bd_mutex))
106 return -EBUSY; 107 return -EBUSY;
107 res = rescan_partitions(disk, bdev); 108 res = rescan_partitions(disk, bdev);
108 up(&bdev->bd_sem); 109 mutex_unlock(&bdev->bd_mutex);
109 return res; 110 return res;
110} 111}
111 112
@@ -189,6 +190,11 @@ static int blkdev_locked_ioctl(struct file *file, struct block_device *bdev,
189 return put_ulong(arg, bdev->bd_inode->i_size >> 9); 190 return put_ulong(arg, bdev->bd_inode->i_size >> 9);
190 case BLKGETSIZE64: 191 case BLKGETSIZE64:
191 return put_u64(arg, bdev->bd_inode->i_size); 192 return put_u64(arg, bdev->bd_inode->i_size);
193 case BLKTRACESTART:
194 case BLKTRACESTOP:
195 case BLKTRACESETUP:
196 case BLKTRACETEARDOWN:
197 return blk_trace_ioctl(bdev, cmd, (char __user *) arg);
192 } 198 }
193 return -ENOIOCTLCMD; 199 return -ENOIOCTLCMD;
194} 200}
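
The ioctl path above converts bd_sem to bd_mutex and forwards the four new BLKTRACE* commands to a single blk_trace_ioctl() helper. A toy dispatcher showing the grouped-case forwarding; the command values and return codes here are invented for the sketch:

#include <errno.h>
#include <stdio.h>

enum { GETSIZE = 1, TRACE_SETUP, TRACE_START, TRACE_STOP, TRACE_TEARDOWN };

/* All trace commands share one entry point, as with blk_trace_ioctl(). */
static int trace_ioctl(unsigned int cmd, void *arg)
{
	(void)arg;
	printf("trace op %u\n", cmd);
	return 0;
}

static int blkdev_ioctl(unsigned int cmd, void *arg)
{
	switch (cmd) {
	case GETSIZE:
		return 512;
	case TRACE_SETUP:
	case TRACE_START:
	case TRACE_STOP:
	case TRACE_TEARDOWN:
		return trace_ioctl(cmd, arg);	/* grouped cases, one handler */
	}
	return -ENOTTY;		/* no handler here; the kernel returns -ENOIOCTLCMD */
}

int main(void)
{
	printf("%d\n", blkdev_ioctl(GETSIZE, NULL));
	printf("%d\n", blkdev_ioctl(TRACE_START, NULL));
	printf("%d\n", blkdev_ioctl(99, NULL));
	return 0;
}
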
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 0ef2971a9e..5b26af8597 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -28,6 +28,7 @@
28#include <linux/writeback.h> 28#include <linux/writeback.h>
29#include <linux/interrupt.h> 29#include <linux/interrupt.h>
30#include <linux/cpu.h> 30#include <linux/cpu.h>
31#include <linux/blktrace_api.h>
31 32
32/* 33/*
33 * for max sense size 34 * for max sense size
@@ -784,6 +785,8 @@ void blk_queue_stack_limits(request_queue_t *t, request_queue_t *b)
784 t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments); 785 t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments);
785 t->max_segment_size = min(t->max_segment_size,b->max_segment_size); 786 t->max_segment_size = min(t->max_segment_size,b->max_segment_size);
786 t->hardsect_size = max(t->hardsect_size,b->hardsect_size); 787 t->hardsect_size = max(t->hardsect_size,b->hardsect_size);
788 if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags))
789 clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags);
787} 790}
788 791
789EXPORT_SYMBOL(blk_queue_stack_limits); 792EXPORT_SYMBOL(blk_queue_stack_limits);
@@ -905,17 +908,15 @@ init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth)
905 __FUNCTION__, depth); 908 __FUNCTION__, depth);
906 } 909 }
907 910
908 tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC); 911 tag_index = kzalloc(depth * sizeof(struct request *), GFP_ATOMIC);
909 if (!tag_index) 912 if (!tag_index)
910 goto fail; 913 goto fail;
911 914
912 nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG; 915 nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG;
913 tag_map = kmalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC); 916 tag_map = kzalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC);
914 if (!tag_map) 917 if (!tag_map)
915 goto fail; 918 goto fail;
916 919
917 memset(tag_index, 0, depth * sizeof(struct request *));
918 memset(tag_map, 0, nr_ulongs * sizeof(unsigned long));
919 tags->real_max_depth = depth; 920 tags->real_max_depth = depth;
920 tags->max_depth = depth; 921 tags->max_depth = depth;
921 tags->tag_index = tag_index; 922 tags->tag_index = tag_index;
@@ -1556,8 +1557,10 @@ void blk_plug_device(request_queue_t *q)
1556 if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) 1557 if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
1557 return; 1558 return;
1558 1559
1559 if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) 1560 if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
1560 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); 1561 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
1562 blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
1563 }
1561} 1564}
1562 1565
1563EXPORT_SYMBOL(blk_plug_device); 1566EXPORT_SYMBOL(blk_plug_device);
@@ -1621,14 +1624,21 @@ static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
1621 /* 1624 /*
1622 * devices don't necessarily have an ->unplug_fn defined 1625 * devices don't necessarily have an ->unplug_fn defined
1623 */ 1626 */
1624 if (q->unplug_fn) 1627 if (q->unplug_fn) {
1628 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
1629 q->rq.count[READ] + q->rq.count[WRITE]);
1630
1625 q->unplug_fn(q); 1631 q->unplug_fn(q);
1632 }
1626} 1633}
1627 1634
1628static void blk_unplug_work(void *data) 1635static void blk_unplug_work(void *data)
1629{ 1636{
1630 request_queue_t *q = data; 1637 request_queue_t *q = data;
1631 1638
1639 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
1640 q->rq.count[READ] + q->rq.count[WRITE]);
1641
1632 q->unplug_fn(q); 1642 q->unplug_fn(q);
1633} 1643}
1634 1644
@@ -1636,6 +1646,9 @@ static void blk_unplug_timeout(unsigned long data)
1636{ 1646{
1637 request_queue_t *q = (request_queue_t *)data; 1647 request_queue_t *q = (request_queue_t *)data;
1638 1648
1649 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
1650 q->rq.count[READ] + q->rq.count[WRITE]);
1651
1639 kblockd_schedule_work(&q->unplug_work); 1652 kblockd_schedule_work(&q->unplug_work);
1640} 1653}
1641 1654
@@ -1740,16 +1753,11 @@ EXPORT_SYMBOL(blk_run_queue);
1740 * Hopefully the low level driver will have finished any 1753 * Hopefully the low level driver will have finished any
1741 * outstanding requests first... 1754 * outstanding requests first...
1742 **/ 1755 **/
1743void blk_cleanup_queue(request_queue_t * q) 1756static void blk_release_queue(struct kobject *kobj)
1744{ 1757{
1758 request_queue_t *q = container_of(kobj, struct request_queue, kobj);
1745 struct request_list *rl = &q->rq; 1759 struct request_list *rl = &q->rq;
1746 1760
1747 if (!atomic_dec_and_test(&q->refcnt))
1748 return;
1749
1750 if (q->elevator)
1751 elevator_exit(q->elevator);
1752
1753 blk_sync_queue(q); 1761 blk_sync_queue(q);
1754 1762
1755 if (rl->rq_pool) 1763 if (rl->rq_pool)
@@ -1758,9 +1766,30 @@ void blk_cleanup_queue(request_queue_t * q)
1758 if (q->queue_tags) 1766 if (q->queue_tags)
1759 __blk_queue_free_tags(q); 1767 __blk_queue_free_tags(q);
1760 1768
1769 if (q->blk_trace)
1770 blk_trace_shutdown(q);
1771
1761 kmem_cache_free(requestq_cachep, q); 1772 kmem_cache_free(requestq_cachep, q);
1762} 1773}
1763 1774
1775void blk_put_queue(request_queue_t *q)
1776{
1777 kobject_put(&q->kobj);
1778}
1779EXPORT_SYMBOL(blk_put_queue);
1780
1781void blk_cleanup_queue(request_queue_t * q)
1782{
1783 mutex_lock(&q->sysfs_lock);
1784 set_bit(QUEUE_FLAG_DEAD, &q->queue_flags);
1785 mutex_unlock(&q->sysfs_lock);
1786
1787 if (q->elevator)
1788 elevator_exit(q->elevator);
1789
1790 blk_put_queue(q);
1791}
1792
1764EXPORT_SYMBOL(blk_cleanup_queue); 1793EXPORT_SYMBOL(blk_cleanup_queue);
1765 1794
1766static int blk_init_free_list(request_queue_t *q) 1795static int blk_init_free_list(request_queue_t *q)
@@ -1788,6 +1817,8 @@ request_queue_t *blk_alloc_queue(gfp_t gfp_mask)
1788} 1817}
1789EXPORT_SYMBOL(blk_alloc_queue); 1818EXPORT_SYMBOL(blk_alloc_queue);
1790 1819
1820static struct kobj_type queue_ktype;
1821
1791request_queue_t *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) 1822request_queue_t *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
1792{ 1823{
1793 request_queue_t *q; 1824 request_queue_t *q;
@@ -1798,11 +1829,16 @@ request_queue_t *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
1798 1829
1799 memset(q, 0, sizeof(*q)); 1830 memset(q, 0, sizeof(*q));
1800 init_timer(&q->unplug_timer); 1831 init_timer(&q->unplug_timer);
1801 atomic_set(&q->refcnt, 1); 1832
1833 snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue");
1834 q->kobj.ktype = &queue_ktype;
1835 kobject_init(&q->kobj);
1802 1836
1803 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; 1837 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
1804 q->backing_dev_info.unplug_io_data = q; 1838 q->backing_dev_info.unplug_io_data = q;
1805 1839
1840 mutex_init(&q->sysfs_lock);
1841
1806 return q; 1842 return q;
1807} 1843}
1808EXPORT_SYMBOL(blk_alloc_queue_node); 1844EXPORT_SYMBOL(blk_alloc_queue_node);
@@ -1854,8 +1890,10 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
1854 return NULL; 1890 return NULL;
1855 1891
1856 q->node = node_id; 1892 q->node = node_id;
1857 if (blk_init_free_list(q)) 1893 if (blk_init_free_list(q)) {
1858 goto out_init; 1894 kmem_cache_free(requestq_cachep, q);
1895 return NULL;
1896 }
1859 1897
1860 /* 1898 /*
1861 * if caller didn't supply a lock, they get per-queue locking with 1899 * if caller didn't supply a lock, they get per-queue locking with
@@ -1891,9 +1929,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
1891 return q; 1929 return q;
1892 } 1930 }
1893 1931
1894 blk_cleanup_queue(q); 1932 blk_put_queue(q);
1895out_init:
1896 kmem_cache_free(requestq_cachep, q);
1897 return NULL; 1933 return NULL;
1898} 1934}
1899EXPORT_SYMBOL(blk_init_queue_node); 1935EXPORT_SYMBOL(blk_init_queue_node);
@@ -1901,7 +1937,7 @@ EXPORT_SYMBOL(blk_init_queue_node);
1901int blk_get_queue(request_queue_t *q) 1937int blk_get_queue(request_queue_t *q)
1902{ 1938{
1903 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { 1939 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
1904 atomic_inc(&q->refcnt); 1940 kobject_get(&q->kobj);
1905 return 0; 1941 return 0;
1906 } 1942 }
1907 1943
@@ -2109,6 +2145,8 @@ rq_starved:
2109 2145
2110 rq_init(q, rq); 2146 rq_init(q, rq);
2111 rq->rl = rl; 2147 rq->rl = rl;
2148
2149 blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
2112out: 2150out:
2113 return rq; 2151 return rq;
2114} 2152}
@@ -2137,6 +2175,8 @@ static struct request *get_request_wait(request_queue_t *q, int rw,
2137 if (!rq) { 2175 if (!rq) {
2138 struct io_context *ioc; 2176 struct io_context *ioc;
2139 2177
2178 blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
2179
2140 __generic_unplug_device(q); 2180 __generic_unplug_device(q);
2141 spin_unlock_irq(q->queue_lock); 2181 spin_unlock_irq(q->queue_lock);
2142 io_schedule(); 2182 io_schedule();
@@ -2190,6 +2230,8 @@ EXPORT_SYMBOL(blk_get_request);
2190 */ 2230 */
2191void blk_requeue_request(request_queue_t *q, struct request *rq) 2231void blk_requeue_request(request_queue_t *q, struct request *rq)
2192{ 2232{
2233 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
2234
2193 if (blk_rq_tagged(rq)) 2235 if (blk_rq_tagged(rq))
2194 blk_queue_end_tag(q, rq); 2236 blk_queue_end_tag(q, rq);
2195 2237
@@ -2437,10 +2479,12 @@ void blk_execute_rq_nowait(request_queue_t *q, struct gendisk *bd_disk,
2437 rq->rq_disk = bd_disk; 2479 rq->rq_disk = bd_disk;
2438 rq->flags |= REQ_NOMERGE; 2480 rq->flags |= REQ_NOMERGE;
2439 rq->end_io = done; 2481 rq->end_io = done;
2440 elv_add_request(q, rq, where, 1); 2482 WARN_ON(irqs_disabled());
2441 generic_unplug_device(q); 2483 spin_lock_irq(q->queue_lock);
2484 __elv_add_request(q, rq, where, 1);
2485 __generic_unplug_device(q);
2486 spin_unlock_irq(q->queue_lock);
2442} 2487}
2443
2444EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); 2488EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
2445 2489
2446/** 2490/**
@@ -2824,6 +2868,8 @@ static int __make_request(request_queue_t *q, struct bio *bio)
2824 if (!q->back_merge_fn(q, req, bio)) 2868 if (!q->back_merge_fn(q, req, bio))
2825 break; 2869 break;
2826 2870
2871 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
2872
2827 req->biotail->bi_next = bio; 2873 req->biotail->bi_next = bio;
2828 req->biotail = bio; 2874 req->biotail = bio;
2829 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 2875 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
@@ -2839,6 +2885,8 @@ static int __make_request(request_queue_t *q, struct bio *bio)
2839 if (!q->front_merge_fn(q, req, bio)) 2885 if (!q->front_merge_fn(q, req, bio))
2840 break; 2886 break;
2841 2887
2888 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
2889
2842 bio->bi_next = req->bio; 2890 bio->bi_next = req->bio;
2843 req->bio = bio; 2891 req->bio = bio;
2844 2892
@@ -2956,6 +3004,7 @@ void generic_make_request(struct bio *bio)
2956 request_queue_t *q; 3004 request_queue_t *q;
2957 sector_t maxsector; 3005 sector_t maxsector;
2958 int ret, nr_sectors = bio_sectors(bio); 3006 int ret, nr_sectors = bio_sectors(bio);
3007 dev_t old_dev;
2959 3008
2960 might_sleep(); 3009 might_sleep();
2961 /* Test device or partition size, when known. */ 3010 /* Test device or partition size, when known. */
@@ -2982,6 +3031,8 @@ void generic_make_request(struct bio *bio)
2982 * NOTE: we don't repeat the blk_size check for each new device. 3031 * NOTE: we don't repeat the blk_size check for each new device.
2983 * Stacking drivers are expected to know what they are doing. 3032 * Stacking drivers are expected to know what they are doing.
2984 */ 3033 */
3034 maxsector = -1;
3035 old_dev = 0;
2985 do { 3036 do {
2986 char b[BDEVNAME_SIZE]; 3037 char b[BDEVNAME_SIZE];
2987 3038
@@ -3014,6 +3065,15 @@ end_io:
3014 */ 3065 */
3015 blk_partition_remap(bio); 3066 blk_partition_remap(bio);
3016 3067
3068 if (maxsector != -1)
3069 blk_add_trace_remap(q, bio, old_dev, bio->bi_sector,
3070 maxsector);
3071
3072 blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
3073
3074 maxsector = bio->bi_sector;
3075 old_dev = bio->bi_bdev->bd_dev;
3076
3017 ret = q->make_request_fn(q, bio); 3077 ret = q->make_request_fn(q, bio);
3018 } while (ret); 3078 } while (ret);
3019} 3079}
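
generic_make_request() above remembers the device and sector the bio pointed at on the previous pass (old_dev, with maxsector reused as the old sector), so each pass through a stacking driver can be logged as a remap event. A simplified loop carrying the previous hop the same way; the hop values are made up:

#include <stdio.h>

struct hop { int dev; long sector; };

int main(void)
{
	/* Path a request takes through stacked drivers (values invented). */
	struct hop path[] = { { 8, 1000 }, { 9, 52424 }, { 253, 8 } };
	int old_dev = 0;
	long old_sector = -1;

	for (int i = 0; i < 3; i++) {
		if (old_sector != -1)	/* every hop after the first is logged */
			printf("remap: from %d/%ld to %d/%ld\n",
			       old_dev, old_sector, path[i].dev, path[i].sector);
		old_dev = path[i].dev;
		old_sector = path[i].sector;
	}
	return 0;
}
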
@@ -3133,6 +3193,8 @@ static int __end_that_request_first(struct request *req, int uptodate,
3133 int total_bytes, bio_nbytes, error, next_idx = 0; 3193 int total_bytes, bio_nbytes, error, next_idx = 0;
3134 struct bio *bio; 3194 struct bio *bio;
3135 3195
3196 blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
3197
3136 /* 3198 /*
3137 * extend uptodate bool to allow < 0 value to be direct io error 3199 * extend uptodate bool to allow < 0 value to be direct io error
3138 */ 3200 */
@@ -3452,7 +3514,7 @@ int __init blk_dev_init(void)
3452 iocontext_cachep = kmem_cache_create("blkdev_ioc", 3514 iocontext_cachep = kmem_cache_create("blkdev_ioc",
3453 sizeof(struct io_context), 0, SLAB_PANIC, NULL, NULL); 3515 sizeof(struct io_context), 0, SLAB_PANIC, NULL, NULL);
3454 3516
3455 for_each_cpu(i) 3517 for_each_possible_cpu(i)
3456 INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); 3518 INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
3457 3519
3458 open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL); 3520 open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
@@ -3477,10 +3539,18 @@ void put_io_context(struct io_context *ioc)
3477 BUG_ON(atomic_read(&ioc->refcount) == 0); 3539 BUG_ON(atomic_read(&ioc->refcount) == 0);
3478 3540
3479 if (atomic_dec_and_test(&ioc->refcount)) { 3541 if (atomic_dec_and_test(&ioc->refcount)) {
3542 struct cfq_io_context *cic;
3543
3544 rcu_read_lock();
3480 if (ioc->aic && ioc->aic->dtor) 3545 if (ioc->aic && ioc->aic->dtor)
3481 ioc->aic->dtor(ioc->aic); 3546 ioc->aic->dtor(ioc->aic);
3482 if (ioc->cic && ioc->cic->dtor) 3547 if (ioc->cic_root.rb_node != NULL) {
3483 ioc->cic->dtor(ioc->cic); 3548 struct rb_node *n = rb_first(&ioc->cic_root);
3549
3550 cic = rb_entry(n, struct cfq_io_context, rb_node);
3551 cic->dtor(ioc);
3552 }
3553 rcu_read_unlock();
3484 3554
3485 kmem_cache_free(iocontext_cachep, ioc); 3555 kmem_cache_free(iocontext_cachep, ioc);
3486 } 3556 }
@@ -3492,6 +3562,7 @@ void exit_io_context(void)
3492{ 3562{
3493 unsigned long flags; 3563 unsigned long flags;
3494 struct io_context *ioc; 3564 struct io_context *ioc;
3565 struct cfq_io_context *cic;
3495 3566
3496 local_irq_save(flags); 3567 local_irq_save(flags);
3497 task_lock(current); 3568 task_lock(current);
@@ -3503,9 +3574,11 @@ void exit_io_context(void)
3503 3574
3504 if (ioc->aic && ioc->aic->exit) 3575 if (ioc->aic && ioc->aic->exit)
3505 ioc->aic->exit(ioc->aic); 3576 ioc->aic->exit(ioc->aic);
3506 if (ioc->cic && ioc->cic->exit) 3577 if (ioc->cic_root.rb_node != NULL) {
3507 ioc->cic->exit(ioc->cic); 3578 cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node);
3508 3579 cic->exit(ioc);
3580 }
3581
3509 put_io_context(ioc); 3582 put_io_context(ioc);
3510} 3583}
3511 3584
@@ -3534,7 +3607,7 @@ struct io_context *current_io_context(gfp_t gfp_flags)
3534 ret->last_waited = jiffies; /* doesn't matter... */ 3607 ret->last_waited = jiffies; /* doesn't matter... */
3535 ret->nr_batch_requests = 0; /* because this is 0 */ 3608 ret->nr_batch_requests = 0; /* because this is 0 */
3536 ret->aic = NULL; 3609 ret->aic = NULL;
3537 ret->cic = NULL; 3610 ret->cic_root.rb_node = NULL;
3538 tsk->io_context = ret; 3611 tsk->io_context = ret;
3539 } 3612 }
3540 3613
@@ -3614,10 +3687,13 @@ static ssize_t
3614queue_requests_store(struct request_queue *q, const char *page, size_t count) 3687queue_requests_store(struct request_queue *q, const char *page, size_t count)
3615{ 3688{
3616 struct request_list *rl = &q->rq; 3689 struct request_list *rl = &q->rq;
3690 unsigned long nr;
3691 int ret = queue_var_store(&nr, page, count);
3692 if (nr < BLKDEV_MIN_RQ)
3693 nr = BLKDEV_MIN_RQ;
3617 3694
3618 int ret = queue_var_store(&q->nr_requests, page, count); 3695 spin_lock_irq(q->queue_lock);
3619 if (q->nr_requests < BLKDEV_MIN_RQ) 3696 q->nr_requests = nr;
3620 q->nr_requests = BLKDEV_MIN_RQ;
3621 blk_queue_congestion_threshold(q); 3697 blk_queue_congestion_threshold(q);
3622 3698
3623 if (rl->count[READ] >= queue_congestion_on_threshold(q)) 3699 if (rl->count[READ] >= queue_congestion_on_threshold(q))
@@ -3643,6 +3719,7 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
3643 blk_clear_queue_full(q, WRITE); 3719 blk_clear_queue_full(q, WRITE);
3644 wake_up(&rl->wait[WRITE]); 3720 wake_up(&rl->wait[WRITE]);
3645 } 3721 }
3722 spin_unlock_irq(q->queue_lock);
3646 return ret; 3723 return ret;
3647} 3724}
3648 3725
@@ -3758,13 +3835,19 @@ static ssize_t
3758queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3835queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3759{ 3836{
3760 struct queue_sysfs_entry *entry = to_queue(attr); 3837 struct queue_sysfs_entry *entry = to_queue(attr);
3761 struct request_queue *q; 3838 request_queue_t *q = container_of(kobj, struct request_queue, kobj);
3839 ssize_t res;
3762 3840
3763 q = container_of(kobj, struct request_queue, kobj);
3764 if (!entry->show) 3841 if (!entry->show)
3765 return -EIO; 3842 return -EIO;
3766 3843 mutex_lock(&q->sysfs_lock);
3767 return entry->show(q, page); 3844 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
3845 mutex_unlock(&q->sysfs_lock);
3846 return -ENOENT;
3847 }
3848 res = entry->show(q, page);
3849 mutex_unlock(&q->sysfs_lock);
3850 return res;
3768} 3851}
3769 3852
3770static ssize_t 3853static ssize_t
@@ -3772,13 +3855,20 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
3772 const char *page, size_t length) 3855 const char *page, size_t length)
3773{ 3856{
3774 struct queue_sysfs_entry *entry = to_queue(attr); 3857 struct queue_sysfs_entry *entry = to_queue(attr);
3775 struct request_queue *q; 3858 request_queue_t *q = container_of(kobj, struct request_queue, kobj);
3859
3860 ssize_t res;
3776 3861
3777 q = container_of(kobj, struct request_queue, kobj);
3778 if (!entry->store) 3862 if (!entry->store)
3779 return -EIO; 3863 return -EIO;
3780 3864 mutex_lock(&q->sysfs_lock);
3781 return entry->store(q, page, length); 3865 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
3866 mutex_unlock(&q->sysfs_lock);
3867 return -ENOENT;
3868 }
3869 res = entry->store(q, page, length);
3870 mutex_unlock(&q->sysfs_lock);
3871 return res;
3782} 3872}
3783 3873
3784static struct sysfs_ops queue_sysfs_ops = { 3874static struct sysfs_ops queue_sysfs_ops = {
@@ -3789,6 +3879,7 @@ static struct sysfs_ops queue_sysfs_ops = {
3789static struct kobj_type queue_ktype = { 3879static struct kobj_type queue_ktype = {
3790 .sysfs_ops = &queue_sysfs_ops, 3880 .sysfs_ops = &queue_sysfs_ops,
3791 .default_attrs = default_attrs, 3881 .default_attrs = default_attrs,
3882 .release = blk_release_queue,
3792}; 3883};
3793 3884
3794int blk_register_queue(struct gendisk *disk) 3885int blk_register_queue(struct gendisk *disk)
@@ -3801,19 +3892,17 @@ int blk_register_queue(struct gendisk *disk)
3801 return -ENXIO; 3892 return -ENXIO;
3802 3893
3803 q->kobj.parent = kobject_get(&disk->kobj); 3894 q->kobj.parent = kobject_get(&disk->kobj);
3804 if (!q->kobj.parent)
3805 return -EBUSY;
3806 3895
3807 snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue"); 3896 ret = kobject_add(&q->kobj);
3808 q->kobj.ktype = &queue_ktype;
3809
3810 ret = kobject_register(&q->kobj);
3811 if (ret < 0) 3897 if (ret < 0)
3812 return ret; 3898 return ret;
3813 3899
3900 kobject_uevent(&q->kobj, KOBJ_ADD);
3901
3814 ret = elv_register_queue(q); 3902 ret = elv_register_queue(q);
3815 if (ret) { 3903 if (ret) {
3816 kobject_unregister(&q->kobj); 3904 kobject_uevent(&q->kobj, KOBJ_REMOVE);
3905 kobject_del(&q->kobj);
3817 return ret; 3906 return ret;
3818 } 3907 }
3819 3908
@@ -3827,7 +3916,8 @@ void blk_unregister_queue(struct gendisk *disk)
3827 if (q && q->request_fn) { 3916 if (q && q->request_fn) {
3828 elv_unregister_queue(q); 3917 elv_unregister_queue(q);
3829 3918
3830 kobject_unregister(&q->kobj); 3919 kobject_uevent(&q->kobj, KOBJ_REMOVE);
3920 kobject_del(&q->kobj);
3831 kobject_put(&disk->kobj); 3921 kobject_put(&disk->kobj);
3832 } 3922 }
3833} 3923}