about | summary | refs | log | tree | commit | diff | stats
path: root/block
diff options
context:
space:
mode:
Diffstat (limited to 'block')
-rw-r--r-- block/Kconfig 2
-rw-r--r-- block/blk-cgroup.c 4
-rw-r--r-- block/blk-core.c 40
-rw-r--r-- block/blk-ioc.c 5
-rw-r--r-- block/blk-merge.c 3
-rw-r--r-- block/blk-throttle.c 10
-rw-r--r-- block/cfq-iosched.c 140
-rw-r--r-- block/genhd.c 550
-rw-r--r-- block/ioctl.c 5
9 files changed, 650 insertions, 109 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 6c9213ef15a1..60be1e0455da 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -2,7 +2,7 @@
2# Block layer core configuration 2# Block layer core configuration
3# 3#
4menuconfig BLOCK 4menuconfig BLOCK
5 bool "Enable the block layer" if EMBEDDED 5 bool "Enable the block layer" if EXPERT
6 default y 6 default y
7 help 7 help
8 Provide block layer support for the kernel. 8 Provide block layer support for the kernel.
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b1febd0f6d2a..455768a3eb9e 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1452,10 +1452,6 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1452 goto done; 1452 goto done;
1453 } 1453 }
1454 1454
1455 /* Currently we do not support hierarchy deeper than two level (0,1) */
1456 if (parent != cgroup->top_cgroup)
1457 return ERR_PTR(-EPERM);
1458
1459 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); 1455 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
1460 if (!blkcg) 1456 if (!blkcg)
1461 return ERR_PTR(-ENOMEM); 1457 return ERR_PTR(-ENOMEM);
diff --git a/block/blk-core.c b/block/blk-core.c
index ab4a7696956d..3cc17e6064d6 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -33,7 +33,7 @@
33 33
34#include "blk.h" 34#include "blk.h"
35 35
36EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap); 36EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
37EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); 37EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
38EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); 38EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
39 39
@@ -64,13 +64,27 @@ static void drive_stat_acct(struct request *rq, int new_io)
64 return; 64 return;
65 65
66 cpu = part_stat_lock(); 66 cpu = part_stat_lock();
67 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
68 67
69 if (!new_io) 68 if (!new_io) {
69 part = rq->part;
70 part_stat_inc(cpu, part, merges[rw]); 70 part_stat_inc(cpu, part, merges[rw]);
71 else { 71 } else {
72 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
73 if (!hd_struct_try_get(part)) {
74 /*
75 * The partition is already being removed,
76 * the request will be accounted on the disk only
77 *
78 * We take a reference on disk->part0 although that
79 * partition will never be deleted, so we can treat
80 * it as any other partition.
81 */
82 part = &rq->rq_disk->part0;
83 hd_struct_get(part);
84 }
72 part_round_stats(cpu, part); 85 part_round_stats(cpu, part);
73 part_inc_in_flight(part, rw); 86 part_inc_in_flight(part, rw);
87 rq->part = part;
74 } 88 }
75 89
76 part_stat_unlock(); 90 part_stat_unlock();
@@ -128,6 +142,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
128 rq->ref_count = 1; 142 rq->ref_count = 1;
129 rq->start_time = jiffies; 143 rq->start_time = jiffies;
130 set_start_time_ns(rq); 144 set_start_time_ns(rq);
145 rq->part = NULL;
131} 146}
132EXPORT_SYMBOL(blk_rq_init); 147EXPORT_SYMBOL(blk_rq_init);
133 148
@@ -1342,9 +1357,9 @@ static inline void blk_partition_remap(struct bio *bio)
1342 bio->bi_sector += p->start_sect; 1357 bio->bi_sector += p->start_sect;
1343 bio->bi_bdev = bdev->bd_contains; 1358 bio->bi_bdev = bdev->bd_contains;
1344 1359
1345 trace_block_remap(bdev_get_queue(bio->bi_bdev), bio, 1360 trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio,
1346 bdev->bd_dev, 1361 bdev->bd_dev,
1347 bio->bi_sector - p->start_sect); 1362 bio->bi_sector - p->start_sect);
1348 } 1363 }
1349} 1364}
1350 1365
@@ -1513,7 +1528,7 @@ static inline void __generic_make_request(struct bio *bio)
1513 goto end_io; 1528 goto end_io;
1514 1529
1515 if (old_sector != -1) 1530 if (old_sector != -1)
1516 trace_block_remap(q, bio, old_dev, old_sector); 1531 trace_block_bio_remap(q, bio, old_dev, old_sector);
1517 1532
1518 old_sector = bio->bi_sector; 1533 old_sector = bio->bi_sector;
1519 old_dev = bio->bi_bdev->bd_dev; 1534 old_dev = bio->bi_bdev->bd_dev;
@@ -1789,7 +1804,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
1789 int cpu; 1804 int cpu;
1790 1805
1791 cpu = part_stat_lock(); 1806 cpu = part_stat_lock();
1792 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); 1807 part = req->part;
1793 part_stat_add(cpu, part, sectors[rw], bytes >> 9); 1808 part_stat_add(cpu, part, sectors[rw], bytes >> 9);
1794 part_stat_unlock(); 1809 part_stat_unlock();
1795 } 1810 }
@@ -1809,13 +1824,14 @@ static void blk_account_io_done(struct request *req)
1809 int cpu; 1824 int cpu;
1810 1825
1811 cpu = part_stat_lock(); 1826 cpu = part_stat_lock();
1812 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); 1827 part = req->part;
1813 1828
1814 part_stat_inc(cpu, part, ios[rw]); 1829 part_stat_inc(cpu, part, ios[rw]);
1815 part_stat_add(cpu, part, ticks[rw], duration); 1830 part_stat_add(cpu, part, ticks[rw], duration);
1816 part_round_stats(cpu, part); 1831 part_round_stats(cpu, part);
1817 part_dec_in_flight(part, rw); 1832 part_dec_in_flight(part, rw);
1818 1833
1834 hd_struct_put(part);
1819 part_stat_unlock(); 1835 part_stat_unlock();
1820 } 1836 }
1821} 1837}
@@ -2619,7 +2635,9 @@ int __init blk_dev_init(void)
2619 BUILD_BUG_ON(__REQ_NR_BITS > 8 * 2635 BUILD_BUG_ON(__REQ_NR_BITS > 8 *
2620 sizeof(((struct request *)0)->cmd_flags)); 2636 sizeof(((struct request *)0)->cmd_flags));
2621 2637
2622 kblockd_workqueue = create_workqueue("kblockd"); 2638 /* used for unplugging and affects IO latency/throughput - HIGHPRI */
2639 kblockd_workqueue = alloc_workqueue("kblockd",
2640 WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
2623 if (!kblockd_workqueue) 2641 if (!kblockd_workqueue)
2624 panic("Failed to create kblockd\n"); 2642 panic("Failed to create kblockd\n");
2625 2643
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 3c7a339fe381..b791022beef3 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -64,7 +64,7 @@ static void cfq_exit(struct io_context *ioc)
64 rcu_read_unlock(); 64 rcu_read_unlock();
65} 65}
66 66
67/* Called by the exitting task */ 67/* Called by the exiting task */
68void exit_io_context(struct task_struct *task) 68void exit_io_context(struct task_struct *task)
69{ 69{
70 struct io_context *ioc; 70 struct io_context *ioc;
@@ -74,10 +74,9 @@ void exit_io_context(struct task_struct *task)
74 task->io_context = NULL; 74 task->io_context = NULL;
75 task_unlock(task); 75 task_unlock(task);
76 76
77 if (atomic_dec_and_test(&ioc->nr_tasks)) { 77 if (atomic_dec_and_test(&ioc->nr_tasks))
78 cfq_exit(ioc); 78 cfq_exit(ioc);
79 79
80 }
81 put_io_context(ioc); 80 put_io_context(ioc);
82} 81}
83 82
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 74bc4a768f32..ea85e20d5e94 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -351,11 +351,12 @@ static void blk_account_io_merge(struct request *req)
351 int cpu; 351 int cpu;
352 352
353 cpu = part_stat_lock(); 353 cpu = part_stat_lock();
354 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); 354 part = req->part;
355 355
356 part_round_stats(cpu, part); 356 part_round_stats(cpu, part);
357 part_dec_in_flight(part, rq_data_dir(req)); 357 part_dec_in_flight(part, rq_data_dir(req));
358 358
359 hd_struct_put(part);
359 part_stat_unlock(); 360 part_stat_unlock();
360 } 361 }
361} 362}
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 381b09bb562b..a89043a3caa4 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -168,7 +168,15 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
168 * tree of blkg (instead of traversing through hash list all 168 * tree of blkg (instead of traversing through hash list all
169 * the time. 169 * the time.
170 */ 170 */
171 tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); 171
172 /*
173 * This is the common case when there are no blkio cgroups.
174 * Avoid lookup in this case
175 */
176 if (blkcg == &blkio_root_cgroup)
177 tg = &td->root_tg;
178 else
179 tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
172 180
173 /* Fill in device details for root group */ 181 /* Fill in device details for root group */
174 if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { 182 if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 968455c57e1a..f27ff3efe6cd 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -87,7 +87,6 @@ struct cfq_rb_root {
87 unsigned count; 87 unsigned count;
88 unsigned total_weight; 88 unsigned total_weight;
89 u64 min_vdisktime; 89 u64 min_vdisktime;
90 struct rb_node *active;
91}; 90};
92#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \ 91#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
93 .count = 0, .min_vdisktime = 0, } 92 .count = 0, .min_vdisktime = 0, }
@@ -97,7 +96,7 @@ struct cfq_rb_root {
97 */ 96 */
98struct cfq_queue { 97struct cfq_queue {
99 /* reference count */ 98 /* reference count */
100 atomic_t ref; 99 int ref;
101 /* various state flags, see below */ 100 /* various state flags, see below */
102 unsigned int flags; 101 unsigned int flags;
103 /* parent cfq_data */ 102 /* parent cfq_data */
@@ -180,7 +179,6 @@ struct cfq_group {
180 /* group service_tree key */ 179 /* group service_tree key */
181 u64 vdisktime; 180 u64 vdisktime;
182 unsigned int weight; 181 unsigned int weight;
183 bool on_st;
184 182
185 /* number of cfqq currently on this group */ 183 /* number of cfqq currently on this group */
186 int nr_cfqq; 184 int nr_cfqq;
@@ -209,7 +207,7 @@ struct cfq_group {
209 struct blkio_group blkg; 207 struct blkio_group blkg;
210#ifdef CONFIG_CFQ_GROUP_IOSCHED 208#ifdef CONFIG_CFQ_GROUP_IOSCHED
211 struct hlist_node cfqd_node; 209 struct hlist_node cfqd_node;
212 atomic_t ref; 210 int ref;
213#endif 211#endif
214 /* number of requests that are on the dispatch list or inside driver */ 212 /* number of requests that are on the dispatch list or inside driver */
215 int dispatched; 213 int dispatched;
@@ -563,11 +561,6 @@ static void update_min_vdisktime(struct cfq_rb_root *st)
563 u64 vdisktime = st->min_vdisktime; 561 u64 vdisktime = st->min_vdisktime;
564 struct cfq_group *cfqg; 562 struct cfq_group *cfqg;
565 563
566 if (st->active) {
567 cfqg = rb_entry_cfqg(st->active);
568 vdisktime = cfqg->vdisktime;
569 }
570
571 if (st->left) { 564 if (st->left) {
572 cfqg = rb_entry_cfqg(st->left); 565 cfqg = rb_entry_cfqg(st->left);
573 vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); 566 vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
@@ -605,8 +598,8 @@ cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
605 return cfq_target_latency * cfqg->weight / st->total_weight; 598 return cfq_target_latency * cfqg->weight / st->total_weight;
606} 599}
607 600
608static inline void 601static inline unsigned
609cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 602cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
610{ 603{
611 unsigned slice = cfq_prio_to_slice(cfqd, cfqq); 604 unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
612 if (cfqd->cfq_latency) { 605 if (cfqd->cfq_latency) {
@@ -632,6 +625,14 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
632 low_slice); 625 low_slice);
633 } 626 }
634 } 627 }
628 return slice;
629}
630
631static inline void
632cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
633{
634 unsigned slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
635
635 cfqq->slice_start = jiffies; 636 cfqq->slice_start = jiffies;
636 cfqq->slice_end = jiffies + slice; 637 cfqq->slice_end = jiffies + slice;
637 cfqq->allocated_slice = slice; 638 cfqq->allocated_slice = slice;
@@ -646,11 +647,11 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
646static inline bool cfq_slice_used(struct cfq_queue *cfqq) 647static inline bool cfq_slice_used(struct cfq_queue *cfqq)
647{ 648{
648 if (cfq_cfqq_slice_new(cfqq)) 649 if (cfq_cfqq_slice_new(cfqq))
649 return 0; 650 return false;
650 if (time_before(jiffies, cfqq->slice_end)) 651 if (time_before(jiffies, cfqq->slice_end))
651 return 0; 652 return false;
652 653
653 return 1; 654 return true;
654} 655}
655 656
656/* 657/*
@@ -869,7 +870,7 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
869 struct rb_node *n; 870 struct rb_node *n;
870 871
871 cfqg->nr_cfqq++; 872 cfqg->nr_cfqq++;
872 if (cfqg->on_st) 873 if (!RB_EMPTY_NODE(&cfqg->rb_node))
873 return; 874 return;
874 875
875 /* 876 /*
@@ -885,7 +886,6 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
885 cfqg->vdisktime = st->min_vdisktime; 886 cfqg->vdisktime = st->min_vdisktime;
886 887
887 __cfq_group_service_tree_add(st, cfqg); 888 __cfq_group_service_tree_add(st, cfqg);
888 cfqg->on_st = true;
889 st->total_weight += cfqg->weight; 889 st->total_weight += cfqg->weight;
890} 890}
891 891
@@ -894,9 +894,6 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
894{ 894{
895 struct cfq_rb_root *st = &cfqd->grp_service_tree; 895 struct cfq_rb_root *st = &cfqd->grp_service_tree;
896 896
897 if (st->active == &cfqg->rb_node)
898 st->active = NULL;
899
900 BUG_ON(cfqg->nr_cfqq < 1); 897 BUG_ON(cfqg->nr_cfqq < 1);
901 cfqg->nr_cfqq--; 898 cfqg->nr_cfqq--;
902 899
@@ -905,7 +902,6 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
905 return; 902 return;
906 903
907 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); 904 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
908 cfqg->on_st = false;
909 st->total_weight -= cfqg->weight; 905 st->total_weight -= cfqg->weight;
910 if (!RB_EMPTY_NODE(&cfqg->rb_node)) 906 if (!RB_EMPTY_NODE(&cfqg->rb_node))
911 cfq_rb_erase(&cfqg->rb_node, st); 907 cfq_rb_erase(&cfqg->rb_node, st);
@@ -1026,11 +1022,11 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
1026 * elevator which will be dropped by either elevator exit 1022 * elevator which will be dropped by either elevator exit
1027 * or cgroup deletion path depending on who is exiting first. 1023 * or cgroup deletion path depending on who is exiting first.
1028 */ 1024 */
1029 atomic_set(&cfqg->ref, 1); 1025 cfqg->ref = 1;
1030 1026
1031 /* 1027 /*
1032 * Add group onto cgroup list. It might happen that bdi->dev is 1028 * Add group onto cgroup list. It might happen that bdi->dev is
1033 * not initiliazed yet. Initialize this new group without major 1029 * not initialized yet. Initialize this new group without major
1034 * and minor info and this info will be filled in once a new thread 1030 * and minor info and this info will be filled in once a new thread
1035 * comes for IO. See code above. 1031 * comes for IO. See code above.
1036 */ 1032 */
@@ -1071,7 +1067,7 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
1071 1067
1072static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) 1068static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1073{ 1069{
1074 atomic_inc(&cfqg->ref); 1070 cfqg->ref++;
1075 return cfqg; 1071 return cfqg;
1076} 1072}
1077 1073
@@ -1083,7 +1079,7 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1083 1079
1084 cfqq->cfqg = cfqg; 1080 cfqq->cfqg = cfqg;
1085 /* cfqq reference on cfqg */ 1081 /* cfqq reference on cfqg */
1086 atomic_inc(&cfqq->cfqg->ref); 1082 cfqq->cfqg->ref++;
1087} 1083}
1088 1084
1089static void cfq_put_cfqg(struct cfq_group *cfqg) 1085static void cfq_put_cfqg(struct cfq_group *cfqg)
@@ -1091,11 +1087,12 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
1091 struct cfq_rb_root *st; 1087 struct cfq_rb_root *st;
1092 int i, j; 1088 int i, j;
1093 1089
1094 BUG_ON(atomic_read(&cfqg->ref) <= 0); 1090 BUG_ON(cfqg->ref <= 0);
1095 if (!atomic_dec_and_test(&cfqg->ref)) 1091 cfqg->ref--;
1092 if (cfqg->ref)
1096 return; 1093 return;
1097 for_each_cfqg_st(cfqg, i, j, st) 1094 for_each_cfqg_st(cfqg, i, j, st)
1098 BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); 1095 BUG_ON(!RB_EMPTY_ROOT(&st->rb));
1099 kfree(cfqg); 1096 kfree(cfqg);
1100} 1097}
1101 1098
@@ -1200,7 +1197,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1200 cfq_group_service_tree_del(cfqd, cfqq->cfqg); 1197 cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1201 cfqq->orig_cfqg = cfqq->cfqg; 1198 cfqq->orig_cfqg = cfqq->cfqg;
1202 cfqq->cfqg = &cfqd->root_group; 1199 cfqq->cfqg = &cfqd->root_group;
1203 atomic_inc(&cfqd->root_group.ref); 1200 cfqd->root_group.ref++;
1204 group_changed = 1; 1201 group_changed = 1;
1205 } else if (!cfqd->cfq_group_isolation 1202 } else if (!cfqd->cfq_group_isolation
1206 && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) { 1203 && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
@@ -1672,8 +1669,11 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1672 /* 1669 /*
1673 * store what was left of this slice, if the queue idled/timed out 1670 * store what was left of this slice, if the queue idled/timed out
1674 */ 1671 */
1675 if (timed_out && !cfq_cfqq_slice_new(cfqq)) { 1672 if (timed_out) {
1676 cfqq->slice_resid = cfqq->slice_end - jiffies; 1673 if (cfq_cfqq_slice_new(cfqq))
1674 cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
1675 else
1676 cfqq->slice_resid = cfqq->slice_end - jiffies;
1677 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); 1677 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
1678 } 1678 }
1679 1679
@@ -1687,9 +1687,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1687 if (cfqq == cfqd->active_queue) 1687 if (cfqq == cfqd->active_queue)
1688 cfqd->active_queue = NULL; 1688 cfqd->active_queue = NULL;
1689 1689
1690 if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
1691 cfqd->grp_service_tree.active = NULL;
1692
1693 if (cfqd->active_cic) { 1690 if (cfqd->active_cic) {
1694 put_io_context(cfqd->active_cic->ioc); 1691 put_io_context(cfqd->active_cic->ioc);
1695 cfqd->active_cic = NULL; 1692 cfqd->active_cic = NULL;
@@ -1901,10 +1898,10 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1901 * in their service tree. 1898 * in their service tree.
1902 */ 1899 */
1903 if (service_tree->count == 1 && cfq_cfqq_sync(cfqq)) 1900 if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
1904 return 1; 1901 return true;
1905 cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", 1902 cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
1906 service_tree->count); 1903 service_tree->count);
1907 return 0; 1904 return false;
1908} 1905}
1909 1906
1910static void cfq_arm_slice_timer(struct cfq_data *cfqd) 1907static void cfq_arm_slice_timer(struct cfq_data *cfqd)
@@ -2040,7 +2037,7 @@ static int cfqq_process_refs(struct cfq_queue *cfqq)
2040 int process_refs, io_refs; 2037 int process_refs, io_refs;
2041 2038
2042 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; 2039 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
2043 process_refs = atomic_read(&cfqq->ref) - io_refs; 2040 process_refs = cfqq->ref - io_refs;
2044 BUG_ON(process_refs < 0); 2041 BUG_ON(process_refs < 0);
2045 return process_refs; 2042 return process_refs;
2046} 2043}
@@ -2080,10 +2077,10 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
2080 */ 2077 */
2081 if (new_process_refs >= process_refs) { 2078 if (new_process_refs >= process_refs) {
2082 cfqq->new_cfqq = new_cfqq; 2079 cfqq->new_cfqq = new_cfqq;
2083 atomic_add(process_refs, &new_cfqq->ref); 2080 new_cfqq->ref += process_refs;
2084 } else { 2081 } else {
2085 new_cfqq->new_cfqq = cfqq; 2082 new_cfqq->new_cfqq = cfqq;
2086 atomic_add(new_process_refs, &cfqq->ref); 2083 cfqq->ref += new_process_refs;
2087 } 2084 }
2088} 2085}
2089 2086
@@ -2116,12 +2113,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2116 unsigned count; 2113 unsigned count;
2117 struct cfq_rb_root *st; 2114 struct cfq_rb_root *st;
2118 unsigned group_slice; 2115 unsigned group_slice;
2119 2116 enum wl_prio_t original_prio = cfqd->serving_prio;
2120 if (!cfqg) {
2121 cfqd->serving_prio = IDLE_WORKLOAD;
2122 cfqd->workload_expires = jiffies + 1;
2123 return;
2124 }
2125 2117
2126 /* Choose next priority. RT > BE > IDLE */ 2118 /* Choose next priority. RT > BE > IDLE */
2127 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) 2119 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
@@ -2134,6 +2126,9 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2134 return; 2126 return;
2135 } 2127 }
2136 2128
2129 if (original_prio != cfqd->serving_prio)
2130 goto new_workload;
2131
2137 /* 2132 /*
2138 * For RT and BE, we have to choose also the type 2133 * For RT and BE, we have to choose also the type
2139 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload 2134 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
@@ -2148,6 +2143,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2148 if (count && !time_after(jiffies, cfqd->workload_expires)) 2143 if (count && !time_after(jiffies, cfqd->workload_expires))
2149 return; 2144 return;
2150 2145
2146new_workload:
2151 /* otherwise select new workload type */ 2147 /* otherwise select new workload type */
2152 cfqd->serving_type = 2148 cfqd->serving_type =
2153 cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio); 2149 cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
@@ -2199,7 +2195,6 @@ static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
2199 if (RB_EMPTY_ROOT(&st->rb)) 2195 if (RB_EMPTY_ROOT(&st->rb))
2200 return NULL; 2196 return NULL;
2201 cfqg = cfq_rb_first_group(st); 2197 cfqg = cfq_rb_first_group(st);
2202 st->active = &cfqg->rb_node;
2203 update_min_vdisktime(st); 2198 update_min_vdisktime(st);
2204 return cfqg; 2199 return cfqg;
2205} 2200}
@@ -2293,6 +2288,17 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
2293 goto keep_queue; 2288 goto keep_queue;
2294 } 2289 }
2295 2290
2291 /*
2292 * This is a deep seek queue, but the device is much faster than
2293 * the queue can deliver, don't idle
2294 **/
2295 if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
2296 (cfq_cfqq_slice_new(cfqq) ||
2297 (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {
2298 cfq_clear_cfqq_deep(cfqq);
2299 cfq_clear_cfqq_idle_window(cfqq);
2300 }
2301
2296 if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { 2302 if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
2297 cfqq = NULL; 2303 cfqq = NULL;
2298 goto keep_queue; 2304 goto keep_queue;
@@ -2367,12 +2373,12 @@ static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
2367{ 2373{
2368 /* the queue hasn't finished any request, can't estimate */ 2374 /* the queue hasn't finished any request, can't estimate */
2369 if (cfq_cfqq_slice_new(cfqq)) 2375 if (cfq_cfqq_slice_new(cfqq))
2370 return 1; 2376 return true;
2371 if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched, 2377 if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
2372 cfqq->slice_end)) 2378 cfqq->slice_end))
2373 return 1; 2379 return true;
2374 2380
2375 return 0; 2381 return false;
2376} 2382}
2377 2383
2378static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2384static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
@@ -2538,9 +2544,10 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
2538 struct cfq_data *cfqd = cfqq->cfqd; 2544 struct cfq_data *cfqd = cfqq->cfqd;
2539 struct cfq_group *cfqg, *orig_cfqg; 2545 struct cfq_group *cfqg, *orig_cfqg;
2540 2546
2541 BUG_ON(atomic_read(&cfqq->ref) <= 0); 2547 BUG_ON(cfqq->ref <= 0);
2542 2548
2543 if (!atomic_dec_and_test(&cfqq->ref)) 2549 cfqq->ref--;
2550 if (cfqq->ref)
2544 return; 2551 return;
2545 2552
2546 cfq_log_cfqq(cfqd, cfqq, "put_queue"); 2553 cfq_log_cfqq(cfqd, cfqq, "put_queue");
@@ -2843,7 +2850,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2843 RB_CLEAR_NODE(&cfqq->p_node); 2850 RB_CLEAR_NODE(&cfqq->p_node);
2844 INIT_LIST_HEAD(&cfqq->fifo); 2851 INIT_LIST_HEAD(&cfqq->fifo);
2845 2852
2846 atomic_set(&cfqq->ref, 0); 2853 cfqq->ref = 0;
2847 cfqq->cfqd = cfqd; 2854 cfqq->cfqd = cfqd;
2848 2855
2849 cfq_mark_cfqq_prio_changed(cfqq); 2856 cfq_mark_cfqq_prio_changed(cfqq);
@@ -2979,11 +2986,11 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
2979 * pin the queue now that it's allocated, scheduler exit will prune it 2986 * pin the queue now that it's allocated, scheduler exit will prune it
2980 */ 2987 */
2981 if (!is_sync && !(*async_cfqq)) { 2988 if (!is_sync && !(*async_cfqq)) {
2982 atomic_inc(&cfqq->ref); 2989 cfqq->ref++;
2983 *async_cfqq = cfqq; 2990 *async_cfqq = cfqq;
2984 } 2991 }
2985 2992
2986 atomic_inc(&cfqq->ref); 2993 cfqq->ref++;
2987 return cfqq; 2994 return cfqq;
2988} 2995}
2989 2996
@@ -3265,6 +3272,10 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3265 if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) 3272 if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
3266 return true; 3273 return true;
3267 3274
3275 /* An idle queue should not be idle now for some reason */
3276 if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
3277 return true;
3278
3268 if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) 3279 if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
3269 return false; 3280 return false;
3270 3281
@@ -3284,10 +3295,19 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3284 */ 3295 */
3285static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3296static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3286{ 3297{
3298 struct cfq_queue *old_cfqq = cfqd->active_queue;
3299
3287 cfq_log_cfqq(cfqd, cfqq, "preempt"); 3300 cfq_log_cfqq(cfqd, cfqq, "preempt");
3288 cfq_slice_expired(cfqd, 1); 3301 cfq_slice_expired(cfqd, 1);
3289 3302
3290 /* 3303 /*
3304 * workload type is changed, don't save slice, otherwise preempt
3305 * doesn't happen
3306 */
3307 if (cfqq_type(old_cfqq) != cfqq_type(cfqq))
3308 cfqq->cfqg->saved_workload_slice = 0;
3309
3310 /*
3291 * Put the new queue at the front of the of the current list, 3311 * Put the new queue at the front of the of the current list,
3292 * so we know that it will be selected next. 3312 * so we know that it will be selected next.
3293 */ 3313 */
@@ -3412,6 +3432,10 @@ static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3412{ 3432{
3413 struct cfq_io_context *cic = cfqd->active_cic; 3433 struct cfq_io_context *cic = cfqd->active_cic;
3414 3434
3435 /* If the queue already has requests, don't wait */
3436 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3437 return false;
3438
3415 /* If there are other queues in the group, don't wait */ 3439 /* If there are other queues in the group, don't wait */
3416 if (cfqq->cfqg->nr_cfqq > 1) 3440 if (cfqq->cfqg->nr_cfqq > 1)
3417 return false; 3441 return false;
@@ -3681,10 +3705,10 @@ new_queue:
3681 } 3705 }
3682 3706
3683 cfqq->allocated[rw]++; 3707 cfqq->allocated[rw]++;
3684 atomic_inc(&cfqq->ref);
3685 3708
3686 spin_unlock_irqrestore(q->queue_lock, flags); 3709 spin_unlock_irqrestore(q->queue_lock, flags);
3687 3710
3711 cfqq->ref++;
3688 rq->elevator_private[0] = cic; 3712 rq->elevator_private[0] = cic;
3689 rq->elevator_private[1] = cfqq; 3713 rq->elevator_private[1] = cfqq;
3690 rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg); 3714 rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg);
@@ -3862,6 +3886,10 @@ static void *cfq_init_queue(struct request_queue *q)
3862 if (!cfqd) 3886 if (!cfqd)
3863 return NULL; 3887 return NULL;
3864 3888
3889 /*
3890 * Don't need take queue_lock in the routine, since we are
3891 * initializing the ioscheduler, and nobody is using cfqd
3892 */
3865 cfqd->cic_index = i; 3893 cfqd->cic_index = i;
3866 3894
3867 /* Init root service tree */ 3895 /* Init root service tree */
@@ -3881,7 +3909,7 @@ static void *cfq_init_queue(struct request_queue *q)
3881 * Take a reference to root group which we never drop. This is just 3909 * Take a reference to root group which we never drop. This is just
3882 * to make sure that cfq_put_cfqg() does not try to kfree root group 3910 * to make sure that cfq_put_cfqg() does not try to kfree root group
3883 */ 3911 */
3884 atomic_set(&cfqg->ref, 1); 3912 cfqg->ref = 1;
3885 rcu_read_lock(); 3913 rcu_read_lock();
3886 cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, 3914 cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
3887 (void *)cfqd, 0); 3915 (void *)cfqd, 0);
@@ -3901,7 +3929,7 @@ static void *cfq_init_queue(struct request_queue *q)
3901 * will not attempt to free it. 3929 * will not attempt to free it.
3902 */ 3930 */
3903 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); 3931 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
3904 atomic_inc(&cfqd->oom_cfqq.ref); 3932 cfqd->oom_cfqq.ref++;
3905 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); 3933 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
3906 3934
3907 INIT_LIST_HEAD(&cfqd->cic_list); 3935 INIT_LIST_HEAD(&cfqd->cic_list);
diff --git a/block/genhd.c b/block/genhd.c
index 5fa2b44a72ff..6a5b772aa201 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -18,6 +18,7 @@
18#include <linux/buffer_head.h> 18#include <linux/buffer_head.h>
19#include <linux/mutex.h> 19#include <linux/mutex.h>
20#include <linux/idr.h> 20#include <linux/idr.h>
21#include <linux/log2.h>
21 22
22#include "blk.h" 23#include "blk.h"
23 24
@@ -35,6 +36,10 @@ static DEFINE_IDR(ext_devt_idr);
35 36
36static struct device_type disk_type; 37static struct device_type disk_type;
37 38
39static void disk_add_events(struct gendisk *disk);
40static void disk_del_events(struct gendisk *disk);
41static void disk_release_events(struct gendisk *disk);
42
38/** 43/**
39 * disk_get_part - get partition 44 * disk_get_part - get partition
40 * @disk: disk to look partition from 45 * @disk: disk to look partition from
@@ -239,7 +244,7 @@ static struct blk_major_name {
239} *major_names[BLKDEV_MAJOR_HASH_SIZE]; 244} *major_names[BLKDEV_MAJOR_HASH_SIZE];
240 245
241/* index in the above - for now: assume no multimajor ranges */ 246/* index in the above - for now: assume no multimajor ranges */
242static inline int major_to_index(int major) 247static inline int major_to_index(unsigned major)
243{ 248{
244 return major % BLKDEV_MAJOR_HASH_SIZE; 249 return major % BLKDEV_MAJOR_HASH_SIZE;
245} 250}
@@ -502,6 +507,64 @@ static int exact_lock(dev_t devt, void *data)
502 return 0; 507 return 0;
503} 508}
504 509
510void register_disk(struct gendisk *disk)
511{
512 struct device *ddev = disk_to_dev(disk);
513 struct block_device *bdev;
514 struct disk_part_iter piter;
515 struct hd_struct *part;
516 int err;
517
518 ddev->parent = disk->driverfs_dev;
519
520 dev_set_name(ddev, disk->disk_name);
521
522 /* delay uevents, until we scanned partition table */
523 dev_set_uevent_suppress(ddev, 1);
524
525 if (device_add(ddev))
526 return;
527 if (!sysfs_deprecated) {
528 err = sysfs_create_link(block_depr, &ddev->kobj,
529 kobject_name(&ddev->kobj));
530 if (err) {
531 device_del(ddev);
532 return;
533 }
534 }
535 disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
536 disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
537
538 /* No minors to use for partitions */
539 if (!disk_partitionable(disk))
540 goto exit;
541
542 /* No such device (e.g., media were just removed) */
543 if (!get_capacity(disk))
544 goto exit;
545
546 bdev = bdget_disk(disk, 0);
547 if (!bdev)
548 goto exit;
549
550 bdev->bd_invalidated = 1;
551 err = blkdev_get(bdev, FMODE_READ, NULL);
552 if (err < 0)
553 goto exit;
554 blkdev_put(bdev, FMODE_READ);
555
556exit:
557 /* announce disk after possible partitions are created */
558 dev_set_uevent_suppress(ddev, 0);
559 kobject_uevent(&ddev->kobj, KOBJ_ADD);
560
561 /* announce possible partitions */
562 disk_part_iter_init(&piter, disk, 0);
563 while ((part = disk_part_iter_next(&piter)))
564 kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
565 disk_part_iter_exit(&piter);
566}
567
505/** 568/**
506 * add_disk - add partitioning information to kernel list 569 * add_disk - add partitioning information to kernel list
507 * @disk: per-device partitioning information 570 * @disk: per-device partitioning information
@@ -551,18 +614,48 @@ void add_disk(struct gendisk *disk)
551 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, 614 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
552 "bdi"); 615 "bdi");
553 WARN_ON(retval); 616 WARN_ON(retval);
554}
555 617
618 disk_add_events(disk);
619}
556EXPORT_SYMBOL(add_disk); 620EXPORT_SYMBOL(add_disk);
557EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */
558 621
/*
 * del_gendisk - tear down a disk (new side of this hunk; the old side
 * shows the former unlink_gendisk(), whose body is folded in here).
 *
 * Stops event handling first, then invalidates and deletes every
 * partition (iterating in reverse and including empty slots), invalidates
 * partition 0, zeroes the capacity and clears GENHD_FL_UP.  After that the
 * "bdi" link, the backing-dev registration, the queue and the device
 * number region are unregistered, part0 statistics are reset, the
 * holders/slaves kobjects are dropped and the device is removed.
 */
559void unlink_gendisk(struct gendisk *disk) 622void del_gendisk(struct gendisk *disk)
560{ 623{
624 struct disk_part_iter piter;
625 struct hd_struct *part;
626
/* block event checking and remove the events sysfs files before teardown */
627 disk_del_events(disk);
628
629 /* invalidate stuff */
630 disk_part_iter_init(&piter, disk,
631 DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
632 while ((part = disk_part_iter_next(&piter))) {
633 invalidate_partition(disk, part->partno);
634 delete_partition(disk, part->partno);
635 }
636 disk_part_iter_exit(&piter);
637
638 invalidate_partition(disk, 0);
639 blk_free_devt(disk_to_dev(disk)->devt);
640 set_capacity(disk, 0);
641 disk->flags &= ~GENHD_FL_UP;
642
561 sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); 643 sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
562 bdi_unregister(&disk->queue->backing_dev_info); 644 bdi_unregister(&disk->queue->backing_dev_info);
563 blk_unregister_queue(disk); 645 blk_unregister_queue(disk);
564 blk_unregister_region(disk_devt(disk), disk->minors); 646 blk_unregister_region(disk_devt(disk), disk->minors);
647
/* reset whole-disk statistics */
648 part_stat_set_all(&disk->part0, 0);
649 disk->part0.stamp = 0;
650
/* drop the "holders"/"slaves" kobjects and finally remove the device */
651 kobject_put(disk->part0.holder_dir);
652 kobject_put(disk->slave_dir);
653 disk->driverfs_dev = NULL;
654 if (!sysfs_deprecated)
655 sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
656 device_del(disk_to_dev(disk));
565} 657}
658EXPORT_SYMBOL(del_gendisk);
566 659
567/** 660/**
568 * get_gendisk - get partitioning information for a given device 661 * get_gendisk - get partitioning information for a given device
@@ -735,7 +828,7 @@ static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
735 static void *p; 828 static void *p;
736 829
737 p = disk_seqf_start(seqf, pos); 830 p = disk_seqf_start(seqf, pos);
738 if (!IS_ERR(p) && p && !*pos) 831 if (!IS_ERR_OR_NULL(p) && !*pos)
739 seq_puts(seqf, "major minor #blocks name\n\n"); 832 seq_puts(seqf, "major minor #blocks name\n\n");
740 return p; 833 return p;
741} 834}
@@ -1005,6 +1098,7 @@ static void disk_release(struct device *dev)
1005{ 1098{
1006 struct gendisk *disk = dev_to_disk(dev); 1099 struct gendisk *disk = dev_to_disk(dev);
1007 1100
1101 disk_release_events(disk);
1008 kfree(disk->random); 1102 kfree(disk->random);
1009 disk_replace_part_tbl(disk, NULL); 1103 disk_replace_part_tbl(disk, NULL);
1010 free_part_stats(&disk->part0); 1104 free_part_stats(&disk->part0);
@@ -1110,29 +1204,6 @@ static int __init proc_genhd_init(void)
1110module_init(proc_genhd_init); 1204module_init(proc_genhd_init);
1111#endif /* CONFIG_PROC_FS */ 1205#endif /* CONFIG_PROC_FS */
1112 1206
1113static void media_change_notify_thread(struct work_struct *work)
1114{
1115 struct gendisk *gd = container_of(work, struct gendisk, async_notify);
1116 char event[] = "MEDIA_CHANGE=1";
1117 char *envp[] = { event, NULL };
1118
1119 /*
1120 * set enviroment vars to indicate which event this is for
1121 * so that user space will know to go check the media status.
1122 */
1123 kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
1124 put_device(gd->driverfs_dev);
1125}
1126
1127#if 0
1128void genhd_media_change_notify(struct gendisk *disk)
1129{
1130 get_device(disk->driverfs_dev);
1131 schedule_work(&disk->async_notify);
1132}
1133EXPORT_SYMBOL_GPL(genhd_media_change_notify);
1134#endif /* 0 */
1135
1136dev_t blk_lookup_devt(const char *name, int partno) 1207dev_t blk_lookup_devt(const char *name, int partno)
1137{ 1208{
1138 dev_t devt = MKDEV(0, 0); 1209 dev_t devt = MKDEV(0, 0);
@@ -1193,13 +1264,13 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
1193 } 1264 }
1194 disk->part_tbl->part[0] = &disk->part0; 1265 disk->part_tbl->part[0] = &disk->part0;
1195 1266
1267 hd_ref_init(&disk->part0);
1268
1196 disk->minors = minors; 1269 disk->minors = minors;
1197 rand_initialize_disk(disk); 1270 rand_initialize_disk(disk);
1198 disk_to_dev(disk)->class = &block_class; 1271 disk_to_dev(disk)->class = &block_class;
1199 disk_to_dev(disk)->type = &disk_type; 1272 disk_to_dev(disk)->type = &disk_type;
1200 device_initialize(disk_to_dev(disk)); 1273 device_initialize(disk_to_dev(disk));
1201 INIT_WORK(&disk->async_notify,
1202 media_change_notify_thread);
1203 } 1274 }
1204 return disk; 1275 return disk;
1205} 1276}
@@ -1291,3 +1362,422 @@ int invalidate_partition(struct gendisk *disk, int partno)
1291} 1362}
1292 1363
1293EXPORT_SYMBOL(invalidate_partition); 1364EXPORT_SYMBOL(invalidate_partition);
1365
1366/*
1367 * Disk events - monitor disk events like media change and eject request.
1368 */
1369struct disk_events {
1370 struct list_head node; /* all disk_event's */
1371 struct gendisk *disk; /* the associated disk */
1372 spinlock_t lock;
1373
1374 int block; /* event blocking depth */
1375 unsigned int pending; /* events already sent out */
1376 unsigned int clearing; /* events being cleared */
1377
1378 long poll_msecs; /* interval, -1 for default */
1379 struct delayed_work dwork;
1380};
1381
1382static const char *disk_events_strs[] = {
1383 [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change",
1384 [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request",
1385};
1386
1387static char *disk_uevents[] = {
1388 [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1",
1389 [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1",
1390};
1391
1392/* list of all disk_events */
1393static DEFINE_MUTEX(disk_events_mutex);
1394static LIST_HEAD(disk_events);
1395
1396/* disable in-kernel polling by default */
1397static unsigned long disk_events_dfl_poll_msecs = 0;
1398
1399static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
1400{
1401 struct disk_events *ev = disk->ev;
1402 long intv_msecs = 0;
1403
1404 /*
1405 * If device-specific poll interval is set, always use it. If
1406 * the default is being used, poll iff there are events which
1407 * can't be monitored asynchronously.
1408 */
1409 if (ev->poll_msecs >= 0)
1410 intv_msecs = ev->poll_msecs;
1411 else if (disk->events & ~disk->async_events)
1412 intv_msecs = disk_events_dfl_poll_msecs;
1413
1414 return msecs_to_jiffies(intv_msecs);
1415}
1416
1417static void __disk_block_events(struct gendisk *disk, bool sync)
1418{
1419 struct disk_events *ev = disk->ev;
1420 unsigned long flags;
1421 bool cancel;
1422
1423 spin_lock_irqsave(&ev->lock, flags);
1424 cancel = !ev->block++;
1425 spin_unlock_irqrestore(&ev->lock, flags);
1426
1427 if (cancel) {
1428 if (sync)
1429 cancel_delayed_work_sync(&disk->ev->dwork);
1430 else
1431 cancel_delayed_work(&disk->ev->dwork);
1432 }
1433}
1434
1435static void __disk_unblock_events(struct gendisk *disk, bool check_now)
1436{
1437 struct disk_events *ev = disk->ev;
1438 unsigned long intv;
1439 unsigned long flags;
1440
1441 spin_lock_irqsave(&ev->lock, flags);
1442
1443 if (WARN_ON_ONCE(ev->block <= 0))
1444 goto out_unlock;
1445
1446 if (--ev->block)
1447 goto out_unlock;
1448
1449 /*
1450 * Not exactly a latency critical operation, set poll timer
1451 * slack to 25% and kick event check.
1452 */
1453 intv = disk_events_poll_jiffies(disk);
1454 set_timer_slack(&ev->dwork.timer, intv / 4);
1455 if (check_now)
1456 queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
1457 else if (intv)
1458 queue_delayed_work(system_nrt_wq, &ev->dwork, intv);
1459out_unlock:
1460 spin_unlock_irqrestore(&ev->lock, flags);
1461}
1462
1463/**
1464 * disk_block_events - block and flush disk event checking
1465 * @disk: disk to block events for
1466 *
1467 * On return from this function, it is guaranteed that event checking
1468 * isn't in progress and won't happen until unblocked by
1469 * disk_unblock_events(). Events blocking is counted and the actual
1470 * unblocking happens after the matching number of unblocks are done.
1471 *
1472 * Note that this intentionally does not block event checking from
1473 * disk_clear_events().
1474 *
1475 * CONTEXT:
1476 * Might sleep.
1477 */
1478void disk_block_events(struct gendisk *disk)
1479{
1480 if (disk->ev)
1481 __disk_block_events(disk, true);
1482}
1483
1484/**
1485 * disk_unblock_events - unblock disk event checking
1486 * @disk: disk to unblock events for
1487 *
1488 * Undo disk_block_events(). When the block count reaches zero, it
1489 * starts events polling if configured.
1490 *
1491 * CONTEXT:
1492 * Don't care. Safe to call from irq context.
1493 */
1494void disk_unblock_events(struct gendisk *disk)
1495{
1496 if (disk->ev)
1497 __disk_unblock_events(disk, true);
1498}
1499
1500/**
1501 * disk_check_events - schedule immediate event checking
1502 * @disk: disk to check events for
1503 *
1504 * Schedule immediate event checking on @disk if not blocked.
1505 *
1506 * CONTEXT:
1507 * Don't care. Safe to call from irq context.
1508 */
1509void disk_check_events(struct gendisk *disk)
1510{
1511 if (disk->ev) {
1512 __disk_block_events(disk, false);
1513 __disk_unblock_events(disk, true);
1514 }
1515}
1516EXPORT_SYMBOL_GPL(disk_check_events);
1517
1518/**
1519 * disk_clear_events - synchronously check, clear and return pending events
1520 * @disk: disk to fetch and clear events from
1521 * @mask: mask of events to be fetched and clearted
1522 *
1523 * Disk events are synchronously checked and pending events in @mask
1524 * are cleared and returned. This ignores the block count.
1525 *
1526 * CONTEXT:
1527 * Might sleep.
1528 */
1529unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
1530{
1531 const struct block_device_operations *bdops = disk->fops;
1532 struct disk_events *ev = disk->ev;
1533 unsigned int pending;
1534
1535 if (!ev) {
1536 /* for drivers still using the old ->media_changed method */
1537 if ((mask & DISK_EVENT_MEDIA_CHANGE) &&
1538 bdops->media_changed && bdops->media_changed(disk))
1539 return DISK_EVENT_MEDIA_CHANGE;
1540 return 0;
1541 }
1542
1543 /* tell the workfn about the events being cleared */
1544 spin_lock_irq(&ev->lock);
1545 ev->clearing |= mask;
1546 spin_unlock_irq(&ev->lock);
1547
1548 /* uncondtionally schedule event check and wait for it to finish */
1549 __disk_block_events(disk, true);
1550 queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
1551 flush_delayed_work(&ev->dwork);
1552 __disk_unblock_events(disk, false);
1553
1554 /* then, fetch and clear pending events */
1555 spin_lock_irq(&ev->lock);
1556 WARN_ON_ONCE(ev->clearing & mask); /* cleared by workfn */
1557 pending = ev->pending & mask;
1558 ev->pending &= ~mask;
1559 spin_unlock_irq(&ev->lock);
1560
1561 return pending;
1562}
1563
1564static void disk_events_workfn(struct work_struct *work)
1565{
1566 struct delayed_work *dwork = to_delayed_work(work);
1567 struct disk_events *ev = container_of(dwork, struct disk_events, dwork);
1568 struct gendisk *disk = ev->disk;
1569 char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
1570 unsigned int clearing = ev->clearing;
1571 unsigned int events;
1572 unsigned long intv;
1573 int nr_events = 0, i;
1574
1575 /* check events */
1576 events = disk->fops->check_events(disk, clearing);
1577
1578 /* accumulate pending events and schedule next poll if necessary */
1579 spin_lock_irq(&ev->lock);
1580
1581 events &= ~ev->pending;
1582 ev->pending |= events;
1583 ev->clearing &= ~clearing;
1584
1585 intv = disk_events_poll_jiffies(disk);
1586 if (!ev->block && intv)
1587 queue_delayed_work(system_nrt_wq, &ev->dwork, intv);
1588
1589 spin_unlock_irq(&ev->lock);
1590
1591 /* tell userland about new events */
1592 for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
1593 if (events & (1 << i))
1594 envp[nr_events++] = disk_uevents[i];
1595
1596 if (nr_events)
1597 kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
1598}
1599
1600/*
1601 * A disk events enabled device has the following sysfs nodes under
1602 * its /sys/block/X/ directory.
1603 *
1604 * events : list of all supported events
1605 * events_async : list of events which can be detected w/o polling
1606 * events_poll_msecs : polling interval, 0: disable, -1: system default
1607 */
1608static ssize_t __disk_events_show(unsigned int events, char *buf)
1609{
1610 const char *delim = "";
1611 ssize_t pos = 0;
1612 int i;
1613
1614 for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++)
1615 if (events & (1 << i)) {
1616 pos += sprintf(buf + pos, "%s%s",
1617 delim, disk_events_strs[i]);
1618 delim = " ";
1619 }
1620 if (pos)
1621 pos += sprintf(buf + pos, "\n");
1622 return pos;
1623}
1624
1625static ssize_t disk_events_show(struct device *dev,
1626 struct device_attribute *attr, char *buf)
1627{
1628 struct gendisk *disk = dev_to_disk(dev);
1629
1630 return __disk_events_show(disk->events, buf);
1631}
1632
1633static ssize_t disk_events_async_show(struct device *dev,
1634 struct device_attribute *attr, char *buf)
1635{
1636 struct gendisk *disk = dev_to_disk(dev);
1637
1638 return __disk_events_show(disk->async_events, buf);
1639}
1640
1641static ssize_t disk_events_poll_msecs_show(struct device *dev,
1642 struct device_attribute *attr,
1643 char *buf)
1644{
1645 struct gendisk *disk = dev_to_disk(dev);
1646
1647 return sprintf(buf, "%ld\n", disk->ev->poll_msecs);
1648}
1649
1650static ssize_t disk_events_poll_msecs_store(struct device *dev,
1651 struct device_attribute *attr,
1652 const char *buf, size_t count)
1653{
1654 struct gendisk *disk = dev_to_disk(dev);
1655 long intv;
1656
1657 if (!count || !sscanf(buf, "%ld", &intv))
1658 return -EINVAL;
1659
1660 if (intv < 0 && intv != -1)
1661 return -EINVAL;
1662
1663 __disk_block_events(disk, true);
1664 disk->ev->poll_msecs = intv;
1665 __disk_unblock_events(disk, true);
1666
1667 return count;
1668}
1669
1670static const DEVICE_ATTR(events, S_IRUGO, disk_events_show, NULL);
1671static const DEVICE_ATTR(events_async, S_IRUGO, disk_events_async_show, NULL);
1672static const DEVICE_ATTR(events_poll_msecs, S_IRUGO|S_IWUSR,
1673 disk_events_poll_msecs_show,
1674 disk_events_poll_msecs_store);
1675
1676static const struct attribute *disk_events_attrs[] = {
1677 &dev_attr_events.attr,
1678 &dev_attr_events_async.attr,
1679 &dev_attr_events_poll_msecs.attr,
1680 NULL,
1681};
1682
1683/*
1684 * The default polling interval can be specified by the kernel
1685 * parameter block.events_dfl_poll_msecs which defaults to 0
1686 * (disable). This can also be modified runtime by writing to
1687 * /sys/module/block/events_dfl_poll_msecs.
1688 */
1689static int disk_events_set_dfl_poll_msecs(const char *val,
1690 const struct kernel_param *kp)
1691{
1692 struct disk_events *ev;
1693 int ret;
1694
1695 ret = param_set_ulong(val, kp);
1696 if (ret < 0)
1697 return ret;
1698
1699 mutex_lock(&disk_events_mutex);
1700
1701 list_for_each_entry(ev, &disk_events, node)
1702 disk_check_events(ev->disk);
1703
1704 mutex_unlock(&disk_events_mutex);
1705
1706 return 0;
1707}
1708
1709static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = {
1710 .set = disk_events_set_dfl_poll_msecs,
1711 .get = param_get_ulong,
1712};
1713
1714#undef MODULE_PARAM_PREFIX
1715#define MODULE_PARAM_PREFIX "block."
1716
1717module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
1718 &disk_events_dfl_poll_msecs, 0644);
1719
1720/*
1721 * disk_{add|del|release}_events - initialize and destroy disk_events.
1722 */
1723static void disk_add_events(struct gendisk *disk)
1724{
1725 struct disk_events *ev;
1726
1727 if (!disk->fops->check_events || !(disk->events | disk->async_events))
1728 return;
1729
1730 ev = kzalloc(sizeof(*ev), GFP_KERNEL);
1731 if (!ev) {
1732 pr_warn("%s: failed to initialize events\n", disk->disk_name);
1733 return;
1734 }
1735
1736 if (sysfs_create_files(&disk_to_dev(disk)->kobj,
1737 disk_events_attrs) < 0) {
1738 pr_warn("%s: failed to create sysfs files for events\n",
1739 disk->disk_name);
1740 kfree(ev);
1741 return;
1742 }
1743
1744 disk->ev = ev;
1745
1746 INIT_LIST_HEAD(&ev->node);
1747 ev->disk = disk;
1748 spin_lock_init(&ev->lock);
1749 ev->block = 1;
1750 ev->poll_msecs = -1;
1751 INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);
1752
1753 mutex_lock(&disk_events_mutex);
1754 list_add_tail(&ev->node, &disk_events);
1755 mutex_unlock(&disk_events_mutex);
1756
1757 /*
1758 * Block count is initialized to 1 and the following initial
1759 * unblock kicks it into action.
1760 */
1761 __disk_unblock_events(disk, true);
1762}
1763
1764static void disk_del_events(struct gendisk *disk)
1765{
1766 if (!disk->ev)
1767 return;
1768
1769 __disk_block_events(disk, true);
1770
1771 mutex_lock(&disk_events_mutex);
1772 list_del_init(&disk->ev->node);
1773 mutex_unlock(&disk_events_mutex);
1774
1775 sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs);
1776}
1777
1778static void disk_release_events(struct gendisk *disk)
1779{
1780 /* the block count should be 1 from disk_del_events() */
1781 WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
1782 kfree(disk->ev);
1783}
diff --git a/block/ioctl.c b/block/ioctl.c
index a9a302eba01e..9049d460fa89 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -294,11 +294,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
294 return -EINVAL; 294 return -EINVAL;
295 if (get_user(n, (int __user *) arg)) 295 if (get_user(n, (int __user *) arg))
296 return -EFAULT; 296 return -EFAULT;
297 if (!(mode & FMODE_EXCL) && bd_claim(bdev, &bdev) < 0) 297 if (!(mode & FMODE_EXCL) &&
298 blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0)
298 return -EBUSY; 299 return -EBUSY;
299 ret = set_blocksize(bdev, n); 300 ret = set_blocksize(bdev, n);
300 if (!(mode & FMODE_EXCL)) 301 if (!(mode & FMODE_EXCL))
301 bd_release(bdev); 302 blkdev_put(bdev, mode | FMODE_EXCL);
302 return ret; 303 return ret;
303 case BLKPG: 304 case BLKPG:
304 ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg); 305 ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg);