Diffstat (limited to 'block')

 -rw-r--r--  block/Kconfig        |    2
 -rw-r--r--  block/blk-cgroup.c   |    4
 -rw-r--r--  block/blk-core.c     |   40
 -rw-r--r--  block/blk-ioc.c      |    5
 -rw-r--r--  block/blk-map.c      |    5
 -rw-r--r--  block/blk-merge.c    |    9
 -rw-r--r--  block/blk-settings.c |   51
 -rw-r--r--  block/blk-sysfs.c    |    2
 -rw-r--r--  block/blk-throttle.c |   41
 -rw-r--r--  block/bsg.c          |    8
 -rw-r--r--  block/cfq-iosched.c  |  142
 -rw-r--r--  block/genhd.c        |  550
 -rw-r--r--  block/ioctl.c        |    5

13 files changed, 703 insertions, 161 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 6c9213ef15a1..60be1e0455da 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -2,7 +2,7 @@
 # Block layer core configuration
 #
 menuconfig BLOCK
-	bool "Enable the block layer" if EMBEDDED
+	bool "Enable the block layer" if EXPERT
 	default y
 	help
 	 Provide block layer support for the kernel.
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b1febd0f6d2a..455768a3eb9e 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1452,10 +1452,6 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 		goto done;
 	}
 
-	/* Currently we do not support hierarchy deeper than two level (0,1) */
-	if (parent != cgroup->top_cgroup)
-		return ERR_PTR(-EPERM);
-
 	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
 	if (!blkcg)
 		return ERR_PTR(-ENOMEM);
diff --git a/block/blk-core.c b/block/blk-core.c
index 4ce953f1b390..2f4002f79a24 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -33,7 +33,7 @@
 
 #include "blk.h"
 
-EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
 
@@ -64,13 +64,27 @@ static void drive_stat_acct(struct request *rq, int new_io)
 		return;
 
 	cpu = part_stat_lock();
-	part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
 
-	if (!new_io)
+	if (!new_io) {
+		part = rq->part;
 		part_stat_inc(cpu, part, merges[rw]);
-	else {
+	} else {
+		part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
+		if (!hd_struct_try_get(part)) {
+			/*
+			 * The partition is already being removed,
+			 * the request will be accounted on the disk only
+			 *
+			 * We take a reference on disk->part0 although that
+			 * partition will never be deleted, so we can treat
+			 * it as any other partition.
+			 */
+			part = &rq->rq_disk->part0;
+			hd_struct_get(part);
+		}
 		part_round_stats(cpu, part);
 		part_inc_in_flight(part, rw);
+		rq->part = part;
 	}
 
 	part_stat_unlock();
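
The drive_stat_acct() hunk closes a race between I/O accounting and partition removal: the partition is now looked up once at submission, pinned with hd_struct_try_get(), cached in rq->part, and dropped at completion, with a fallback to the whole-disk part0 when the partition is already going away. A minimal user-space sketch of the try-get-with-fallback idiom it relies on (illustrative only, not the kernel's hd_struct API; real hd_struct refcounts are atomic):

    struct part {
            int ref;                /* 0 means teardown has begun */
    };

    /* Refuse to resurrect an object whose count already hit zero. */
    static int part_try_get(struct part *p)
    {
            if (p->ref == 0)
                    return 0;
            p->ref++;
            return 1;
    }

    /* Pin the mapped partition, or fall back to the always-present part0. */
    static struct part *pick_account_target(struct part *mapped, struct part *part0)
    {
            if (part_try_get(mapped))
                    return mapped;
            part0->ref++;           /* part0 is never deleted, a plain get is fine */
            return part0;
    }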
@@ -128,6 +142,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	rq->ref_count = 1;
 	rq->start_time = jiffies;
 	set_start_time_ns(rq);
+	rq->part = NULL;
 }
 EXPORT_SYMBOL(blk_rq_init);
 
@@ -1329,9 +1344,9 @@ static inline void blk_partition_remap(struct bio *bio)
 		bio->bi_sector += p->start_sect;
 		bio->bi_bdev = bdev->bd_contains;
 
-		trace_block_remap(bdev_get_queue(bio->bi_bdev), bio,
-				    bdev->bd_dev,
-				    bio->bi_sector - p->start_sect);
+		trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio,
+				      bdev->bd_dev,
+				      bio->bi_sector - p->start_sect);
 	}
 }
 
@@ -1500,7 +1515,7 @@ static inline void __generic_make_request(struct bio *bio)
 			goto end_io;
 
 		if (old_sector != -1)
-			trace_block_remap(q, bio, old_dev, old_sector);
+			trace_block_bio_remap(q, bio, old_dev, old_sector);
 
 		old_sector = bio->bi_sector;
 		old_dev = bio->bi_bdev->bd_dev;
@@ -1776,7 +1791,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
 		int cpu;
 
 		cpu = part_stat_lock();
-		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+		part = req->part;
 		part_stat_add(cpu, part, sectors[rw], bytes >> 9);
 		part_stat_unlock();
 	}
@@ -1796,13 +1811,14 @@ static void blk_account_io_done(struct request *req)
 		int cpu;
 
 		cpu = part_stat_lock();
-		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+		part = req->part;
 
 		part_stat_inc(cpu, part, ios[rw]);
 		part_stat_add(cpu, part, ticks[rw], duration);
 		part_round_stats(cpu, part);
 		part_dec_in_flight(part, rw);
 
+		hd_struct_put(part);
 		part_stat_unlock();
 	}
 }
@@ -2606,7 +2622,9 @@ int __init blk_dev_init(void)
 	BUILD_BUG_ON(__REQ_NR_BITS > 8 *
 			sizeof(((struct request *)0)->cmd_flags));
 
-	kblockd_workqueue = create_workqueue("kblockd");
+	/* used for unplugging and affects IO latency/throughput - HIGHPRI */
+	kblockd_workqueue = alloc_workqueue("kblockd",
+					    WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
 	if (!kblockd_workqueue)
 		panic("Failed to create kblockd\n");
 
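
alloc_workqueue() supersedes create_workqueue() and lets the caller pass flags: WQ_MEM_RECLAIM gives the queue a rescuer thread so it can make forward progress under memory pressure, and WQ_HIGHPRI queues its work items at elevated priority, which matters here because kblockd drives unplugging. A driver wanting the same guarantees could do something like the following (my_wq and my_setup are hypothetical names):

    #include <linux/workqueue.h>

    static struct workqueue_struct *my_wq;      /* hypothetical example queue */

    static int my_setup(void)
    {
            /* max_active of 0 selects the default concurrency limit */
            my_wq = alloc_workqueue("my_wq", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
            return my_wq ? 0 : -ENOMEM;
    }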
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 3c7a339fe381..b791022beef3 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -64,7 +64,7 @@ static void cfq_exit(struct io_context *ioc)
 	rcu_read_unlock();
 }
 
-/* Called by the exitting task */
+/* Called by the exiting task */
 void exit_io_context(struct task_struct *task)
 {
 	struct io_context *ioc;
@@ -74,10 +74,9 @@ void exit_io_context(struct task_struct *task)
 	task->io_context = NULL;
 	task_unlock(task);
 
-	if (atomic_dec_and_test(&ioc->nr_tasks)) {
+	if (atomic_dec_and_test(&ioc->nr_tasks))
 		cfq_exit(ioc);
 
-	}
 	put_io_context(ioc);
 }
 
diff --git a/block/blk-map.c b/block/blk-map.c
index 5d5dbe47c228..e663ac2d8e68 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -201,12 +201,13 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 	for (i = 0; i < iov_count; i++) {
 		unsigned long uaddr = (unsigned long)iov[i].iov_base;
 
+		if (!iov[i].iov_len)
+			return -EINVAL;
+
 		if (uaddr & queue_dma_alignment(q)) {
 			unaligned = 1;
 			break;
 		}
-		if (!iov[i].iov_len)
-			return -EINVAL;
 	}
 
 	if (unaligned || (q->dma_pad_mask & len) || map_data)
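
The reorder matters because the old loop could leave via the alignment `break` before the length check ever ran. A worked case (hypothetical values) showing an entry that is both misaligned and zero-length:

    iov[0] = { .iov_base = (void *)0x1001, .iov_len = 0 };

    /* old order: uaddr is unaligned -> unaligned = 1, break; the zero
     * length is never rejected and the request goes down the copy path.
     * new order: the iov_len test runs first and returns -EINVAL. */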
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 77b7c26df6b5..ea85e20d5e94 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -21,7 +21,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 		return 0;
 
 	fbio = bio;
-	cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
+	cluster = blk_queue_cluster(q);
 	seg_size = 0;
 	nr_phys_segs = 0;
 	for_each_bio(bio) {
@@ -87,7 +87,7 @@ EXPORT_SYMBOL(blk_recount_segments);
 static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
 				   struct bio *nxt)
 {
-	if (!test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags))
+	if (!blk_queue_cluster(q))
 		return 0;
 
 	if (bio->bi_seg_back_size + nxt->bi_seg_front_size >
@@ -123,7 +123,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
 	int nsegs, cluster;
 
 	nsegs = 0;
-	cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
+	cluster = blk_queue_cluster(q);
 
 	/*
 	 * for each bio in rq
@@ -351,11 +351,12 @@ static void blk_account_io_merge(struct request *req)
 		int cpu;
 
 		cpu = part_stat_lock();
-		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+		part = req->part;
 
 		part_round_stats(cpu, part);
 		part_dec_in_flight(part, rq_data_dir(req));
 
+		hd_struct_put(part);
 		part_stat_unlock();
 	}
 }
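
These call sites switch from testing QUEUE_FLAG_CLUSTER to a helper; the helper itself lands in the include/linux/blkdev.h half of this series (not included in this section) and, per the blk-settings.c changes below that move clustering into struct queue_limits, presumably reduces to reading the new field:

    static inline unsigned int blk_queue_cluster(struct request_queue *q)
    {
            return q->limits.cluster;
    }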
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 701859fb9647..36c8c1f2af18 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -126,7 +126,7 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->alignment_offset = 0;
 	lim->io_opt = 0;
 	lim->misaligned = 0;
-	lim->no_cluster = 0;
+	lim->cluster = 1;
 }
 EXPORT_SYMBOL(blk_set_default_limits);
 
@@ -229,8 +229,8 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask)
 EXPORT_SYMBOL(blk_queue_bounce_limit);
 
 /**
- * blk_queue_max_hw_sectors - set max sectors for a request for this queue
- * @q:  the request queue for the device
+ * blk_limits_max_hw_sectors - set hard and soft limit of max sectors for request
+ * @limits: the queue limits
  * @max_hw_sectors:  max hardware sectors in the usual 512b unit
  *
  * Description:
@@ -244,7 +244,7 @@ EXPORT_SYMBOL(blk_queue_bounce_limit);
  *    per-device basis in /sys/block/<device>/queue/max_sectors_kb.
  *    The soft limit can not exceed max_hw_sectors.
 **/
-void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors)
+void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_sectors)
 {
 	if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) {
 		max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
@@ -252,9 +252,23 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto
 		       __func__, max_hw_sectors);
 	}
 
-	q->limits.max_hw_sectors = max_hw_sectors;
-	q->limits.max_sectors = min_t(unsigned int, max_hw_sectors,
-				      BLK_DEF_MAX_SECTORS);
+	limits->max_hw_sectors = max_hw_sectors;
+	limits->max_sectors = min_t(unsigned int, max_hw_sectors,
+				    BLK_DEF_MAX_SECTORS);
+}
+EXPORT_SYMBOL(blk_limits_max_hw_sectors);
+
+/**
+ * blk_queue_max_hw_sectors - set max sectors for a request for this queue
+ * @q:  the request queue for the device
+ * @max_hw_sectors:  max hardware sectors in the usual 512b unit
+ *
+ * Description:
+ *    See description for blk_limits_max_hw_sectors().
+ **/
+void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors)
+{
+	blk_limits_max_hw_sectors(&q->limits, max_hw_sectors);
 }
 EXPORT_SYMBOL(blk_queue_max_hw_sectors);
 
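
Splitting the setter out lets code that holds only a struct queue_limits, with no request_queue yet, apply the same PAGE_CACHE_SIZE clamping and BLK_DEF_MAX_SECTORS soft cap; stacking drivers such as device-mapper, which assemble limits before a queue exists, are presumably the intended consumers. A hypothetical caller:

    /* Hypothetical stacking-driver snippet: size limits before a queue exists. */
    struct queue_limits limits;

    blk_set_default_limits(&limits);
    blk_limits_max_hw_sectors(&limits, 255);    /* also caps the soft limit */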
@@ -464,15 +478,6 @@ EXPORT_SYMBOL(blk_queue_io_opt);
 void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
 {
 	blk_stack_limits(&t->limits, &b->limits, 0);
-
-	if (!t->queue_lock)
-		WARN_ON_ONCE(1);
-	else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) {
-		unsigned long flags;
-		spin_lock_irqsave(t->queue_lock, flags);
-		queue_flag_clear(QUEUE_FLAG_CLUSTER, t);
-		spin_unlock_irqrestore(t->queue_lock, flags);
-	}
 }
 EXPORT_SYMBOL(blk_queue_stack_limits);
 
@@ -545,7 +550,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 	t->io_min = max(t->io_min, b->io_min);
 	t->io_opt = lcm(t->io_opt, b->io_opt);
 
-	t->no_cluster |= b->no_cluster;
+	t->cluster &= b->cluster;
 	t->discard_zeroes_data &= b->discard_zeroes_data;
 
 	/* Physical block size a multiple of the logical block size? */
@@ -641,7 +646,6 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
 		       sector_t offset)
 {
 	struct request_queue *t = disk->queue;
-	struct request_queue *b = bdev_get_queue(bdev);
 
 	if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) {
 		char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE];
@@ -652,17 +656,6 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
 		printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n",
 		       top, bottom);
 	}
-
-	if (!t->queue_lock)
-		WARN_ON_ONCE(1);
-	else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) {
-		unsigned long flags;
-
-		spin_lock_irqsave(t->queue_lock, flags);
-		if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags))
-			queue_flag_clear(QUEUE_FLAG_CLUSTER, t);
-		spin_unlock_irqrestore(t->queue_lock, flags);
-	}
 }
 EXPORT_SYMBOL(disk_stack_limits);
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 013457f47fdc..41fb69150b4d 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -119,7 +119,7 @@ static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *
 
 static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page)
 {
-	if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags))
+	if (blk_queue_cluster(q))
 		return queue_var_show(queue_max_segment_size(q), (page));
 
 	return queue_var_show(PAGE_CACHE_SIZE, (page));
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 56ad4531b412..381b09bb562b 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -355,6 +355,12 @@ throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 			tg->slice_end[rw], jiffies);
 }
 
+static inline void throtl_set_slice_end(struct throtl_data *td,
+		struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
+{
+	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
+}
+
 static inline void throtl_extend_slice(struct throtl_data *td,
 		struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
 {
@@ -391,6 +397,16 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 	if (throtl_slice_used(td, tg, rw))
 		return;
 
+	/*
+	 * A bio has been dispatched. Also adjust slice_end. It might happen
+	 * that initially cgroup limit was very low resulting in high
+	 * slice_end, but later limit was bumped up and bio was dispatched
+	 * sooner, then we need to reduce slice_end. A high bogus slice_end
+	 * is bad because it does not allow new slice to start.
+	 */
+
+	throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice);
+
 	time_elapsed = jiffies - tg->slice_start[rw];
 
 	nr_slices = time_elapsed / throtl_slice;
@@ -645,7 +661,7 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
 {
 	unsigned int nr_reads = 0, nr_writes = 0;
 	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
-	unsigned int max_nr_writes = throtl_grp_quantum - nr_reads;
+	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
 	struct bio *bio;
 
 	/* Try to dispatch 75% READS and 25% WRITES */
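
The max_nr_writes line was a genuine bug: nr_reads is still 0 at the point the initializers run, so the write budget came out as the whole quantum. Assuming the default throtl_grp_quantum of 8 defined in this file, the arithmetic works out as:

    /* max_nr_reads  = 8 * 3 / 4                = 6
     * old: max_nr_writes = 8 - nr_reads (0)    = 8  (reads + writes overcommit the quantum)
     * new: max_nr_writes = 8 - max_nr_reads    = 2  (6 + 2 == 8, the intended 75/25 split)
     */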
@@ -709,26 +725,21 @@ static void throtl_process_limit_change(struct throtl_data *td)
 	struct throtl_grp *tg;
 	struct hlist_node *pos, *n;
 
-	/*
-	 * Make sure atomic_inc() effects from
-	 * throtl_update_blkio_group_read_bps(), group of functions are
-	 * visible.
-	 * Is this required or smp_mb__after_atomic_inc() was suffcient
-	 * after the atomic_inc().
-	 */
-	smp_rmb();
 	if (!atomic_read(&td->limits_changed))
 		return;
 
 	throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed));
 
-	hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
-		/*
-		 * Do I need an smp_rmb() here to make sure tg->limits_changed
-		 * update is visible. I am relying on smp_rmb() at the
-		 * beginning of function and not putting a new one here.
-		 */
+	/*
+	 * Make sure updates from throtl_update_blkio_group_read_bps() group
+	 * of functions to tg->limits_changed are visible. We do not
+	 * want update td->limits_changed to be visible but update to
+	 * tg->limits_changed not being visible yet on this cpu. Hence
+	 * the read barrier.
+	 */
+	smp_rmb();
 
+	hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
 		if (throtl_tg_on_rr(tg) && tg->limits_changed) {
 			throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
 				" riops=%u wiops=%u", tg->bps[READ],
diff --git a/block/bsg.c b/block/bsg.c
index f20d6a789d48..0c8b64a16484 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -250,6 +250,14 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm,
 	int ret, rw;
 	unsigned int dxfer_len;
 	void *dxferp = NULL;
+	struct bsg_class_device *bcd = &q->bsg_dev;
+
+	/* if the LLD has been removed then the bsg_unregister_queue will
+	 * eventually be called and the class_dev was freed, so we can no
+	 * longer use this request_queue. Return no such address.
+	 */
+	if (!bcd->class_dev)
+		return ERR_PTR(-ENXIO);
 
 	dprintk("map hdr %llx/%u %llx/%u\n", (unsigned long long) hdr->dout_xferp,
 		hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp,
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 4cd59b0d7c15..501ffdf0399c 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -87,7 +87,6 @@ struct cfq_rb_root {
 	unsigned count;
 	unsigned total_weight;
 	u64 min_vdisktime;
-	struct rb_node *active;
 };
 #define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
 			.count = 0, .min_vdisktime = 0, }
@@ -97,7 +96,7 @@ struct cfq_rb_root {
  */
 struct cfq_queue {
 	/* reference count */
-	atomic_t ref;
+	int ref;
 	/* various state flags, see below */
 	unsigned int flags;
 	/* parent cfq_data */
@@ -180,7 +179,6 @@ struct cfq_group {
 	/* group service_tree key */
 	u64 vdisktime;
 	unsigned int weight;
-	bool on_st;
 
 	/* number of cfqq currently on this group */
 	int nr_cfqq;
@@ -209,7 +207,7 @@ struct cfq_group {
 	struct blkio_group blkg;
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 	struct hlist_node cfqd_node;
-	atomic_t ref;
+	int ref;
 #endif
 	/* number of requests that are on the dispatch list or inside driver */
 	int dispatched;
@@ -563,11 +561,6 @@ static void update_min_vdisktime(struct cfq_rb_root *st)
 	u64 vdisktime = st->min_vdisktime;
 	struct cfq_group *cfqg;
 
-	if (st->active) {
-		cfqg = rb_entry_cfqg(st->active);
-		vdisktime = cfqg->vdisktime;
-	}
-
 	if (st->left) {
 		cfqg = rb_entry_cfqg(st->left);
 		vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
@@ -605,8 +598,8 @@ cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	return cfq_target_latency * cfqg->weight / st->total_weight;
 }
 
-static inline void
-cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+static inline unsigned
+cfq_scaled_group_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
 	if (cfqd->cfq_latency) {
@@ -632,6 +625,14 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 					low_slice);
 		}
 	}
+	return slice;
+}
+
+static inline void
+cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+	unsigned slice = cfq_scaled_group_slice(cfqd, cfqq);
+
 	cfqq->slice_start = jiffies;
 	cfqq->slice_end = jiffies + slice;
 	cfqq->allocated_slice = slice;
@@ -646,11 +647,11 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 static inline bool cfq_slice_used(struct cfq_queue *cfqq)
 {
 	if (cfq_cfqq_slice_new(cfqq))
-		return 0;
+		return false;
 	if (time_before(jiffies, cfqq->slice_end))
-		return 0;
+		return false;
 
-	return 1;
+	return true;
 }
 
 /*
@@ -869,7 +870,7 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	struct rb_node *n;
 
 	cfqg->nr_cfqq++;
-	if (cfqg->on_st)
+	if (!RB_EMPTY_NODE(&cfqg->rb_node))
 		return;
 
 	/*
@@ -885,7 +886,6 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	cfqg->vdisktime = st->min_vdisktime;
 
 	__cfq_group_service_tree_add(st, cfqg);
-	cfqg->on_st = true;
 	st->total_weight += cfqg->weight;
 }
 
@@ -894,9 +894,6 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
 	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 
-	if (st->active == &cfqg->rb_node)
-		st->active = NULL;
-
 	BUG_ON(cfqg->nr_cfqq < 1);
 	cfqg->nr_cfqq--;
 
@@ -905,7 +902,6 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 		return;
 
 	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
-	cfqg->on_st = false;
 	st->total_weight -= cfqg->weight;
 	if (!RB_EMPTY_NODE(&cfqg->rb_node))
 		cfq_rb_erase(&cfqg->rb_node, st);
@@ -1026,11 +1022,11 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
 	 * elevator which will be dropped by either elevator exit
 	 * or cgroup deletion path depending on who is exiting first.
 	 */
-	atomic_set(&cfqg->ref, 1);
+	cfqg->ref = 1;
 
 	/*
 	 * Add group onto cgroup list. It might happen that bdi->dev is
-	 * not initiliazed yet. Initialize this new group without major
+	 * not initialized yet. Initialize this new group without major
 	 * and minor info and this info will be filled in once a new thread
 	 * comes for IO. See code above.
 	 */
@@ -1071,7 +1067,7 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
 
 static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
 {
-	atomic_inc(&cfqg->ref);
+	cfqg->ref++;
 	return cfqg;
 }
 
@@ -1083,7 +1079,7 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
 
 	cfqq->cfqg = cfqg;
 	/* cfqq reference on cfqg */
-	atomic_inc(&cfqq->cfqg->ref);
+	cfqq->cfqg->ref++;
 }
 
 static void cfq_put_cfqg(struct cfq_group *cfqg)
@@ -1091,11 +1087,12 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
 	struct cfq_rb_root *st;
 	int i, j;
 
-	BUG_ON(atomic_read(&cfqg->ref) <= 0);
-	if (!atomic_dec_and_test(&cfqg->ref))
+	BUG_ON(cfqg->ref <= 0);
+	cfqg->ref--;
+	if (cfqg->ref)
 		return;
 	for_each_cfqg_st(cfqg, i, j, st)
-		BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
+		BUG_ON(!RB_EMPTY_ROOT(&st->rb));
 	kfree(cfqg);
 }
 
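
All of the atomic_t to int conversions in this file lean on one invariant: every cfqq/cfqg get and put already runs with the queue spinlock held, so the atomic operations were pure overhead. A toy user-space illustration of that rule (a pthread mutex standing in for q->queue_lock; not kernel code):

    #include <pthread.h>
    #include <stdlib.h>

    struct obj {
            pthread_mutex_t lock;
            int ref;                /* protected by ->lock, no atomics needed */
    };

    static void obj_get(struct obj *o)
    {
            pthread_mutex_lock(&o->lock);
            o->ref++;
            pthread_mutex_unlock(&o->lock);
    }

    static void obj_put(struct obj *o)
    {
            int free_it;

            pthread_mutex_lock(&o->lock);
            free_it = (--o->ref == 0);
            pthread_mutex_unlock(&o->lock);
            if (free_it)
                    free(o);
    }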
@@ -1200,7 +1197,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		cfq_group_service_tree_del(cfqd, cfqq->cfqg);
 		cfqq->orig_cfqg = cfqq->cfqg;
 		cfqq->cfqg = &cfqd->root_group;
-		atomic_inc(&cfqd->root_group.ref);
+		cfqd->root_group.ref++;
 		group_changed = 1;
 	} else if (!cfqd->cfq_group_isolation
 		   && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
@@ -1672,8 +1669,11 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	/*
 	 * store what was left of this slice, if the queue idled/timed out
 	 */
-	if (timed_out && !cfq_cfqq_slice_new(cfqq)) {
-		cfqq->slice_resid = cfqq->slice_end - jiffies;
+	if (timed_out) {
+		if (cfq_cfqq_slice_new(cfqq))
+			cfqq->slice_resid = cfq_scaled_group_slice(cfqd, cfqq);
+		else
+			cfqq->slice_resid = cfqq->slice_end - jiffies;
 		cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
 	}
 
@@ -1687,9 +1687,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	if (cfqq == cfqd->active_queue)
 		cfqd->active_queue = NULL;
 
-	if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
-		cfqd->grp_service_tree.active = NULL;
-
 	if (cfqd->active_cic) {
 		put_io_context(cfqd->active_cic->ioc);
 		cfqd->active_cic = NULL;
@@ -1901,10 +1898,10 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	 * in their service tree.
 	 */
 	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
-		return 1;
+		return true;
 	cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
 			service_tree->count);
-	return 0;
+	return false;
 }
 
 static void cfq_arm_slice_timer(struct cfq_data *cfqd)
@@ -2040,7 +2037,7 @@ static int cfqq_process_refs(struct cfq_queue *cfqq)
 	int process_refs, io_refs;
 
 	io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
-	process_refs = atomic_read(&cfqq->ref) - io_refs;
+	process_refs = cfqq->ref - io_refs;
 	BUG_ON(process_refs < 0);
 	return process_refs;
 }
@@ -2080,10 +2077,10 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
 	 */
 	if (new_process_refs >= process_refs) {
 		cfqq->new_cfqq = new_cfqq;
-		atomic_add(process_refs, &new_cfqq->ref);
+		new_cfqq->ref += process_refs;
 	} else {
 		new_cfqq->new_cfqq = cfqq;
-		atomic_add(new_process_refs, &cfqq->ref);
+		cfqq->ref += new_process_refs;
 	}
 }
 
@@ -2116,12 +2113,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	unsigned count;
 	struct cfq_rb_root *st;
 	unsigned group_slice;
-
-	if (!cfqg) {
-		cfqd->serving_prio = IDLE_WORKLOAD;
-		cfqd->workload_expires = jiffies + 1;
-		return;
-	}
+	enum wl_prio_t original_prio = cfqd->serving_prio;
 
 	/* Choose next priority. RT > BE > IDLE */
 	if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
@@ -2134,6 +2126,9 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
 		return;
 	}
 
+	if (original_prio != cfqd->serving_prio)
+		goto new_workload;
+
 	/*
 	 * For RT and BE, we have to choose also the type
 	 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
@@ -2148,6 +2143,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	if (count && !time_after(jiffies, cfqd->workload_expires))
 		return;
 
+new_workload:
 	/* otherwise select new workload type */
 	cfqd->serving_type =
 		cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
@@ -2199,7 +2195,6 @@ static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
 	if (RB_EMPTY_ROOT(&st->rb))
 		return NULL;
 	cfqg = cfq_rb_first_group(st);
-	st->active = &cfqg->rb_node;
 	update_min_vdisktime(st);
 	return cfqg;
 }
@@ -2293,6 +2288,17 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 		goto keep_queue;
 	}
 
+	/*
+	 * This is a deep seek queue, but the device is much faster than
+	 * the queue can deliver, don't idle
+	 **/
+	if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
+	    (cfq_cfqq_slice_new(cfqq) ||
+	    (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {
+		cfq_clear_cfqq_deep(cfqq);
+		cfq_clear_cfqq_idle_window(cfqq);
+	}
+
 	if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
 		cfqq = NULL;
 		goto keep_queue;
@@ -2367,12 +2373,12 @@ static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
 {
 	/* the queue hasn't finished any request, can't estimate */
 	if (cfq_cfqq_slice_new(cfqq))
-		return 1;
+		return true;
 	if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
 		cfqq->slice_end))
-		return 1;
+		return true;
 
-	return 0;
+	return false;
 }
 
 static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
@@ -2538,9 +2544,10 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
 	struct cfq_data *cfqd = cfqq->cfqd;
 	struct cfq_group *cfqg, *orig_cfqg;
 
-	BUG_ON(atomic_read(&cfqq->ref) <= 0);
+	BUG_ON(cfqq->ref <= 0);
 
-	if (!atomic_dec_and_test(&cfqq->ref))
+	cfqq->ref--;
+	if (cfqq->ref)
 		return;
 
 	cfq_log_cfqq(cfqd, cfqq, "put_queue");
@@ -2843,7 +2850,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	RB_CLEAR_NODE(&cfqq->p_node);
 	INIT_LIST_HEAD(&cfqq->fifo);
 
-	atomic_set(&cfqq->ref, 0);
+	cfqq->ref = 0;
 	cfqq->cfqd = cfqd;
 
 	cfq_mark_cfqq_prio_changed(cfqq);
@@ -2979,11 +2986,11 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
 	 * pin the queue now that it's allocated, scheduler exit will prune it
 	 */
 	if (!is_sync && !(*async_cfqq)) {
-		atomic_inc(&cfqq->ref);
+		cfqq->ref++;
 		*async_cfqq = cfqq;
 	}
 
-	atomic_inc(&cfqq->ref);
+	cfqq->ref++;
 	return cfqq;
 }
 
@@ -3265,6 +3272,10 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 	if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
 		return true;
 
+	/* An idle queue should not be idle now for some reason */
+	if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
+		return true;
+
 	if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
 		return false;
 
@@ -3284,10 +3295,19 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
  */
 static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
+	struct cfq_queue *old_cfqq = cfqd->active_queue;
+
 	cfq_log_cfqq(cfqd, cfqq, "preempt");
 	cfq_slice_expired(cfqd, 1);
 
 	/*
+	 * workload type is changed, don't save slice, otherwise preempt
+	 * doesn't happen
+	 */
+	if (cfqq_type(old_cfqq) != cfqq_type(cfqq))
+		cfqq->cfqg->saved_workload_slice = 0;
+
+	/*
 	 * Put the new queue at the front of the of the current list,
 	 * so we know that it will be selected next.
 	 */
@@ -3681,13 +3701,13 @@ new_queue:
 	}
 
 	cfqq->allocated[rw]++;
-	atomic_inc(&cfqq->ref);
-
-	spin_unlock_irqrestore(q->queue_lock, flags);
-
+	cfqq->ref++;
 	rq->elevator_private = cic;
 	rq->elevator_private2 = cfqq;
 	rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
+
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
 	return 0;
 
 queue_fail:
@@ -3862,6 +3882,10 @@ static void *cfq_init_queue(struct request_queue *q)
 	if (!cfqd)
 		return NULL;
 
+	/*
+	 * Don't need take queue_lock in the routine, since we are
+	 * initializing the ioscheduler, and nobody is using cfqd
+	 */
 	cfqd->cic_index = i;
 
 	/* Init root service tree */
@@ -3881,7 +3905,7 @@ static void *cfq_init_queue(struct request_queue *q)
 	 * Take a reference to root group which we never drop. This is just
 	 * to make sure that cfq_put_cfqg() does not try to kfree root group
 	 */
-	atomic_set(&cfqg->ref, 1);
+	cfqg->ref = 1;
 	rcu_read_lock();
 	cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
 					(void *)cfqd, 0);
@@ -3901,7 +3925,7 @@ static void *cfq_init_queue(struct request_queue *q)
 	 * will not attempt to free it.
 	 */
 	cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
-	atomic_inc(&cfqd->oom_cfqq.ref);
+	cfqd->oom_cfqq.ref++;
 	cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
 
 	INIT_LIST_HEAD(&cfqd->cic_list);
diff --git a/block/genhd.c b/block/genhd.c index 5fa2b44a72ff..6a5b772aa201 100644 --- a/block/genhd.c +++ b/block/genhd.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/buffer_head.h> | 18 | #include <linux/buffer_head.h> |
19 | #include <linux/mutex.h> | 19 | #include <linux/mutex.h> |
20 | #include <linux/idr.h> | 20 | #include <linux/idr.h> |
21 | #include <linux/log2.h> | ||
21 | 22 | ||
22 | #include "blk.h" | 23 | #include "blk.h" |
23 | 24 | ||
@@ -35,6 +36,10 @@ static DEFINE_IDR(ext_devt_idr); | |||
35 | 36 | ||
36 | static struct device_type disk_type; | 37 | static struct device_type disk_type; |
37 | 38 | ||
39 | static void disk_add_events(struct gendisk *disk); | ||
40 | static void disk_del_events(struct gendisk *disk); | ||
41 | static void disk_release_events(struct gendisk *disk); | ||
42 | |||
38 | /** | 43 | /** |
39 | * disk_get_part - get partition | 44 | * disk_get_part - get partition |
40 | * @disk: disk to look partition from | 45 | * @disk: disk to look partition from |
@@ -239,7 +244,7 @@ static struct blk_major_name { | |||
239 | } *major_names[BLKDEV_MAJOR_HASH_SIZE]; | 244 | } *major_names[BLKDEV_MAJOR_HASH_SIZE]; |
240 | 245 | ||
241 | /* index in the above - for now: assume no multimajor ranges */ | 246 | /* index in the above - for now: assume no multimajor ranges */ |
242 | static inline int major_to_index(int major) | 247 | static inline int major_to_index(unsigned major) |
243 | { | 248 | { |
244 | return major % BLKDEV_MAJOR_HASH_SIZE; | 249 | return major % BLKDEV_MAJOR_HASH_SIZE; |
245 | } | 250 | } |
@@ -502,6 +507,64 @@ static int exact_lock(dev_t devt, void *data) | |||
502 | return 0; | 507 | return 0; |
503 | } | 508 | } |
504 | 509 | ||
510 | void register_disk(struct gendisk *disk) | ||
511 | { | ||
512 | struct device *ddev = disk_to_dev(disk); | ||
513 | struct block_device *bdev; | ||
514 | struct disk_part_iter piter; | ||
515 | struct hd_struct *part; | ||
516 | int err; | ||
517 | |||
518 | ddev->parent = disk->driverfs_dev; | ||
519 | |||
520 | dev_set_name(ddev, disk->disk_name); | ||
521 | |||
522 | /* delay uevents, until we scanned partition table */ | ||
523 | dev_set_uevent_suppress(ddev, 1); | ||
524 | |||
525 | if (device_add(ddev)) | ||
526 | return; | ||
527 | if (!sysfs_deprecated) { | ||
528 | err = sysfs_create_link(block_depr, &ddev->kobj, | ||
529 | kobject_name(&ddev->kobj)); | ||
530 | if (err) { | ||
531 | device_del(ddev); | ||
532 | return; | ||
533 | } | ||
534 | } | ||
535 | disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); | ||
536 | disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); | ||
537 | |||
538 | /* No minors to use for partitions */ | ||
539 | if (!disk_partitionable(disk)) | ||
540 | goto exit; | ||
541 | |||
542 | /* No such device (e.g., media were just removed) */ | ||
543 | if (!get_capacity(disk)) | ||
544 | goto exit; | ||
545 | |||
546 | bdev = bdget_disk(disk, 0); | ||
547 | if (!bdev) | ||
548 | goto exit; | ||
549 | |||
550 | bdev->bd_invalidated = 1; | ||
551 | err = blkdev_get(bdev, FMODE_READ, NULL); | ||
552 | if (err < 0) | ||
553 | goto exit; | ||
554 | blkdev_put(bdev, FMODE_READ); | ||
555 | |||
556 | exit: | ||
557 | /* announce disk after possible partitions are created */ | ||
558 | dev_set_uevent_suppress(ddev, 0); | ||
559 | kobject_uevent(&ddev->kobj, KOBJ_ADD); | ||
560 | |||
561 | /* announce possible partitions */ | ||
562 | disk_part_iter_init(&piter, disk, 0); | ||
563 | while ((part = disk_part_iter_next(&piter))) | ||
564 | kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); | ||
565 | disk_part_iter_exit(&piter); | ||
566 | } | ||
567 | |||
505 | /** | 568 | /** |
506 | * add_disk - add partitioning information to kernel list | 569 | * add_disk - add partitioning information to kernel list |
507 | * @disk: per-device partitioning information | 570 | * @disk: per-device partitioning information |
@@ -551,18 +614,48 @@ void add_disk(struct gendisk *disk) | |||
551 | retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, | 614 | retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, |
552 | "bdi"); | 615 | "bdi"); |
553 | WARN_ON(retval); | 616 | WARN_ON(retval); |
554 | } | ||
555 | 617 | ||
618 | disk_add_events(disk); | ||
619 | } | ||
556 | EXPORT_SYMBOL(add_disk); | 620 | EXPORT_SYMBOL(add_disk); |
557 | EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */ | ||
558 | 621 | ||
559 | void unlink_gendisk(struct gendisk *disk) | 622 | void del_gendisk(struct gendisk *disk) |
560 | { | 623 | { |
624 | struct disk_part_iter piter; | ||
625 | struct hd_struct *part; | ||
626 | |||
627 | disk_del_events(disk); | ||
628 | |||
629 | /* invalidate stuff */ | ||
630 | disk_part_iter_init(&piter, disk, | ||
631 | DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); | ||
632 | while ((part = disk_part_iter_next(&piter))) { | ||
633 | invalidate_partition(disk, part->partno); | ||
634 | delete_partition(disk, part->partno); | ||
635 | } | ||
636 | disk_part_iter_exit(&piter); | ||
637 | |||
638 | invalidate_partition(disk, 0); | ||
639 | blk_free_devt(disk_to_dev(disk)->devt); | ||
640 | set_capacity(disk, 0); | ||
641 | disk->flags &= ~GENHD_FL_UP; | ||
642 | |||
561 | sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); | 643 | sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); |
562 | bdi_unregister(&disk->queue->backing_dev_info); | 644 | bdi_unregister(&disk->queue->backing_dev_info); |
563 | blk_unregister_queue(disk); | 645 | blk_unregister_queue(disk); |
564 | blk_unregister_region(disk_devt(disk), disk->minors); | 646 | blk_unregister_region(disk_devt(disk), disk->minors); |
647 | |||
648 | part_stat_set_all(&disk->part0, 0); | ||
649 | disk->part0.stamp = 0; | ||
650 | |||
651 | kobject_put(disk->part0.holder_dir); | ||
652 | kobject_put(disk->slave_dir); | ||
653 | disk->driverfs_dev = NULL; | ||
654 | if (!sysfs_deprecated) | ||
655 | sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); | ||
656 | device_del(disk_to_dev(disk)); | ||
565 | } | 657 | } |
658 | EXPORT_SYMBOL(del_gendisk); | ||
566 | 659 | ||
567 | /** | 660 | /** |
568 | * get_gendisk - get partitioning information for a given device | 661 | * get_gendisk - get partitioning information for a given device |
@@ -735,7 +828,7 @@ static void *show_partition_start(struct seq_file *seqf, loff_t *pos) | |||
735 | static void *p; | 828 | static void *p; |
736 | 829 | ||
737 | p = disk_seqf_start(seqf, pos); | 830 | p = disk_seqf_start(seqf, pos); |
738 | if (!IS_ERR(p) && p && !*pos) | 831 | if (!IS_ERR_OR_NULL(p) && !*pos) |
739 | seq_puts(seqf, "major minor #blocks name\n\n"); | 832 | seq_puts(seqf, "major minor #blocks name\n\n"); |
740 | return p; | 833 | return p; |
741 | } | 834 | } |
@@ -1005,6 +1098,7 @@ static void disk_release(struct device *dev) | |||
1005 | { | 1098 | { |
1006 | struct gendisk *disk = dev_to_disk(dev); | 1099 | struct gendisk *disk = dev_to_disk(dev); |
1007 | 1100 | ||
1101 | disk_release_events(disk); | ||
1008 | kfree(disk->random); | 1102 | kfree(disk->random); |
1009 | disk_replace_part_tbl(disk, NULL); | 1103 | disk_replace_part_tbl(disk, NULL); |
1010 | free_part_stats(&disk->part0); | 1104 | free_part_stats(&disk->part0); |
@@ -1110,29 +1204,6 @@ static int __init proc_genhd_init(void) | |||
1110 | module_init(proc_genhd_init); | 1204 | module_init(proc_genhd_init); |
1111 | #endif /* CONFIG_PROC_FS */ | 1205 | #endif /* CONFIG_PROC_FS */ |
1112 | 1206 | ||
1113 | static void media_change_notify_thread(struct work_struct *work) | ||
1114 | { | ||
1115 | struct gendisk *gd = container_of(work, struct gendisk, async_notify); | ||
1116 | char event[] = "MEDIA_CHANGE=1"; | ||
1117 | char *envp[] = { event, NULL }; | ||
1118 | |||
1119 | /* | ||
1120 | * set enviroment vars to indicate which event this is for | ||
1121 | * so that user space will know to go check the media status. | ||
1122 | */ | ||
1123 | kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); | ||
1124 | put_device(gd->driverfs_dev); | ||
1125 | } | ||
1126 | |||
1127 | #if 0 | ||
1128 | void genhd_media_change_notify(struct gendisk *disk) | ||
1129 | { | ||
1130 | get_device(disk->driverfs_dev); | ||
1131 | schedule_work(&disk->async_notify); | ||
1132 | } | ||
1133 | EXPORT_SYMBOL_GPL(genhd_media_change_notify); | ||
1134 | #endif /* 0 */ | ||
1135 | |||
1136 | dev_t blk_lookup_devt(const char *name, int partno) | 1207 | dev_t blk_lookup_devt(const char *name, int partno) |
1137 | { | 1208 | { |
1138 | dev_t devt = MKDEV(0, 0); | 1209 | dev_t devt = MKDEV(0, 0); |
@@ -1193,13 +1264,13 @@ struct gendisk *alloc_disk_node(int minors, int node_id) | |||
1193 | } | 1264 | } |
1194 | disk->part_tbl->part[0] = &disk->part0; | 1265 | disk->part_tbl->part[0] = &disk->part0; |
1195 | 1266 | ||
1267 | hd_ref_init(&disk->part0); | ||
1268 | |||
1196 | disk->minors = minors; | 1269 | disk->minors = minors; |
1197 | rand_initialize_disk(disk); | 1270 | rand_initialize_disk(disk); |
1198 | disk_to_dev(disk)->class = &block_class; | 1271 | disk_to_dev(disk)->class = &block_class; |
1199 | disk_to_dev(disk)->type = &disk_type; | 1272 | disk_to_dev(disk)->type = &disk_type; |
1200 | device_initialize(disk_to_dev(disk)); | 1273 | device_initialize(disk_to_dev(disk)); |
1201 | INIT_WORK(&disk->async_notify, | ||
1202 | media_change_notify_thread); | ||
1203 | } | 1274 | } |
1204 | return disk; | 1275 | return disk; |
1205 | } | 1276 | } |
@@ -1291,3 +1362,422 @@ int invalidate_partition(struct gendisk *disk, int partno) | |||
1291 | } | 1362 | } |
1292 | 1363 | ||
1293 | EXPORT_SYMBOL(invalidate_partition); | 1364 | EXPORT_SYMBOL(invalidate_partition); |
1365 | |||
1366 | /* | ||
1367 | * Disk events - monitor disk events like media change and eject request. | ||
1368 | */ | ||
1369 | struct disk_events { | ||
1370 | struct list_head node; /* all disk_event's */ | ||
1371 | struct gendisk *disk; /* the associated disk */ | ||
1372 | spinlock_t lock; | ||
1373 | |||
1374 | int block; /* event blocking depth */ | ||
1375 | unsigned int pending; /* events already sent out */ | ||
1376 | unsigned int clearing; /* events being cleared */ | ||
1377 | |||
1378 | long poll_msecs; /* interval, -1 for default */ | ||
1379 | struct delayed_work dwork; | ||
1380 | }; | ||
1381 | |||
1382 | static const char *disk_events_strs[] = { | ||
1383 | [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change", | ||
1384 | [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request", | ||
1385 | }; | ||
1386 | |||
1387 | static char *disk_uevents[] = { | ||
1388 | [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1", | ||
1389 | [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1", | ||
1390 | }; | ||
1391 | |||
1392 | /* list of all disk_events */ | ||
1393 | static DEFINE_MUTEX(disk_events_mutex); | ||
1394 | static LIST_HEAD(disk_events); | ||
1395 | |||
1396 | /* disable in-kernel polling by default */ | ||
1397 | static unsigned long disk_events_dfl_poll_msecs = 0; | ||
1398 | |||
1399 | static unsigned long disk_events_poll_jiffies(struct gendisk *disk) | ||
1400 | { | ||
1401 | struct disk_events *ev = disk->ev; | ||
1402 | long intv_msecs = 0; | ||
1403 | |||
1404 | /* | ||
1405 | * If device-specific poll interval is set, always use it. If | ||
1406 | * the default is being used, poll iff there are events which | ||
1407 | * can't be monitored asynchronously. | ||
1408 | */ | ||
1409 | if (ev->poll_msecs >= 0) | ||
1410 | intv_msecs = ev->poll_msecs; | ||
1411 | else if (disk->events & ~disk->async_events) | ||
1412 | intv_msecs = disk_events_dfl_poll_msecs; | ||
1413 | |||
1414 | return msecs_to_jiffies(intv_msecs); | ||
1415 | } | ||
1416 | |||
1417 | static void __disk_block_events(struct gendisk *disk, bool sync) | ||
1418 | { | ||
1419 | struct disk_events *ev = disk->ev; | ||
1420 | unsigned long flags; | ||
1421 | bool cancel; | ||
1422 | |||
1423 | spin_lock_irqsave(&ev->lock, flags); | ||
1424 | cancel = !ev->block++; | ||
1425 | spin_unlock_irqrestore(&ev->lock, flags); | ||
1426 | |||
1427 | if (cancel) { | ||
1428 | if (sync) | ||
1429 | cancel_delayed_work_sync(&disk->ev->dwork); | ||
1430 | else | ||
1431 | cancel_delayed_work(&disk->ev->dwork); | ||
1432 | } | ||
1433 | } | ||
1434 | |||
1435 | static void __disk_unblock_events(struct gendisk *disk, bool check_now) | ||
1436 | { | ||
1437 | struct disk_events *ev = disk->ev; | ||
1438 | unsigned long intv; | ||
1439 | unsigned long flags; | ||
1440 | |||
1441 | spin_lock_irqsave(&ev->lock, flags); | ||
1442 | |||
1443 | if (WARN_ON_ONCE(ev->block <= 0)) | ||
1444 | goto out_unlock; | ||
1445 | |||
1446 | if (--ev->block) | ||
1447 | goto out_unlock; | ||
1448 | |||
1449 | /* | ||
1450 | * Not exactly a latency-critical operation; set poll timer | ||
1451 | * slack to 25% and kick an event check. | ||
1452 | */ | ||
1453 | intv = disk_events_poll_jiffies(disk); | ||
1454 | set_timer_slack(&ev->dwork.timer, intv / 4); | ||
1455 | if (check_now) | ||
1456 | queue_delayed_work(system_nrt_wq, &ev->dwork, 0); | ||
1457 | else if (intv) | ||
1458 | queue_delayed_work(system_nrt_wq, &ev->dwork, intv); | ||
1459 | out_unlock: | ||
1460 | spin_unlock_irqrestore(&ev->lock, flags); | ||
1461 | } | ||
1462 | |||
1463 | /** | ||
1464 | * disk_block_events - block and flush disk event checking | ||
1465 | * @disk: disk to block events for | ||
1466 | * | ||
1467 | * On return from this function, it is guaranteed that event checking | ||
1468 | * isn't in progress and won't happen until unblocked by | ||
1469 | * disk_unblock_events(). Events blocking is counted and the actual | ||
1470 | * unblocking happens after the matching number of unblocks are done. | ||
1471 | * | ||
1472 | * Note that this intentionally does not block event checking from | ||
1473 | * disk_clear_events(). | ||
1474 | * | ||
1475 | * CONTEXT: | ||
1476 | * Might sleep. | ||
1477 | */ | ||
1478 | void disk_block_events(struct gendisk *disk) | ||
1479 | { | ||
1480 | if (disk->ev) | ||
1481 | __disk_block_events(disk, true); | ||
1482 | } | ||
1483 | |||
1484 | /** | ||
1485 | * disk_unblock_events - unblock disk event checking | ||
1486 | * @disk: disk to unblock events for | ||
1487 | * | ||
1488 | * Undo disk_block_events(). When the block count reaches zero, it | ||
1489 | * starts events polling if configured. | ||
1490 | * | ||
1491 | * CONTEXT: | ||
1492 | * Don't care. Safe to call from irq context. | ||
1493 | */ | ||
1494 | void disk_unblock_events(struct gendisk *disk) | ||
1495 | { | ||
1496 | if (disk->ev) | ||
1497 | __disk_unblock_events(disk, true); | ||
1498 | } | ||
1499 | |||
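
For illustration, a minimal sketch of how a caller might pair the two entry points above; the function name and call site are hypothetical, not part of this patch:

	/*
	 * Suppress event checking around work that must not race with
	 * ->check_events().  Blocking nests, so this is safe even when
	 * the caller already runs under disk_block_events().
	 */
	static void example_quiesce_and_work(struct gendisk *disk)
	{
		disk_block_events(disk);	/* flushes any in-flight check */
		/* ... work that must not race with event checking ... */
		disk_unblock_events(disk);	/* restarts polling if configured */
	}

Both calls may nest freely because the blocking depth is counted in disk_events->block.
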
1500 | /** | ||
1501 | * disk_check_events - schedule immediate event checking | ||
1502 | * @disk: disk to check events for | ||
1503 | * | ||
1504 | * Schedule immediate event checking on @disk if not blocked. | ||
1505 | * | ||
1506 | * CONTEXT: | ||
1507 | * Don't care. Safe to call from irq context. | ||
1508 | */ | ||
1509 | void disk_check_events(struct gendisk *disk) | ||
1510 | { | ||
1511 | if (disk->ev) { | ||
1512 | __disk_block_events(disk, false); | ||
1513 | __disk_unblock_events(disk, true); | ||
1514 | } | ||
1515 | } | ||
1516 | EXPORT_SYMBOL_GPL(disk_check_events); | ||
1517 | |||
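
Since the kernel-doc notes disk_check_events() is safe from irq context, a driver with an "attention" interrupt could kick an immediate check straight from its handler. A hedged sketch, with the driver-private structure invented for illustration:

	#include <linux/interrupt.h>

	/* Hypothetical driver state; not part of this patch. */
	struct example_dev {
		struct gendisk *disk;
	};

	static irqreturn_t example_isr(int irq, void *data)
	{
		struct example_dev *edev = data;

		disk_check_events(edev->disk);	/* schedule an immediate check */
		return IRQ_HANDLED;
	}
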
1518 | /** | ||
1519 | * disk_clear_events - synchronously check, clear and return pending events | ||
1520 | * @disk: disk to fetch and clear events from | ||
1521 | * @mask: mask of events to be fetched and cleared | ||
1522 | * | ||
1523 | * Disk events are synchronously checked and pending events in @mask | ||
1524 | * are cleared and returned. This ignores the block count. | ||
1525 | * | ||
1526 | * CONTEXT: | ||
1527 | * Might sleep. | ||
1528 | */ | ||
1529 | unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) | ||
1530 | { | ||
1531 | const struct block_device_operations *bdops = disk->fops; | ||
1532 | struct disk_events *ev = disk->ev; | ||
1533 | unsigned int pending; | ||
1534 | |||
1535 | if (!ev) { | ||
1536 | /* for drivers still using the old ->media_changed method */ | ||
1537 | if ((mask & DISK_EVENT_MEDIA_CHANGE) && | ||
1538 | bdops->media_changed && bdops->media_changed(disk)) | ||
1539 | return DISK_EVENT_MEDIA_CHANGE; | ||
1540 | return 0; | ||
1541 | } | ||
1542 | |||
1543 | /* tell the workfn about the events being cleared */ | ||
1544 | spin_lock_irq(&ev->lock); | ||
1545 | ev->clearing |= mask; | ||
1546 | spin_unlock_irq(&ev->lock); | ||
1547 | |||
1548 | /* unconditionally schedule event check and wait for it to finish */ | ||
1549 | __disk_block_events(disk, true); | ||
1550 | queue_delayed_work(system_nrt_wq, &ev->dwork, 0); | ||
1551 | flush_delayed_work(&ev->dwork); | ||
1552 | __disk_unblock_events(disk, false); | ||
1553 | |||
1554 | /* then, fetch and clear pending events */ | ||
1555 | spin_lock_irq(&ev->lock); | ||
1556 | WARN_ON_ONCE(ev->clearing & mask); /* cleared by workfn */ | ||
1557 | pending = ev->pending & mask; | ||
1558 | ev->pending &= ~mask; | ||
1559 | spin_unlock_irq(&ev->lock); | ||
1560 | |||
1561 | return pending; | ||
1562 | } | ||
1563 | |||
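
As a usage illustration, a process-context caller (the kernel-doc says disk_clear_events() might sleep) could fetch-and-clear the media-change bit like this; the wrapping helper is hypothetical:

	/* Has the media changed since the last time we asked? */
	static bool example_media_changed(struct gendisk *disk)
	{
		return disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE) &
			DISK_EVENT_MEDIA_CHANGE;
	}
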
1564 | static void disk_events_workfn(struct work_struct *work) | ||
1565 | { | ||
1566 | struct delayed_work *dwork = to_delayed_work(work); | ||
1567 | struct disk_events *ev = container_of(dwork, struct disk_events, dwork); | ||
1568 | struct gendisk *disk = ev->disk; | ||
1569 | char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; | ||
1570 | unsigned int clearing = ev->clearing; | ||
1571 | unsigned int events; | ||
1572 | unsigned long intv; | ||
1573 | int nr_events = 0, i; | ||
1574 | |||
1575 | /* check events */ | ||
1576 | events = disk->fops->check_events(disk, clearing); | ||
1577 | |||
1578 | /* accumulate pending events and schedule next poll if necessary */ | ||
1579 | spin_lock_irq(&ev->lock); | ||
1580 | |||
1581 | events &= ~ev->pending; | ||
1582 | ev->pending |= events; | ||
1583 | ev->clearing &= ~clearing; | ||
1584 | |||
1585 | intv = disk_events_poll_jiffies(disk); | ||
1586 | if (!ev->block && intv) | ||
1587 | queue_delayed_work(system_nrt_wq, &ev->dwork, intv); | ||
1588 | |||
1589 | spin_unlock_irq(&ev->lock); | ||
1590 | |||
1591 | /* tell userland about new events */ | ||
1592 | for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) | ||
1593 | if (events & (1 << i)) | ||
1594 | envp[nr_events++] = disk_uevents[i]; | ||
1595 | |||
1596 | if (nr_events) | ||
1597 | kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); | ||
1598 | } | ||
1599 | |||
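
The workfn above is what drives the new ->check_events() block_device_operations callback. A driver-side sketch of such a callback, with the hardware queries invented for illustration:

	/* Hypothetical ->check_events() implementation polled by the workfn. */
	static unsigned int example_check_events(struct gendisk *disk,
						 unsigned int clearing)
	{
		struct example_dev *edev = disk->private_data;
		unsigned int events = 0;

		/* example_hw_*() are assumed hardware queries, not real APIs */
		if (example_hw_media_changed(edev))
			events |= DISK_EVENT_MEDIA_CHANGE;
		if (example_hw_eject_requested(edev))
			events |= DISK_EVENT_EJECT_REQUEST;

		return events;
	}

The workfn masks the result against what is already pending and raises uevents only for newly seen bits.
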
1600 | /* | ||
1601 | * A disk events enabled device has the following sysfs nodes under | ||
1602 | * its /sys/block/X/ directory. | ||
1603 | * | ||
1604 | * events : list of all supported events | ||
1605 | * events_async : list of events which can be detected w/o polling | ||
1606 | * events_poll_msecs : polling interval, 0: disable, -1: system default | ||
1607 | */ | ||
1608 | static ssize_t __disk_events_show(unsigned int events, char *buf) | ||
1609 | { | ||
1610 | const char *delim = ""; | ||
1611 | ssize_t pos = 0; | ||
1612 | int i; | ||
1613 | |||
1614 | for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++) | ||
1615 | if (events & (1 << i)) { | ||
1616 | pos += sprintf(buf + pos, "%s%s", | ||
1617 | delim, disk_events_strs[i]); | ||
1618 | delim = " "; | ||
1619 | } | ||
1620 | if (pos) | ||
1621 | pos += sprintf(buf + pos, "\n"); | ||
1622 | return pos; | ||
1623 | } | ||
1624 | |||
1625 | static ssize_t disk_events_show(struct device *dev, | ||
1626 | struct device_attribute *attr, char *buf) | ||
1627 | { | ||
1628 | struct gendisk *disk = dev_to_disk(dev); | ||
1629 | |||
1630 | return __disk_events_show(disk->events, buf); | ||
1631 | } | ||
1632 | |||
1633 | static ssize_t disk_events_async_show(struct device *dev, | ||
1634 | struct device_attribute *attr, char *buf) | ||
1635 | { | ||
1636 | struct gendisk *disk = dev_to_disk(dev); | ||
1637 | |||
1638 | return __disk_events_show(disk->async_events, buf); | ||
1639 | } | ||
1640 | |||
1641 | static ssize_t disk_events_poll_msecs_show(struct device *dev, | ||
1642 | struct device_attribute *attr, | ||
1643 | char *buf) | ||
1644 | { | ||
1645 | struct gendisk *disk = dev_to_disk(dev); | ||
1646 | |||
1647 | return sprintf(buf, "%ld\n", disk->ev->poll_msecs); | ||
1648 | } | ||
1649 | |||
1650 | static ssize_t disk_events_poll_msecs_store(struct device *dev, | ||
1651 | struct device_attribute *attr, | ||
1652 | const char *buf, size_t count) | ||
1653 | { | ||
1654 | struct gendisk *disk = dev_to_disk(dev); | ||
1655 | long intv; | ||
1656 | |||
1657 | if (!count || !sscanf(buf, "%ld", &intv)) | ||
1658 | return -EINVAL; | ||
1659 | |||
1660 | if (intv < 0 && intv != -1) | ||
1661 | return -EINVAL; | ||
1662 | |||
1663 | __disk_block_events(disk, true); | ||
1664 | disk->ev->poll_msecs = intv; | ||
1665 | __disk_unblock_events(disk, true); | ||
1666 | |||
1667 | return count; | ||
1668 | } | ||
1669 | |||
1670 | static const DEVICE_ATTR(events, S_IRUGO, disk_events_show, NULL); | ||
1671 | static const DEVICE_ATTR(events_async, S_IRUGO, disk_events_async_show, NULL); | ||
1672 | static const DEVICE_ATTR(events_poll_msecs, S_IRUGO|S_IWUSR, | ||
1673 | disk_events_poll_msecs_show, | ||
1674 | disk_events_poll_msecs_store); | ||
1675 | |||
1676 | static const struct attribute *disk_events_attrs[] = { | ||
1677 | &dev_attr_events.attr, | ||
1678 | &dev_attr_events_async.attr, | ||
1679 | &dev_attr_events_poll_msecs.attr, | ||
1680 | NULL, | ||
1681 | }; | ||
1682 | |||
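
From userspace the three attributes are ordinary sysfs files; a small, hedged userspace sketch ("sr0" is only an example device name):

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/block/sr0/events_poll_msecs", "w");

		if (!f) {
			perror("fopen");
			return 1;
		}
		fprintf(f, "2000\n");	/* poll every 2s; 0 disables, -1 = system default */
		fclose(f);
		return 0;
	}
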
1683 | /* | ||
1684 | * The default polling interval can be specified by the kernel | ||
1685 | * parameter block.events_dfl_poll_msecs which defaults to 0 | ||
1686 | * (disable). This can also be modified at runtime by writing to | ||
1687 | * /sys/module/block/events_dfl_poll_msecs. | ||
1688 | */ | ||
1689 | static int disk_events_set_dfl_poll_msecs(const char *val, | ||
1690 | const struct kernel_param *kp) | ||
1691 | { | ||
1692 | struct disk_events *ev; | ||
1693 | int ret; | ||
1694 | |||
1695 | ret = param_set_ulong(val, kp); | ||
1696 | if (ret < 0) | ||
1697 | return ret; | ||
1698 | |||
1699 | mutex_lock(&disk_events_mutex); | ||
1700 | |||
1701 | list_for_each_entry(ev, &disk_events, node) | ||
1702 | disk_check_events(ev->disk); | ||
1703 | |||
1704 | mutex_unlock(&disk_events_mutex); | ||
1705 | |||
1706 | return 0; | ||
1707 | } | ||
1708 | |||
1709 | static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = { | ||
1710 | .set = disk_events_set_dfl_poll_msecs, | ||
1711 | .get = param_get_ulong, | ||
1712 | }; | ||
1713 | |||
1714 | #undef MODULE_PARAM_PREFIX | ||
1715 | #define MODULE_PARAM_PREFIX "block." | ||
1716 | |||
1717 | module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, | ||
1718 | &disk_events_dfl_poll_msecs, 0644); | ||
1719 | |||
1720 | /* | ||
1721 | * disk_{add|del|release}_events - initialize and destroy disk_events. | ||
1722 | */ | ||
1723 | static void disk_add_events(struct gendisk *disk) | ||
1724 | { | ||
1725 | struct disk_events *ev; | ||
1726 | |||
1727 | if (!disk->fops->check_events || !(disk->events | disk->async_events)) | ||
1728 | return; | ||
1729 | |||
1730 | ev = kzalloc(sizeof(*ev), GFP_KERNEL); | ||
1731 | if (!ev) { | ||
1732 | pr_warn("%s: failed to initialize events\n", disk->disk_name); | ||
1733 | return; | ||
1734 | } | ||
1735 | |||
1736 | if (sysfs_create_files(&disk_to_dev(disk)->kobj, | ||
1737 | disk_events_attrs) < 0) { | ||
1738 | pr_warn("%s: failed to create sysfs files for events\n", | ||
1739 | disk->disk_name); | ||
1740 | kfree(ev); | ||
1741 | return; | ||
1742 | } | ||
1743 | |||
1744 | disk->ev = ev; | ||
1745 | |||
1746 | INIT_LIST_HEAD(&ev->node); | ||
1747 | ev->disk = disk; | ||
1748 | spin_lock_init(&ev->lock); | ||
1749 | ev->block = 1; | ||
1750 | ev->poll_msecs = -1; | ||
1751 | INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); | ||
1752 | |||
1753 | mutex_lock(&disk_events_mutex); | ||
1754 | list_add_tail(&ev->node, &disk_events); | ||
1755 | mutex_unlock(&disk_events_mutex); | ||
1756 | |||
1757 | /* | ||
1758 | * Block count is initialized to 1 and the following initial | ||
1759 | * unblock kicks it into action. | ||
1760 | */ | ||
1761 | __disk_unblock_events(disk, true); | ||
1762 | } | ||
1763 | |||
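
disk_add_events() only registers a disk that declares events and a ->check_events() callback, so driver setup would look roughly like this before the disk is registered (a sketch; the names are invented, and example_check_events() is the callback sketched earlier):

	static const struct block_device_operations example_fops = {
		.owner		= THIS_MODULE,
		.check_events	= example_check_events,
	};

	static void example_init_disk(struct gendisk *disk)
	{
		disk->fops   = &example_fops;
		disk->events = DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST;
		/* disk registration is then expected to invoke disk_add_events() */
	}
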
1764 | static void disk_del_events(struct gendisk *disk) | ||
1765 | { | ||
1766 | if (!disk->ev) | ||
1767 | return; | ||
1768 | |||
1769 | __disk_block_events(disk, true); | ||
1770 | |||
1771 | mutex_lock(&disk_events_mutex); | ||
1772 | list_del_init(&disk->ev->node); | ||
1773 | mutex_unlock(&disk_events_mutex); | ||
1774 | |||
1775 | sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs); | ||
1776 | } | ||
1777 | |||
1778 | static void disk_release_events(struct gendisk *disk) | ||
1779 | { | ||
1780 | /* the block count should be 1 from disk_del_events() */ | ||
1781 | WARN_ON_ONCE(disk->ev && disk->ev->block != 1); | ||
1782 | kfree(disk->ev); | ||
1783 | } | ||
diff --git a/block/ioctl.c b/block/ioctl.c index a9a302eba01e..9049d460fa89 100644 --- a/block/ioctl.c +++ b/block/ioctl.c | |||
@@ -294,11 +294,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, | |||
294 | return -EINVAL; | 294 | return -EINVAL; |
295 | if (get_user(n, (int __user *) arg)) | 295 | if (get_user(n, (int __user *) arg)) |
296 | return -EFAULT; | 296 | return -EFAULT; |
297 | if (!(mode & FMODE_EXCL) && bd_claim(bdev, &bdev) < 0) | 297 | if (!(mode & FMODE_EXCL) && |
298 | blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0) | ||
298 | return -EBUSY; | 299 | return -EBUSY; |
299 | ret = set_blocksize(bdev, n); | 300 | ret = set_blocksize(bdev, n); |
300 | if (!(mode & FMODE_EXCL)) | 301 | if (!(mode & FMODE_EXCL)) |
301 | bd_release(bdev); | 302 | blkdev_put(bdev, mode | FMODE_EXCL); |
302 | return ret; | 303 | return ret; |
303 | case BLKPG: | 304 | case BLKPG: |
304 | ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg); | 305 | ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg); |