author    Jonathan Herman <hermanjl@cs.unc.edu>  2013-01-17 16:15:55 -0500
committer Jonathan Herman <hermanjl@cs.unc.edu>  2013-01-17 16:15:55 -0500
commit    8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch)
tree      a8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /block
parent    406089d01562f1e2bf9f089fd7637009ebaad589 (diff)
Patched in Tegra support.
Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig | 9
-rw-r--r--  block/Kconfig.iosched | 4
-rw-r--r--  block/Makefile | 3
-rw-r--r--  block/blk-cgroup.c | 2131
-rw-r--r--  block/blk-cgroup.h | 765
-rw-r--r--  block/blk-core.c | 1139
-rw-r--r--  block/blk-exec.c | 27
-rw-r--r--  block/blk-integrity.c | 1
-rw-r--r--  block/blk-ioc.c | 410
-rw-r--r--  block/blk-lib.c | 151
-rw-r--r--  block/blk-map.c | 2
-rw-r--r--  block/blk-merge.c | 193
-rw-r--r--  block/blk-settings.c | 53
-rw-r--r--  block/blk-softirq.c | 16
-rw-r--r--  block/blk-sysfs.c | 101
-rw-r--r--  block/blk-tag.c | 17
-rw-r--r--  block/blk-throttle.c | 770
-rw-r--r--  block/blk-timeout.c | 41
-rw-r--r--  block/blk.h | 107
-rw-r--r--  block/bsg-lib.c | 68
-rw-r--r--  block/bsg.c | 9
-rw-r--r--  block/cfq-iosched.c | 1655
-rw-r--r--  block/compat_ioctl.c | 3
-rw-r--r--  block/deadline-iosched.c | 14
-rw-r--r--  block/elevator.c | 366
-rw-r--r--  block/genhd.c | 104
-rw-r--r--  block/ioctl.c | 121
-rw-r--r--  block/noop-iosched.c | 12
-rw-r--r--  block/partition-generic.c | 571
-rw-r--r--  block/partitions/Kconfig | 251
-rw-r--r--  block/partitions/Makefile | 20
-rw-r--r--  block/partitions/acorn.c | 556
-rw-r--r--  block/partitions/acorn.h | 14
-rw-r--r--  block/partitions/amiga.c | 139
-rw-r--r--  block/partitions/amiga.h | 6
-rw-r--r--  block/partitions/atari.c | 149
-rw-r--r--  block/partitions/atari.h | 34
-rw-r--r--  block/partitions/check.c | 166
-rw-r--r--  block/partitions/check.h | 52
-rw-r--r--  block/partitions/efi.c | 670
-rw-r--r--  block/partitions/efi.h | 134
-rw-r--r--  block/partitions/ibm.c | 364
-rw-r--r--  block/partitions/ibm.h | 1
-rw-r--r--  block/partitions/karma.c | 57
-rw-r--r--  block/partitions/karma.h | 8
-rw-r--r--  block/partitions/ldm.c | 1567
-rw-r--r--  block/partitions/ldm.h | 215
-rw-r--r--  block/partitions/mac.c | 134
-rw-r--r--  block/partitions/mac.h | 44
-rw-r--r--  block/partitions/msdos.c | 569
-rw-r--r--  block/partitions/msdos.h | 8
-rw-r--r--  block/partitions/osf.c | 86
-rw-r--r--  block/partitions/osf.h | 7
-rw-r--r--  block/partitions/sgi.c | 82
-rw-r--r--  block/partitions/sgi.h | 8
-rw-r--r--  block/partitions/sun.c | 122
-rw-r--r--  block/partitions/sun.h | 8
-rw-r--r--  block/partitions/sysv68.c | 95
-rw-r--r--  block/partitions/sysv68.h | 1
-rw-r--r--  block/partitions/ultrix.c | 48
-rw-r--r--  block/partitions/ultrix.h | 5
-rw-r--r--  block/scsi_ioctl.c | 58
62 files changed, 4105 insertions, 10436 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 4a85ccf8d4c..e97934eecec 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -4,7 +4,6 @@
 menuconfig BLOCK
        bool "Enable the block layer" if EXPERT
        default y
-       select PERCPU_RWSEM
        help
         Provide block layer support for the kernel.
 
@@ -90,7 +89,7 @@ config BLK_DEV_INTEGRITY
 
 config BLK_DEV_THROTTLING
        bool "Block layer bio throttling support"
-       depends on BLK_CGROUP=y
+       depends on BLK_CGROUP=y && EXPERIMENTAL
        default n
        ---help---
        Block layer bio throttling support. It can be used to limit
@@ -100,12 +99,6 @@ config BLK_DEV_THROTTLING
 
        See Documentation/cgroups/blkio-controller.txt for more information.
 
-menu "Partition Types"
-
-source "block/partitions/Kconfig"
-
-endmenu
-
 endif # BLOCK
 
 config BLOCK_COMPAT
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 421bef9c4c4..3199b76f795 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -23,6 +23,8 @@ config IOSCHED_DEADLINE
 
 config IOSCHED_CFQ
        tristate "CFQ I/O scheduler"
+       # If BLK_CGROUP is a module, CFQ has to be built as module.
+       depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y
        default y
        ---help---
          The CFQ I/O scheduler tries to distribute bandwidth equally
@@ -32,6 +34,8 @@ config IOSCHED_CFQ
 
          This is the default I/O scheduler.
 
+         Note: If BLK_CGROUP=m, then CFQ can be built only as module.
+
 config CFQ_GROUP_IOSCHED
        bool "CFQ Group Scheduling support"
        depends on IOSCHED_CFQ && BLK_CGROUP
diff --git a/block/Makefile b/block/Makefile
index 39b76ba66ff..514c6e4f427 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,8 +5,7 @@
 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
                        blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
                        blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
-                       blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o \
-                       partition-generic.o partitions/
+                       blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
 
 obj-$(CONFIG_BLK_DEV_BSG)      += bsg.o
 obj-$(CONFIG_BLK_DEV_BSGLIB)   += bsg-lib.o
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b8858fb0caf..b596e54ddd7 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -11,959 +11,1656 @@
11 * Nauman Rafique <nauman@google.com> 11 * Nauman Rafique <nauman@google.com>
12 */ 12 */
13#include <linux/ioprio.h> 13#include <linux/ioprio.h>
14#include <linux/seq_file.h>
14#include <linux/kdev_t.h> 15#include <linux/kdev_t.h>
15#include <linux/module.h> 16#include <linux/module.h>
16#include <linux/err.h> 17#include <linux/err.h>
17#include <linux/blkdev.h> 18#include <linux/blkdev.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
19#include <linux/genhd.h>
20#include <linux/delay.h>
21#include <linux/atomic.h>
22#include "blk-cgroup.h" 20#include "blk-cgroup.h"
23#include "blk.h" 21#include <linux/genhd.h>
24 22
25#define MAX_KEY_LEN 100 23#define MAX_KEY_LEN 100
26 24
27static DEFINE_MUTEX(blkcg_pol_mutex); 25static DEFINE_SPINLOCK(blkio_list_lock);
26static LIST_HEAD(blkio_list);
28 27
29struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT }; 28struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
30EXPORT_SYMBOL_GPL(blkcg_root); 29EXPORT_SYMBOL_GPL(blkio_root_cgroup);
31 30
32static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; 31static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
32 struct cgroup *);
33static int blkiocg_can_attach_task(struct cgroup *, struct task_struct *);
34static void blkiocg_attach_task(struct cgroup *, struct task_struct *);
35static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
36static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
33 37
34static bool blkcg_policy_enabled(struct request_queue *q, 38/* for encoding cft->private value on file */
35 const struct blkcg_policy *pol) 39#define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val))
40/* What policy owns the file, proportional or throttle */
41#define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff)
42#define BLKIOFILE_ATTR(val) ((val) & 0xffff)
43
44struct cgroup_subsys blkio_subsys = {
45 .name = "blkio",
46 .create = blkiocg_create,
47 .can_attach_task = blkiocg_can_attach_task,
48 .attach_task = blkiocg_attach_task,
49 .destroy = blkiocg_destroy,
50 .populate = blkiocg_populate,
51#ifdef CONFIG_BLK_CGROUP
52 /* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
53 .subsys_id = blkio_subsys_id,
54#endif
55 .use_id = 1,
56 .module = THIS_MODULE,
57};
58EXPORT_SYMBOL_GPL(blkio_subsys);
59
60static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
61 struct blkio_policy_node *pn)
36{ 62{
37 return pol && test_bit(pol->plid, q->blkcg_pols); 63 list_add(&pn->node, &blkcg->policy_list);
38} 64}
39 65
40/** 66static inline bool cftype_blkg_same_policy(struct cftype *cft,
41 * blkg_free - free a blkg 67 struct blkio_group *blkg)
42 * @blkg: blkg to free
43 *
44 * Free @blkg which may be partially allocated.
45 */
46static void blkg_free(struct blkcg_gq *blkg)
47{ 68{
48 int i; 69 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
49 70
50 if (!blkg) 71 if (blkg->plid == plid)
51 return; 72 return 1;
73
74 return 0;
75}
52 76
53 for (i = 0; i < BLKCG_MAX_POLS; i++) { 77/* Determines if policy node matches cgroup file being accessed */
54 struct blkcg_policy *pol = blkcg_policy[i]; 78static inline bool pn_matches_cftype(struct cftype *cft,
55 struct blkg_policy_data *pd = blkg->pd[i]; 79 struct blkio_policy_node *pn)
80{
81 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
82 int fileid = BLKIOFILE_ATTR(cft->private);
56 83
57 if (!pd) 84 return (plid == pn->plid && fileid == pn->fileid);
58 continue; 85}
59 86
60 if (pol && pol->pd_exit_fn) 87/* Must be called with blkcg->lock held */
61 pol->pd_exit_fn(blkg); 88static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
89{
90 list_del(&pn->node);
91}
62 92
63 kfree(pd); 93/* Must be called with blkcg->lock held */
94static struct blkio_policy_node *
95blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev,
96 enum blkio_policy_id plid, int fileid)
97{
98 struct blkio_policy_node *pn;
99
100 list_for_each_entry(pn, &blkcg->policy_list, node) {
101 if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid)
102 return pn;
64 } 103 }
65 104
66 blk_exit_rl(&blkg->rl); 105 return NULL;
67 kfree(blkg);
68} 106}
69 107
70/** 108struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
71 * blkg_alloc - allocate a blkg
72 * @blkcg: block cgroup the new blkg is associated with
73 * @q: request_queue the new blkg is associated with
74 * @gfp_mask: allocation mask to use
75 *
76 * Allocate a new blkg assocating @blkcg and @q.
77 */
78static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
79 gfp_t gfp_mask)
80{ 109{
81 struct blkcg_gq *blkg; 110 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
82 int i; 111 struct blkio_cgroup, css);
112}
113EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
83 114
84 /* alloc and init base part */ 115struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
85 blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node); 116{
86 if (!blkg) 117 return container_of(task_subsys_state(tsk, blkio_subsys_id),
87 return NULL; 118 struct blkio_cgroup, css);
119}
120EXPORT_SYMBOL_GPL(task_blkio_cgroup);
88 121
89 blkg->q = q; 122static inline void
90 INIT_LIST_HEAD(&blkg->q_node); 123blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
91 blkg->blkcg = blkcg; 124{
92 blkg->refcnt = 1; 125 struct blkio_policy_type *blkiop;
93 126
94 /* root blkg uses @q->root_rl, init rl only for !root blkgs */ 127 list_for_each_entry(blkiop, &blkio_list, list) {
95 if (blkcg != &blkcg_root) { 128 /* If this policy does not own the blkg, do not send updates */
96 if (blk_init_rl(&blkg->rl, q, gfp_mask)) 129 if (blkiop->plid != blkg->plid)
97 goto err_free; 130 continue;
98 blkg->rl.blkg = blkg; 131 if (blkiop->ops.blkio_update_group_weight_fn)
132 blkiop->ops.blkio_update_group_weight_fn(blkg->key,
133 blkg, weight);
99 } 134 }
135}
100 136
101 for (i = 0; i < BLKCG_MAX_POLS; i++) { 137static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
102 struct blkcg_policy *pol = blkcg_policy[i]; 138 int fileid)
103 struct blkg_policy_data *pd; 139{
140 struct blkio_policy_type *blkiop;
104 141
105 if (!blkcg_policy_enabled(q, pol)) 142 list_for_each_entry(blkiop, &blkio_list, list) {
106 continue;
107 143
108 /* alloc per-policy data and attach it to blkg */ 144 /* If this policy does not own the blkg, do not send updates */
109 pd = kzalloc_node(pol->pd_size, gfp_mask, q->node); 145 if (blkiop->plid != blkg->plid)
110 if (!pd) 146 continue;
111 goto err_free;
112 147
113 blkg->pd[i] = pd; 148 if (fileid == BLKIO_THROTL_read_bps_device
114 pd->blkg = blkg; 149 && blkiop->ops.blkio_update_group_read_bps_fn)
150 blkiop->ops.blkio_update_group_read_bps_fn(blkg->key,
151 blkg, bps);
115 152
116 /* invoke per-policy init */ 153 if (fileid == BLKIO_THROTL_write_bps_device
117 if (blkcg_policy_enabled(blkg->q, pol)) 154 && blkiop->ops.blkio_update_group_write_bps_fn)
118 pol->pd_init_fn(blkg); 155 blkiop->ops.blkio_update_group_write_bps_fn(blkg->key,
156 blkg, bps);
119 } 157 }
120
121 return blkg;
122
123err_free:
124 blkg_free(blkg);
125 return NULL;
126} 158}
127 159
128static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, 160static inline void blkio_update_group_iops(struct blkio_group *blkg,
129 struct request_queue *q) 161 unsigned int iops, int fileid)
130{ 162{
131 struct blkcg_gq *blkg; 163 struct blkio_policy_type *blkiop;
132 164
133 blkg = rcu_dereference(blkcg->blkg_hint); 165 list_for_each_entry(blkiop, &blkio_list, list) {
134 if (blkg && blkg->q == q)
135 return blkg;
136 166
137 /* 167 /* If this policy does not own the blkg, do not send updates */
138 * Hint didn't match. Look up from the radix tree. Note that we 168 if (blkiop->plid != blkg->plid)
139 * may not be holding queue_lock and thus are not sure whether 169 continue;
140 * @blkg from blkg_tree has already been removed or not, so we
141 * can't update hint to the lookup result. Leave it to the caller.
142 */
143 blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
144 if (blkg && blkg->q == q)
145 return blkg;
146 170
147 return NULL; 171 if (fileid == BLKIO_THROTL_read_iops_device
172 && blkiop->ops.blkio_update_group_read_iops_fn)
173 blkiop->ops.blkio_update_group_read_iops_fn(blkg->key,
174 blkg, iops);
175
176 if (fileid == BLKIO_THROTL_write_iops_device
177 && blkiop->ops.blkio_update_group_write_iops_fn)
178 blkiop->ops.blkio_update_group_write_iops_fn(blkg->key,
179 blkg,iops);
180 }
148} 181}
149 182
150/** 183/*
151 * blkg_lookup - lookup blkg for the specified blkcg - q pair 184 * Add to the appropriate stat variable depending on the request type.
152 * @blkcg: blkcg of interest 185 * This should be called with the blkg->stats_lock held.
153 * @q: request_queue of interest
154 *
155 * Lookup blkg for the @blkcg - @q pair. This function should be called
156 * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
157 * - see blk_queue_bypass_start() for details.
158 */ 186 */
159struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) 187static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
188 bool sync)
160{ 189{
161 WARN_ON_ONCE(!rcu_read_lock_held()); 190 if (direction)
162 191 stat[BLKIO_STAT_WRITE] += add;
163 if (unlikely(blk_queue_bypass(q))) 192 else
164 return NULL; 193 stat[BLKIO_STAT_READ] += add;
165 return __blkg_lookup(blkcg, q); 194 if (sync)
195 stat[BLKIO_STAT_SYNC] += add;
196 else
197 stat[BLKIO_STAT_ASYNC] += add;
166} 198}
167EXPORT_SYMBOL_GPL(blkg_lookup);
168 199
169/* 200/*
170 * If @new_blkg is %NULL, this function tries to allocate a new one as 201 * Decrements the appropriate stat variable if non-zero depending on the
171 * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return. 202 * request type. Panics on value being zero.
203 * This should be called with the blkg->stats_lock held.
172 */ 204 */
173static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, 205static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
174 struct request_queue *q,
175 struct blkcg_gq *new_blkg)
176{ 206{
177 struct blkcg_gq *blkg; 207 if (direction) {
178 int ret; 208 BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
209 stat[BLKIO_STAT_WRITE]--;
210 } else {
211 BUG_ON(stat[BLKIO_STAT_READ] == 0);
212 stat[BLKIO_STAT_READ]--;
213 }
214 if (sync) {
215 BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
216 stat[BLKIO_STAT_SYNC]--;
217 } else {
218 BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
219 stat[BLKIO_STAT_ASYNC]--;
220 }
221}
179 222
180 WARN_ON_ONCE(!rcu_read_lock_held()); 223#ifdef CONFIG_DEBUG_BLK_CGROUP
181 lockdep_assert_held(q->queue_lock); 224/* This should be called with the blkg->stats_lock held. */
225static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
226 struct blkio_group *curr_blkg)
227{
228 if (blkio_blkg_waiting(&blkg->stats))
229 return;
230 if (blkg == curr_blkg)
231 return;
232 blkg->stats.start_group_wait_time = sched_clock();
233 blkio_mark_blkg_waiting(&blkg->stats);
234}
182 235
183 /* lookup and update hint on success, see __blkg_lookup() for details */ 236/* This should be called with the blkg->stats_lock held. */
184 blkg = __blkg_lookup(blkcg, q); 237static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
185 if (blkg) { 238{
186 rcu_assign_pointer(blkcg->blkg_hint, blkg); 239 unsigned long long now;
187 goto out_free;
188 }
189 240
190 /* blkg holds a reference to blkcg */ 241 if (!blkio_blkg_waiting(stats))
191 if (!css_tryget(&blkcg->css)) { 242 return;
192 blkg = ERR_PTR(-EINVAL);
193 goto out_free;
194 }
195 243
196 /* allocate */ 244 now = sched_clock();
197 if (!new_blkg) { 245 if (time_after64(now, stats->start_group_wait_time))
198 new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); 246 stats->group_wait_time += now - stats->start_group_wait_time;
199 if (unlikely(!new_blkg)) { 247 blkio_clear_blkg_waiting(stats);
200 blkg = ERR_PTR(-ENOMEM); 248}
201 goto out_put;
202 }
203 }
204 blkg = new_blkg;
205 249
206 /* insert */ 250/* This should be called with the blkg->stats_lock held. */
207 spin_lock(&blkcg->lock); 251static void blkio_end_empty_time(struct blkio_group_stats *stats)
208 ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); 252{
209 if (likely(!ret)) { 253 unsigned long long now;
210 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
211 list_add(&blkg->q_node, &q->blkg_list);
212 }
213 spin_unlock(&blkcg->lock);
214 254
215 if (!ret) 255 if (!blkio_blkg_empty(stats))
216 return blkg; 256 return;
217 257
218 blkg = ERR_PTR(ret); 258 now = sched_clock();
219out_put: 259 if (time_after64(now, stats->start_empty_time))
220 css_put(&blkcg->css); 260 stats->empty_time += now - stats->start_empty_time;
221out_free: 261 blkio_clear_blkg_empty(stats);
222 blkg_free(new_blkg);
223 return blkg;
224} 262}
225 263
226struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, 264void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
227 struct request_queue *q)
228{ 265{
229 /* 266 unsigned long flags;
230 * This could be the first entry point of blkcg implementation and 267
231 * we shouldn't allow anything to go through for a bypassing queue. 268 spin_lock_irqsave(&blkg->stats_lock, flags);
232 */ 269 BUG_ON(blkio_blkg_idling(&blkg->stats));
233 if (unlikely(blk_queue_bypass(q))) 270 blkg->stats.start_idle_time = sched_clock();
234 return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY); 271 blkio_mark_blkg_idling(&blkg->stats);
235 return __blkg_lookup_create(blkcg, q, NULL); 272 spin_unlock_irqrestore(&blkg->stats_lock, flags);
236} 273}
237EXPORT_SYMBOL_GPL(blkg_lookup_create); 274EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
238 275
239static void blkg_destroy(struct blkcg_gq *blkg) 276void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
240{ 277{
241 struct blkcg *blkcg = blkg->blkcg; 278 unsigned long flags;
279 unsigned long long now;
280 struct blkio_group_stats *stats;
281
282 spin_lock_irqsave(&blkg->stats_lock, flags);
283 stats = &blkg->stats;
284 if (blkio_blkg_idling(stats)) {
285 now = sched_clock();
286 if (time_after64(now, stats->start_idle_time))
287 stats->idle_time += now - stats->start_idle_time;
288 blkio_clear_blkg_idling(stats);
289 }
290 spin_unlock_irqrestore(&blkg->stats_lock, flags);
291}
292EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
242 293
243 lockdep_assert_held(blkg->q->queue_lock); 294void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
244 lockdep_assert_held(&blkcg->lock); 295{
296 unsigned long flags;
297 struct blkio_group_stats *stats;
298
299 spin_lock_irqsave(&blkg->stats_lock, flags);
300 stats = &blkg->stats;
301 stats->avg_queue_size_sum +=
302 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
303 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
304 stats->avg_queue_size_samples++;
305 blkio_update_group_wait_time(stats);
306 spin_unlock_irqrestore(&blkg->stats_lock, flags);
307}
308EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
245 309
246 /* Something wrong if we are trying to remove same group twice */ 310void blkiocg_set_start_empty_time(struct blkio_group *blkg)
247 WARN_ON_ONCE(list_empty(&blkg->q_node)); 311{
248 WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); 312 unsigned long flags;
313 struct blkio_group_stats *stats;
249 314
250 radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); 315 spin_lock_irqsave(&blkg->stats_lock, flags);
251 list_del_init(&blkg->q_node); 316 stats = &blkg->stats;
252 hlist_del_init_rcu(&blkg->blkcg_node);
253 317
254 /* 318 if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
255 * Both setting lookup hint to and clearing it from @blkg are done 319 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
256 * under queue_lock. If it's not pointing to @blkg now, it never 320 spin_unlock_irqrestore(&blkg->stats_lock, flags);
257 * will. Hint assignment itself can race safely. 321 return;
258 */ 322 }
259 if (rcu_dereference_raw(blkcg->blkg_hint) == blkg)
260 rcu_assign_pointer(blkcg->blkg_hint, NULL);
261 323
262 /* 324 /*
263 * Put the reference taken at the time of creation so that when all 325 * group is already marked empty. This can happen if cfqq got new
264 * queues are gone, group can be destroyed. 326 * request in parent group and moved to this group while being added
327 * to service tree. Just ignore the event and move on.
265 */ 328 */
266 blkg_put(blkg); 329 if(blkio_blkg_empty(stats)) {
330 spin_unlock_irqrestore(&blkg->stats_lock, flags);
331 return;
332 }
333
334 stats->start_empty_time = sched_clock();
335 blkio_mark_blkg_empty(stats);
336 spin_unlock_irqrestore(&blkg->stats_lock, flags);
267} 337}
338EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
268 339
269/** 340void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
270 * blkg_destroy_all - destroy all blkgs associated with a request_queue 341 unsigned long dequeue)
271 * @q: request_queue of interest 342{
272 * 343 blkg->stats.dequeue += dequeue;
273 * Destroy all blkgs associated with @q. 344}
274 */ 345EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
275static void blkg_destroy_all(struct request_queue *q) 346#else
347static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
348 struct blkio_group *curr_blkg) {}
349static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
350#endif
351
352void blkiocg_update_io_add_stats(struct blkio_group *blkg,
353 struct blkio_group *curr_blkg, bool direction,
354 bool sync)
276{ 355{
277 struct blkcg_gq *blkg, *n; 356 unsigned long flags;
357
358 spin_lock_irqsave(&blkg->stats_lock, flags);
359 blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
360 sync);
361 blkio_end_empty_time(&blkg->stats);
362 blkio_set_start_group_wait_time(blkg, curr_blkg);
363 spin_unlock_irqrestore(&blkg->stats_lock, flags);
364}
365EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
278 366
279 lockdep_assert_held(q->queue_lock); 367void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
368 bool direction, bool sync)
369{
370 unsigned long flags;
280 371
281 list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { 372 spin_lock_irqsave(&blkg->stats_lock, flags);
282 struct blkcg *blkcg = blkg->blkcg; 373 blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
374 direction, sync);
375 spin_unlock_irqrestore(&blkg->stats_lock, flags);
376}
377EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
283 378
284 spin_lock(&blkcg->lock); 379void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
285 blkg_destroy(blkg); 380 unsigned long unaccounted_time)
286 spin_unlock(&blkcg->lock); 381{
287 } 382 unsigned long flags;
383
384 spin_lock_irqsave(&blkg->stats_lock, flags);
385 blkg->stats.time += time;
386#ifdef CONFIG_DEBUG_BLK_CGROUP
387 blkg->stats.unaccounted_time += unaccounted_time;
388#endif
389 spin_unlock_irqrestore(&blkg->stats_lock, flags);
390}
391EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
392
393/*
394 * should be called under rcu read lock or queue lock to make sure blkg pointer
395 * is valid.
396 */
397void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
398 uint64_t bytes, bool direction, bool sync)
399{
400 struct blkio_group_stats_cpu *stats_cpu;
401 unsigned long flags;
288 402
289 /* 403 /*
290 * root blkg is destroyed. Just clear the pointer since 404 * Disabling interrupts to provide mutual exclusion between two
291 * root_rl does not take reference on root blkg. 405 * writes on same cpu. It probably is not needed for 64bit. Not
406 * optimizing that case yet.
292 */ 407 */
293 q->root_blkg = NULL; 408 local_irq_save(flags);
294 q->root_rl.blkg = NULL; 409
410 stats_cpu = this_cpu_ptr(blkg->stats_cpu);
411
412 u64_stats_update_begin(&stats_cpu->syncp);
413 stats_cpu->sectors += bytes >> 9;
414 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
415 1, direction, sync);
416 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
417 bytes, direction, sync);
418 u64_stats_update_end(&stats_cpu->syncp);
419 local_irq_restore(flags);
295} 420}
421EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
296 422
297static void blkg_rcu_free(struct rcu_head *rcu_head) 423void blkiocg_update_completion_stats(struct blkio_group *blkg,
424 uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
298{ 425{
299 blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head)); 426 struct blkio_group_stats *stats;
427 unsigned long flags;
428 unsigned long long now = sched_clock();
429
430 spin_lock_irqsave(&blkg->stats_lock, flags);
431 stats = &blkg->stats;
432 if (time_after64(now, io_start_time))
433 blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
434 now - io_start_time, direction, sync);
435 if (time_after64(io_start_time, start_time))
436 blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
437 io_start_time - start_time, direction, sync);
438 spin_unlock_irqrestore(&blkg->stats_lock, flags);
300} 439}
440EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
301 441
302void __blkg_release(struct blkcg_gq *blkg) 442/* Merged stats are per cpu. */
443void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
444 bool sync)
303{ 445{
304 /* release the extra blkcg reference this blkg has been holding */ 446 struct blkio_group_stats_cpu *stats_cpu;
305 css_put(&blkg->blkcg->css); 447 unsigned long flags;
306 448
307 /* 449 /*
308 * A group is freed in rcu manner. But having an rcu lock does not 450 * Disabling interrupts to provide mutual exclusion between two
309 * mean that one can access all the fields of blkg and assume these 451 * writes on same cpu. It probably is not needed for 64bit. Not
310 * are valid. For example, don't try to follow throtl_data and 452 * optimizing that case yet.
311 * request queue links.
312 *
313 * Having a reference to blkg under an rcu allows acess to only
314 * values local to groups like group stats and group rate limits
315 */ 453 */
316 call_rcu(&blkg->rcu_head, blkg_rcu_free); 454 local_irq_save(flags);
455
456 stats_cpu = this_cpu_ptr(blkg->stats_cpu);
457
458 u64_stats_update_begin(&stats_cpu->syncp);
459 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1,
460 direction, sync);
461 u64_stats_update_end(&stats_cpu->syncp);
462 local_irq_restore(flags);
317} 463}
318EXPORT_SYMBOL_GPL(__blkg_release); 464EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
319 465
320/* 466/*
321 * The next function used by blk_queue_for_each_rl(). It's a bit tricky 467 * This function allocates the per cpu stats for blkio_group. Should be called
322 * because the root blkg uses @q->root_rl instead of its own rl. 468 * from sleepable context as alloc_per_cpu() requires that.
323 */ 469 */
324struct request_list *__blk_queue_next_rl(struct request_list *rl, 470int blkio_alloc_blkg_stats(struct blkio_group *blkg)
325 struct request_queue *q)
326{ 471{
327 struct list_head *ent; 472 /* Allocate memory for per cpu stats */
328 struct blkcg_gq *blkg; 473 blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
474 if (!blkg->stats_cpu)
475 return -ENOMEM;
476 return 0;
477}
478EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats);
329 479
330 /* 480void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
331 * Determine the current blkg list_head. The first entry is 481 struct blkio_group *blkg, void *key, dev_t dev,
332 * root_rl which is off @q->blkg_list and mapped to the head. 482 enum blkio_policy_id plid)
333 */ 483{
334 if (rl == &q->root_rl) { 484 unsigned long flags;
335 ent = &q->blkg_list; 485
336 /* There are no more block groups, hence no request lists */ 486 spin_lock_irqsave(&blkcg->lock, flags);
337 if (list_empty(ent)) 487 spin_lock_init(&blkg->stats_lock);
338 return NULL; 488 rcu_assign_pointer(blkg->key, key);
339 } else { 489 blkg->blkcg_id = css_id(&blkcg->css);
340 blkg = container_of(rl, struct blkcg_gq, rl); 490 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
341 ent = &blkg->q_node; 491 blkg->plid = plid;
342 } 492 spin_unlock_irqrestore(&blkcg->lock, flags);
493 /* Need to take css reference ? */
494 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
495 blkg->dev = dev;
496}
497EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
498
499static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
500{
501 hlist_del_init_rcu(&blkg->blkcg_node);
502 blkg->blkcg_id = 0;
503}
343 504
344 /* walk to the next list_head, skip root blkcg */ 505/*
345 ent = ent->next; 506 * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
346 if (ent == &q->root_blkg->q_node) 507 * indicating that blk_group was unhashed by the time we got to it.
347 ent = ent->next; 508 */
348 if (ent == &q->blkg_list) 509int blkiocg_del_blkio_group(struct blkio_group *blkg)
349 return NULL; 510{
511 struct blkio_cgroup *blkcg;
512 unsigned long flags;
513 struct cgroup_subsys_state *css;
514 int ret = 1;
515
516 rcu_read_lock();
517 css = css_lookup(&blkio_subsys, blkg->blkcg_id);
518 if (css) {
519 blkcg = container_of(css, struct blkio_cgroup, css);
520 spin_lock_irqsave(&blkcg->lock, flags);
521 if (!hlist_unhashed(&blkg->blkcg_node)) {
522 __blkiocg_del_blkio_group(blkg);
523 ret = 0;
524 }
525 spin_unlock_irqrestore(&blkcg->lock, flags);
526 }
350 527
351 blkg = container_of(ent, struct blkcg_gq, q_node); 528 rcu_read_unlock();
352 return &blkg->rl; 529 return ret;
353} 530}
531EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
354 532
355static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, 533/* called under rcu_read_lock(). */
356 u64 val) 534struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
357{ 535{
358 struct blkcg *blkcg = cgroup_to_blkcg(cgroup); 536 struct blkio_group *blkg;
359 struct blkcg_gq *blkg;
360 struct hlist_node *n; 537 struct hlist_node *n;
361 int i; 538 void *__key;
362 539
363 mutex_lock(&blkcg_pol_mutex); 540 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
364 spin_lock_irq(&blkcg->lock); 541 __key = blkg->key;
542 if (__key == key)
543 return blkg;
544 }
545
546 return NULL;
547}
548EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
365 549
550static void blkio_reset_stats_cpu(struct blkio_group *blkg)
551{
552 struct blkio_group_stats_cpu *stats_cpu;
553 int i, j, k;
366 /* 554 /*
367 * Note that stat reset is racy - it doesn't synchronize against 555 * Note: On 64 bit arch this should not be an issue. This has the
368 * stat updates. This is a debug feature which shouldn't exist 556 * possibility of returning some inconsistent value on 32bit arch
369 * anyway. If you get hit by a race, retry. 557 * as 64bit update on 32bit is non atomic. Taking care of this
558 * corner case makes code very complicated, like sending IPIs to
559 * cpus, taking care of stats of offline cpus etc.
560 *
561 * reset stats is anyway more of a debug feature and this sounds a
562 * corner case. So I am not complicating the code yet until and
563 * unless this becomes a real issue.
370 */ 564 */
371 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { 565 for_each_possible_cpu(i) {
372 for (i = 0; i < BLKCG_MAX_POLS; i++) { 566 stats_cpu = per_cpu_ptr(blkg->stats_cpu, i);
373 struct blkcg_policy *pol = blkcg_policy[i]; 567 stats_cpu->sectors = 0;
568 for(j = 0; j < BLKIO_STAT_CPU_NR; j++)
569 for (k = 0; k < BLKIO_STAT_TOTAL; k++)
570 stats_cpu->stat_arr_cpu[j][k] = 0;
571 }
572}
374 573
375 if (blkcg_policy_enabled(blkg->q, pol) && 574static int
376 pol->pd_reset_stats_fn) 575blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
377 pol->pd_reset_stats_fn(blkg); 576{
577 struct blkio_cgroup *blkcg;
578 struct blkio_group *blkg;
579 struct blkio_group_stats *stats;
580 struct hlist_node *n;
581 uint64_t queued[BLKIO_STAT_TOTAL];
582 int i;
583#ifdef CONFIG_DEBUG_BLK_CGROUP
584 bool idling, waiting, empty;
585 unsigned long long now = sched_clock();
586#endif
587
588 blkcg = cgroup_to_blkio_cgroup(cgroup);
589 spin_lock_irq(&blkcg->lock);
590 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
591 spin_lock(&blkg->stats_lock);
592 stats = &blkg->stats;
593#ifdef CONFIG_DEBUG_BLK_CGROUP
594 idling = blkio_blkg_idling(stats);
595 waiting = blkio_blkg_waiting(stats);
596 empty = blkio_blkg_empty(stats);
597#endif
598 for (i = 0; i < BLKIO_STAT_TOTAL; i++)
599 queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
600 memset(stats, 0, sizeof(struct blkio_group_stats));
601 for (i = 0; i < BLKIO_STAT_TOTAL; i++)
602 stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
603#ifdef CONFIG_DEBUG_BLK_CGROUP
604 if (idling) {
605 blkio_mark_blkg_idling(stats);
606 stats->start_idle_time = now;
607 }
608 if (waiting) {
609 blkio_mark_blkg_waiting(stats);
610 stats->start_group_wait_time = now;
378 } 611 }
612 if (empty) {
613 blkio_mark_blkg_empty(stats);
614 stats->start_empty_time = now;
615 }
616#endif
617 spin_unlock(&blkg->stats_lock);
618
619 /* Reset Per cpu stats which don't take blkg->stats_lock */
620 blkio_reset_stats_cpu(blkg);
379 } 621 }
380 622
381 spin_unlock_irq(&blkcg->lock); 623 spin_unlock_irq(&blkcg->lock);
382 mutex_unlock(&blkcg_pol_mutex);
383 return 0; 624 return 0;
384} 625}
385 626
386static const char *blkg_dev_name(struct blkcg_gq *blkg) 627static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
628 int chars_left, bool diskname_only)
387{ 629{
388 /* some drivers (floppy) instantiate a queue w/o disk registered */ 630 snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
389 if (blkg->q->backing_dev_info.dev) 631 chars_left -= strlen(str);
390 return dev_name(blkg->q->backing_dev_info.dev); 632 if (chars_left <= 0) {
391 return NULL; 633 printk(KERN_WARNING
634 "Possibly incorrect cgroup stat display format");
635 return;
636 }
637 if (diskname_only)
638 return;
639 switch (type) {
640 case BLKIO_STAT_READ:
641 strlcat(str, " Read", chars_left);
642 break;
643 case BLKIO_STAT_WRITE:
644 strlcat(str, " Write", chars_left);
645 break;
646 case BLKIO_STAT_SYNC:
647 strlcat(str, " Sync", chars_left);
648 break;
649 case BLKIO_STAT_ASYNC:
650 strlcat(str, " Async", chars_left);
651 break;
652 case BLKIO_STAT_TOTAL:
653 strlcat(str, " Total", chars_left);
654 break;
655 default:
656 strlcat(str, " Invalid", chars_left);
657 }
392} 658}
393 659
394/** 660static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
395 * blkcg_print_blkgs - helper for printing per-blkg data 661 struct cgroup_map_cb *cb, dev_t dev)
396 * @sf: seq_file to print to
397 * @blkcg: blkcg of interest
398 * @prfill: fill function to print out a blkg
399 * @pol: policy in question
400 * @data: data to be passed to @prfill
401 * @show_total: to print out sum of prfill return values or not
402 *
403 * This function invokes @prfill on each blkg of @blkcg if pd for the
404 * policy specified by @pol exists. @prfill is invoked with @sf, the
405 * policy data and @data. If @show_total is %true, the sum of the return
406 * values from @prfill is printed with "Total" label at the end.
407 *
408 * This is to be used to construct print functions for
409 * cftype->read_seq_string method.
410 */
411void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
412 u64 (*prfill)(struct seq_file *,
413 struct blkg_policy_data *, int),
414 const struct blkcg_policy *pol, int data,
415 bool show_total)
416{ 662{
417 struct blkcg_gq *blkg; 663 blkio_get_key_name(0, dev, str, chars_left, true);
418 struct hlist_node *n; 664 cb->fill(cb, str, val);
419 u64 total = 0; 665 return val;
666}
420 667
421 spin_lock_irq(&blkcg->lock);
422 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
423 if (blkcg_policy_enabled(blkg->q, pol))
424 total += prfill(sf, blkg->pd[pol->plid], data);
425 spin_unlock_irq(&blkcg->lock);
426 668
427 if (show_total) 669static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg,
428 seq_printf(sf, "Total %llu\n", (unsigned long long)total); 670 enum stat_type_cpu type, enum stat_sub_type sub_type)
671{
672 int cpu;
673 struct blkio_group_stats_cpu *stats_cpu;
674 u64 val = 0, tval;
675
676 for_each_possible_cpu(cpu) {
677 unsigned int start;
678 stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu);
679
680 do {
681 start = u64_stats_fetch_begin(&stats_cpu->syncp);
682 if (type == BLKIO_STAT_CPU_SECTORS)
683 tval = stats_cpu->sectors;
684 else
685 tval = stats_cpu->stat_arr_cpu[type][sub_type];
686 } while(u64_stats_fetch_retry(&stats_cpu->syncp, start));
687
688 val += tval;
689 }
690
691 return val;
429} 692}
430EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
431 693
432/** 694static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg,
433 * __blkg_prfill_u64 - prfill helper for a single u64 value 695 struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type)
434 * @sf: seq_file to print to
435 * @pd: policy private data of interest
436 * @v: value to print
437 *
438 * Print @v to @sf for the device assocaited with @pd.
439 */
440u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
441{ 696{
442 const char *dname = blkg_dev_name(pd->blkg); 697 uint64_t disk_total, val;
698 char key_str[MAX_KEY_LEN];
699 enum stat_sub_type sub_type;
443 700
444 if (!dname) 701 if (type == BLKIO_STAT_CPU_SECTORS) {
445 return 0; 702 val = blkio_read_stat_cpu(blkg, type, 0);
703 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev);
704 }
446 705
447 seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v); 706 for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
448 return v; 707 sub_type++) {
708 blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
709 val = blkio_read_stat_cpu(blkg, type, sub_type);
710 cb->fill(cb, key_str, val);
711 }
712
713 disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) +
714 blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE);
715
716 blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
717 cb->fill(cb, key_str, disk_total);
718 return disk_total;
449} 719}
450EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
451 720
452/** 721/* This should be called with blkg->stats_lock held */
453 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat 722static uint64_t blkio_get_stat(struct blkio_group *blkg,
454 * @sf: seq_file to print to 723 struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
455 * @pd: policy private data of interest 724{
456 * @rwstat: rwstat to print 725 uint64_t disk_total;
457 * 726 char key_str[MAX_KEY_LEN];
458 * Print @rwstat to @sf for the device assocaited with @pd. 727 enum stat_sub_type sub_type;
459 */ 728
460u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, 729 if (type == BLKIO_STAT_TIME)
461 const struct blkg_rwstat *rwstat) 730 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
462{ 731 blkg->stats.time, cb, dev);
463 static const char *rwstr[] = { 732#ifdef CONFIG_DEBUG_BLK_CGROUP
464 [BLKG_RWSTAT_READ] = "Read", 733 if (type == BLKIO_STAT_UNACCOUNTED_TIME)
465 [BLKG_RWSTAT_WRITE] = "Write", 734 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
466 [BLKG_RWSTAT_SYNC] = "Sync", 735 blkg->stats.unaccounted_time, cb, dev);
467 [BLKG_RWSTAT_ASYNC] = "Async", 736 if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
468 }; 737 uint64_t sum = blkg->stats.avg_queue_size_sum;
469 const char *dname = blkg_dev_name(pd->blkg); 738 uint64_t samples = blkg->stats.avg_queue_size_samples;
470 u64 v; 739 if (samples)
471 int i; 740 do_div(sum, samples);
741 else
742 sum = 0;
743 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
744 }
745 if (type == BLKIO_STAT_GROUP_WAIT_TIME)
746 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
747 blkg->stats.group_wait_time, cb, dev);
748 if (type == BLKIO_STAT_IDLE_TIME)
749 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
750 blkg->stats.idle_time, cb, dev);
751 if (type == BLKIO_STAT_EMPTY_TIME)
752 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
753 blkg->stats.empty_time, cb, dev);
754 if (type == BLKIO_STAT_DEQUEUE)
755 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
756 blkg->stats.dequeue, cb, dev);
757#endif
758
759 for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
760 sub_type++) {
761 blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
762 cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
763 }
764 disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
765 blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
766 blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
767 cb->fill(cb, key_str, disk_total);
768 return disk_total;
769}
472 770
473 if (!dname) 771static int blkio_check_dev_num(dev_t dev)
474 return 0; 772{
773 int part = 0;
774 struct gendisk *disk;
475 775
476 for (i = 0; i < BLKG_RWSTAT_NR; i++) 776 disk = get_gendisk(dev, &part);
477 seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], 777 if (!disk || part)
478 (unsigned long long)rwstat->cnt[i]); 778 return -ENODEV;
479 779
480 v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE]; 780 return 0;
481 seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
482 return v;
483} 781}
484 782
485/** 783static int blkio_policy_parse_and_set(char *buf,
486 * blkg_prfill_stat - prfill callback for blkg_stat 784 struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
487 * @sf: seq_file to print to
488 * @pd: policy private data of interest
489 * @off: offset to the blkg_stat in @pd
490 *
491 * prfill callback for printing a blkg_stat.
492 */
493u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
494{ 785{
495 return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off)); 786 char *s[4], *p, *major_s = NULL, *minor_s = NULL;
787 int ret;
788 unsigned long major, minor;
789 int i = 0;
790 dev_t dev;
791 u64 temp;
792
793 memset(s, 0, sizeof(s));
794
795 while ((p = strsep(&buf, " ")) != NULL) {
796 if (!*p)
797 continue;
798
799 s[i++] = p;
800
801 /* Prevent from inputing too many things */
802 if (i == 3)
803 break;
804 }
805
806 if (i != 2)
807 return -EINVAL;
808
809 p = strsep(&s[0], ":");
810 if (p != NULL)
811 major_s = p;
812 else
813 return -EINVAL;
814
815 minor_s = s[0];
816 if (!minor_s)
817 return -EINVAL;
818
819 ret = strict_strtoul(major_s, 10, &major);
820 if (ret)
821 return -EINVAL;
822
823 ret = strict_strtoul(minor_s, 10, &minor);
824 if (ret)
825 return -EINVAL;
826
827 dev = MKDEV(major, minor);
828
829 ret = strict_strtoull(s[1], 10, &temp);
830 if (ret)
831 return -EINVAL;
832
833 /* For rule removal, do not check for device presence. */
834 if (temp) {
835 ret = blkio_check_dev_num(dev);
836 if (ret)
837 return ret;
838 }
839
840 newpn->dev = dev;
841
842 switch (plid) {
843 case BLKIO_POLICY_PROP:
844 if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
845 temp > BLKIO_WEIGHT_MAX)
846 return -EINVAL;
847
848 newpn->plid = plid;
849 newpn->fileid = fileid;
850 newpn->val.weight = temp;
851 break;
852 case BLKIO_POLICY_THROTL:
853 switch(fileid) {
854 case BLKIO_THROTL_read_bps_device:
855 case BLKIO_THROTL_write_bps_device:
856 newpn->plid = plid;
857 newpn->fileid = fileid;
858 newpn->val.bps = temp;
859 break;
860 case BLKIO_THROTL_read_iops_device:
861 case BLKIO_THROTL_write_iops_device:
862 if (temp > THROTL_IOPS_MAX)
863 return -EINVAL;
864
865 newpn->plid = plid;
866 newpn->fileid = fileid;
867 newpn->val.iops = (unsigned int)temp;
868 break;
869 }
870 break;
871 default:
872 BUG();
873 }
874
875 return 0;
496} 876}
497EXPORT_SYMBOL_GPL(blkg_prfill_stat);
498 877
499/** 878unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
500 * blkg_prfill_rwstat - prfill callback for blkg_rwstat 879 dev_t dev)
501 * @sf: seq_file to print to
502 * @pd: policy private data of interest
503 * @off: offset to the blkg_rwstat in @pd
504 *
505 * prfill callback for printing a blkg_rwstat.
506 */
507u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
508 int off)
509{ 880{
510 struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off); 881 struct blkio_policy_node *pn;
511 882
512 return __blkg_prfill_rwstat(sf, pd, &rwstat); 883 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
884 BLKIO_PROP_weight_device);
885 if (pn)
886 return pn->val.weight;
887 else
888 return blkcg->weight;
513} 889}
514EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); 890EXPORT_SYMBOL_GPL(blkcg_get_weight);
515 891
516/** 892uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
517 * blkg_conf_prep - parse and prepare for per-blkg config update
518 * @blkcg: target block cgroup
519 * @pol: target policy
520 * @input: input string
521 * @ctx: blkg_conf_ctx to be filled
522 *
523 * Parse per-blkg config update from @input and initialize @ctx with the
524 * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new
525 * value. This function returns with RCU read lock and queue lock held and
526 * must be paired with blkg_conf_finish().
527 */
528int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
529 const char *input, struct blkg_conf_ctx *ctx)
530 __acquires(rcu) __acquires(disk->queue->queue_lock)
531{ 893{
532 struct gendisk *disk; 894 struct blkio_policy_node *pn;
533 struct blkcg_gq *blkg;
534 unsigned int major, minor;
535 unsigned long long v;
536 int part, ret;
537 895
538 if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3) 896 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
539 return -EINVAL; 897 BLKIO_THROTL_read_bps_device);
898 if (pn)
899 return pn->val.bps;
900 else
901 return -1;
902}
540 903
541 disk = get_gendisk(MKDEV(major, minor), &part); 904uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
542 if (!disk || part) 905{
543 return -EINVAL; 906 struct blkio_policy_node *pn;
907 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
908 BLKIO_THROTL_write_bps_device);
909 if (pn)
910 return pn->val.bps;
911 else
912 return -1;
913}
544 914
545 rcu_read_lock(); 915unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev)
546 spin_lock_irq(disk->queue->queue_lock); 916{
917 struct blkio_policy_node *pn;
547 918
548 if (blkcg_policy_enabled(disk->queue, pol)) 919 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
549 blkg = blkg_lookup_create(blkcg, disk->queue); 920 BLKIO_THROTL_read_iops_device);
921 if (pn)
922 return pn->val.iops;
550 else 923 else
551 blkg = ERR_PTR(-EINVAL); 924 return -1;
925}
552 926
553 if (IS_ERR(blkg)) { 927unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev)
554 ret = PTR_ERR(blkg); 928{
555 rcu_read_unlock(); 929 struct blkio_policy_node *pn;
556 spin_unlock_irq(disk->queue->queue_lock); 930 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
557 put_disk(disk); 931 BLKIO_THROTL_write_iops_device);
558 /* 932 if (pn)
559 * If queue was bypassing, we should retry. Do so after a 933 return pn->val.iops;
560 * short msleep(). It isn't strictly necessary but queue 934 else
561 * can be bypassing for some time and it's always nice to 935 return -1;
562 * avoid busy looping. 936}
563 */ 937
564 if (ret == -EBUSY) { 938/* Checks whether user asked for deleting a policy rule */
565 msleep(10); 939static bool blkio_delete_rule_command(struct blkio_policy_node *pn)
566 ret = restart_syscall(); 940{
941 switch(pn->plid) {
942 case BLKIO_POLICY_PROP:
943 if (pn->val.weight == 0)
944 return 1;
945 break;
946 case BLKIO_POLICY_THROTL:
947 switch(pn->fileid) {
948 case BLKIO_THROTL_read_bps_device:
949 case BLKIO_THROTL_write_bps_device:
950 if (pn->val.bps == 0)
951 return 1;
952 break;
953 case BLKIO_THROTL_read_iops_device:
954 case BLKIO_THROTL_write_iops_device:
955 if (pn->val.iops == 0)
956 return 1;
567 } 957 }
568 return ret; 958 break;
959 default:
960 BUG();
569 } 961 }
570 962
571 ctx->disk = disk;
572 ctx->blkg = blkg;
573 ctx->v = v;
574 return 0; 963 return 0;
575} 964}
576EXPORT_SYMBOL_GPL(blkg_conf_prep);
577 965
578/** 966static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
579 * blkg_conf_finish - finish up per-blkg config update 967 struct blkio_policy_node *newpn)
580 * @ctx: blkg_conf_ctx intiailized by blkg_conf_prep()
581 *
582 * Finish up after per-blkg config update. This function must be paired
583 * with blkg_conf_prep().
584 */
585void blkg_conf_finish(struct blkg_conf_ctx *ctx)
586 __releases(ctx->disk->queue->queue_lock) __releases(rcu)
587{ 968{
588 spin_unlock_irq(ctx->disk->queue->queue_lock); 969 switch(oldpn->plid) {
589 rcu_read_unlock(); 970 case BLKIO_POLICY_PROP:
590 put_disk(ctx->disk); 971 oldpn->val.weight = newpn->val.weight;
972 break;
973 case BLKIO_POLICY_THROTL:
974 switch(newpn->fileid) {
975 case BLKIO_THROTL_read_bps_device:
976 case BLKIO_THROTL_write_bps_device:
977 oldpn->val.bps = newpn->val.bps;
978 break;
979 case BLKIO_THROTL_read_iops_device:
980 case BLKIO_THROTL_write_iops_device:
981 oldpn->val.iops = newpn->val.iops;
982 }
983 break;
984 default:
985 BUG();
986 }
591} 987}
592EXPORT_SYMBOL_GPL(blkg_conf_finish);
593 988
594struct cftype blkcg_files[] = { 989/*
595 { 990 * Some rules/values in blkg have changed. Propagate those to respective
596 .name = "reset_stats", 991 * policies.
597 .write_u64 = blkcg_reset_stats, 992 */
598 }, 993static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
599 { } /* terminate */ 994 struct blkio_group *blkg, struct blkio_policy_node *pn)
600}; 995{
996 unsigned int weight, iops;
997 u64 bps;
998
999 switch(pn->plid) {
1000 case BLKIO_POLICY_PROP:
1001 weight = pn->val.weight ? pn->val.weight :
1002 blkcg->weight;
1003 blkio_update_group_weight(blkg, weight);
1004 break;
1005 case BLKIO_POLICY_THROTL:
1006 switch(pn->fileid) {
1007 case BLKIO_THROTL_read_bps_device:
1008 case BLKIO_THROTL_write_bps_device:
1009 bps = pn->val.bps ? pn->val.bps : (-1);
1010 blkio_update_group_bps(blkg, bps, pn->fileid);
1011 break;
1012 case BLKIO_THROTL_read_iops_device:
1013 case BLKIO_THROTL_write_iops_device:
1014 iops = pn->val.iops ? pn->val.iops : (-1);
1015 blkio_update_group_iops(blkg, iops, pn->fileid);
1016 break;
1017 }
1018 break;
1019 default:
1020 BUG();
1021 }
1022}
601 1023
602/** 1024/*
603 * blkcg_css_offline - cgroup css_offline callback 1025 * A policy node rule has been updated. Propagate this update to all the
604 * @cgroup: cgroup of interest 1026 * block groups which might be affected by this update.
605 *
606 * This function is called when @cgroup is about to go away and responsible
607 * for shooting down all blkgs associated with @cgroup. blkgs should be
608 * removed while holding both q and blkcg locks. As blkcg lock is nested
609 * inside q lock, this function performs reverse double lock dancing.
610 *
611 * This is the blkcg counterpart of ioc_release_fn().
612 */ 1027 */
613static void blkcg_css_offline(struct cgroup *cgroup) 1028static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg,
1029 struct blkio_policy_node *pn)
614{ 1030{
615 struct blkcg *blkcg = cgroup_to_blkcg(cgroup); 1031 struct blkio_group *blkg;
1032 struct hlist_node *n;
616 1033
1034 spin_lock(&blkio_list_lock);
617 spin_lock_irq(&blkcg->lock); 1035 spin_lock_irq(&blkcg->lock);
618 1036
619 while (!hlist_empty(&blkcg->blkg_list)) { 1037 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
620 struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first, 1038 if (pn->dev != blkg->dev || pn->plid != blkg->plid)
621 struct blkcg_gq, blkcg_node); 1039 continue;
622 struct request_queue *q = blkg->q; 1040 blkio_update_blkg_policy(blkcg, blkg, pn);
623
624 if (spin_trylock(q->queue_lock)) {
625 blkg_destroy(blkg);
626 spin_unlock(q->queue_lock);
627 } else {
628 spin_unlock_irq(&blkcg->lock);
629 cpu_relax();
630 spin_lock_irq(&blkcg->lock);
631 }
632 } 1041 }
633 1042
634 spin_unlock_irq(&blkcg->lock); 1043 spin_unlock_irq(&blkcg->lock);
1044 spin_unlock(&blkio_list_lock);
635} 1045}
636 1046
637static void blkcg_css_free(struct cgroup *cgroup) 1047static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
1048 const char *buffer)
638{ 1049{
639 struct blkcg *blkcg = cgroup_to_blkcg(cgroup); 1050 int ret = 0;
1051 char *buf;
1052 struct blkio_policy_node *newpn, *pn;
1053 struct blkio_cgroup *blkcg;
1054 int keep_newpn = 0;
1055 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1056 int fileid = BLKIOFILE_ATTR(cft->private);
1057
1058 buf = kstrdup(buffer, GFP_KERNEL);
1059 if (!buf)
1060 return -ENOMEM;
640 1061
641 if (blkcg != &blkcg_root) 1062 newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
642 kfree(blkcg); 1063 if (!newpn) {
643} 1064 ret = -ENOMEM;
1065 goto free_buf;
1066 }
644 1067
645static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup) 1068 ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid);
646{ 1069 if (ret)
647 static atomic64_t id_seq = ATOMIC64_INIT(0); 1070 goto free_newpn;
648 struct blkcg *blkcg;
649 struct cgroup *parent = cgroup->parent;
650 1071
651 if (!parent) { 1072 blkcg = cgroup_to_blkio_cgroup(cgrp);
652 blkcg = &blkcg_root; 1073
653 goto done; 1074 spin_lock_irq(&blkcg->lock);
1075
1076 pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid);
1077 if (!pn) {
1078 if (!blkio_delete_rule_command(newpn)) {
1079 blkio_policy_insert_node(blkcg, newpn);
1080 keep_newpn = 1;
1081 }
1082 spin_unlock_irq(&blkcg->lock);
1083 goto update_io_group;
654 } 1084 }
655 1085
656 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); 1086 if (blkio_delete_rule_command(newpn)) {
657 if (!blkcg) 1087 blkio_policy_delete_node(pn);
658 return ERR_PTR(-ENOMEM); 1088 spin_unlock_irq(&blkcg->lock);
1089 goto update_io_group;
1090 }
1091 spin_unlock_irq(&blkcg->lock);
659 1092
660 blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT; 1093 blkio_update_policy_rule(pn, newpn);
661 blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
662done:
663 spin_lock_init(&blkcg->lock);
664 INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
665 INIT_HLIST_HEAD(&blkcg->blkg_list);
666 1094
667 return &blkcg->css; 1095update_io_group:
1096 blkio_update_policy_node_blkg(blkcg, newpn);
1097
1098free_newpn:
1099 if (!keep_newpn)
1100 kfree(newpn);
1101free_buf:
1102 kfree(buf);
1103 return ret;
668} 1104}
669 1105
670/** 1106static void
671 * blkcg_init_queue - initialize blkcg part of request queue 1107blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn)
672 * @q: request_queue to initialize
673 *
674 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
675 * part of new request_queue @q.
676 *
677 * RETURNS:
678 * 0 on success, -errno on failure.
679 */
680int blkcg_init_queue(struct request_queue *q)
681{ 1108{
682 might_sleep(); 1109 switch(pn->plid) {
683 1110 case BLKIO_POLICY_PROP:
684 return blk_throtl_init(q); 1111 if (pn->fileid == BLKIO_PROP_weight_device)
1112 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
1113 MINOR(pn->dev), pn->val.weight);
1114 break;
1115 case BLKIO_POLICY_THROTL:
1116 switch(pn->fileid) {
1117 case BLKIO_THROTL_read_bps_device:
1118 case BLKIO_THROTL_write_bps_device:
1119 seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
1120 MINOR(pn->dev), pn->val.bps);
1121 break;
1122 case BLKIO_THROTL_read_iops_device:
1123 case BLKIO_THROTL_write_iops_device:
1124 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
1125 MINOR(pn->dev), pn->val.iops);
1126 break;
1127 }
1128 break;
1129 default:
1130 BUG();
1131 }
685} 1132}
686 1133
687/** 1134/* cgroup files which read their data from policy nodes end up here */
688 * blkcg_drain_queue - drain blkcg part of request_queue 1135static void blkio_read_policy_node_files(struct cftype *cft,
689 * @q: request_queue to drain 1136 struct blkio_cgroup *blkcg, struct seq_file *m)
690 *
691 * Called from blk_drain_queue(). Responsible for draining blkcg part.
692 */
693void blkcg_drain_queue(struct request_queue *q)
694{ 1137{
695 lockdep_assert_held(q->queue_lock); 1138 struct blkio_policy_node *pn;
696 1139
697 blk_throtl_drain(q); 1140 if (!list_empty(&blkcg->policy_list)) {
1141 spin_lock_irq(&blkcg->lock);
1142 list_for_each_entry(pn, &blkcg->policy_list, node) {
1143 if (!pn_matches_cftype(cft, pn))
1144 continue;
1145 blkio_print_policy_node(m, pn);
1146 }
1147 spin_unlock_irq(&blkcg->lock);
1148 }
698} 1149}
699 1150
700/** 1151static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
701 * blkcg_exit_queue - exit and release blkcg part of request_queue 1152 struct seq_file *m)
702 * @q: request_queue being released
703 *
704 * Called from blk_release_queue(). Responsible for exiting blkcg part.
705 */
706void blkcg_exit_queue(struct request_queue *q)
707{ 1153{
708 spin_lock_irq(q->queue_lock); 1154 struct blkio_cgroup *blkcg;
709 blkg_destroy_all(q); 1155 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
710 spin_unlock_irq(q->queue_lock); 1156 int name = BLKIOFILE_ATTR(cft->private);
1157
1158 blkcg = cgroup_to_blkio_cgroup(cgrp);
1159
1160 switch(plid) {
1161 case BLKIO_POLICY_PROP:
1162 switch(name) {
1163 case BLKIO_PROP_weight_device:
1164 blkio_read_policy_node_files(cft, blkcg, m);
1165 return 0;
1166 default:
1167 BUG();
1168 }
1169 break;
1170 case BLKIO_POLICY_THROTL:
1171 switch(name){
1172 case BLKIO_THROTL_read_bps_device:
1173 case BLKIO_THROTL_write_bps_device:
1174 case BLKIO_THROTL_read_iops_device:
1175 case BLKIO_THROTL_write_iops_device:
1176 blkio_read_policy_node_files(cft, blkcg, m);
1177 return 0;
1178 default:
1179 BUG();
1180 }
1181 break;
1182 default:
1183 BUG();
1184 }
711 1185
712 blk_throtl_exit(q); 1186 return 0;
713} 1187}
714 1188
715/* 1189static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
716 * We cannot support shared io contexts, as we have no means to support 1190 struct cftype *cft, struct cgroup_map_cb *cb,
717 * two tasks with the same ioc in two different groups without major rework 1191 enum stat_type type, bool show_total, bool pcpu)
718 * of the main cic data structures. For now we allow a task to change
719 * its cgroup only if it's the only owner of its ioc.
720 */
721static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
722{ 1192{
723 struct task_struct *task; 1193 struct blkio_group *blkg;
724 struct io_context *ioc; 1194 struct hlist_node *n;
725 int ret = 0; 1195 uint64_t cgroup_total = 0;
726 1196
727 /* task_lock() is needed to avoid races with exit_io_context() */ 1197 rcu_read_lock();
728 cgroup_taskset_for_each(task, cgrp, tset) { 1198 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
729 task_lock(task); 1199 if (blkg->dev) {
730 ioc = task->io_context; 1200 if (!cftype_blkg_same_policy(cft, blkg))
731 if (ioc && atomic_read(&ioc->nr_tasks) > 1) 1201 continue;
732 ret = -EINVAL; 1202 if (pcpu)
733 task_unlock(task); 1203 cgroup_total += blkio_get_stat_cpu(blkg, cb,
734 if (ret) 1204 blkg->dev, type);
735 break; 1205 else {
1206 spin_lock_irq(&blkg->stats_lock);
1207 cgroup_total += blkio_get_stat(blkg, cb,
1208 blkg->dev, type);
1209 spin_unlock_irq(&blkg->stats_lock);
1210 }
1211 }
736 } 1212 }
737 return ret; 1213 if (show_total)
1214 cb->fill(cb, "Total", cgroup_total);
1215 rcu_read_unlock();
1216 return 0;
738} 1217}
739 1218
740struct cgroup_subsys blkio_subsys = { 1219/* All map kind of cgroup file get serviced by this function */
741 .name = "blkio", 1220static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
742 .css_alloc = blkcg_css_alloc, 1221 struct cgroup_map_cb *cb)
743 .css_offline = blkcg_css_offline, 1222{
744 .css_free = blkcg_css_free, 1223 struct blkio_cgroup *blkcg;
745 .can_attach = blkcg_can_attach, 1224 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
746 .subsys_id = blkio_subsys_id, 1225 int name = BLKIOFILE_ATTR(cft->private);
747 .base_cftypes = blkcg_files, 1226
748 .module = THIS_MODULE, 1227 blkcg = cgroup_to_blkio_cgroup(cgrp);
1228
1229 switch(plid) {
1230 case BLKIO_POLICY_PROP:
1231 switch(name) {
1232 case BLKIO_PROP_time:
1233 return blkio_read_blkg_stats(blkcg, cft, cb,
1234 BLKIO_STAT_TIME, 0, 0);
1235 case BLKIO_PROP_sectors:
1236 return blkio_read_blkg_stats(blkcg, cft, cb,
1237 BLKIO_STAT_CPU_SECTORS, 0, 1);
1238 case BLKIO_PROP_io_service_bytes:
1239 return blkio_read_blkg_stats(blkcg, cft, cb,
1240 BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1241 case BLKIO_PROP_io_serviced:
1242 return blkio_read_blkg_stats(blkcg, cft, cb,
1243 BLKIO_STAT_CPU_SERVICED, 1, 1);
1244 case BLKIO_PROP_io_service_time:
1245 return blkio_read_blkg_stats(blkcg, cft, cb,
1246 BLKIO_STAT_SERVICE_TIME, 1, 0);
1247 case BLKIO_PROP_io_wait_time:
1248 return blkio_read_blkg_stats(blkcg, cft, cb,
1249 BLKIO_STAT_WAIT_TIME, 1, 0);
1250 case BLKIO_PROP_io_merged:
1251 return blkio_read_blkg_stats(blkcg, cft, cb,
1252 BLKIO_STAT_CPU_MERGED, 1, 1);
1253 case BLKIO_PROP_io_queued:
1254 return blkio_read_blkg_stats(blkcg, cft, cb,
1255 BLKIO_STAT_QUEUED, 1, 0);
1256#ifdef CONFIG_DEBUG_BLK_CGROUP
1257 case BLKIO_PROP_unaccounted_time:
1258 return blkio_read_blkg_stats(blkcg, cft, cb,
1259 BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
1260 case BLKIO_PROP_dequeue:
1261 return blkio_read_blkg_stats(blkcg, cft, cb,
1262 BLKIO_STAT_DEQUEUE, 0, 0);
1263 case BLKIO_PROP_avg_queue_size:
1264 return blkio_read_blkg_stats(blkcg, cft, cb,
1265 BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
1266 case BLKIO_PROP_group_wait_time:
1267 return blkio_read_blkg_stats(blkcg, cft, cb,
1268 BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
1269 case BLKIO_PROP_idle_time:
1270 return blkio_read_blkg_stats(blkcg, cft, cb,
1271 BLKIO_STAT_IDLE_TIME, 0, 0);
1272 case BLKIO_PROP_empty_time:
1273 return blkio_read_blkg_stats(blkcg, cft, cb,
1274 BLKIO_STAT_EMPTY_TIME, 0, 0);
1275#endif
1276 default:
1277 BUG();
1278 }
1279 break;
1280 case BLKIO_POLICY_THROTL:
1281 switch(name){
1282 case BLKIO_THROTL_io_service_bytes:
1283 return blkio_read_blkg_stats(blkcg, cft, cb,
1284 BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1285 case BLKIO_THROTL_io_serviced:
1286 return blkio_read_blkg_stats(blkcg, cft, cb,
1287 BLKIO_STAT_CPU_SERVICED, 1, 1);
1288 default:
1289 BUG();
1290 }
1291 break;
1292 default:
1293 BUG();
1294 }
749 1295
750 /* 1296 return 0;
751 * blkio subsystem is utterly broken in terms of hierarchy support. 1297}
752 * It treats all cgroups equally regardless of where they're
753 * located in the hierarchy - all cgroups are treated as if they're
754 * right below the root. Fix it and remove the following.
755 */
756 .broken_hierarchy = true,
757};
758EXPORT_SYMBOL_GPL(blkio_subsys);
759 1298
760/** 1299static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val)
761 * blkcg_activate_policy - activate a blkcg policy on a request_queue
762 * @q: request_queue of interest
763 * @pol: blkcg policy to activate
764 *
765 * Activate @pol on @q. Requires %GFP_KERNEL context. @q goes through
766 * bypass mode to populate its blkgs with policy_data for @pol.
767 *
768 * Activation happens with @q bypassed, so nobody would be accessing blkgs
769 * from IO path. Update of each blkg is protected by both queue and blkcg
770 * locks so that holding either lock and testing blkcg_policy_enabled() is
771 * always enough for dereferencing policy data.
772 *
773 * The caller is responsible for synchronizing [de]activations and policy
774 * [un]registerations. Returns 0 on success, -errno on failure.
775 */
776int blkcg_activate_policy(struct request_queue *q,
777 const struct blkcg_policy *pol)
778{ 1300{
779 LIST_HEAD(pds); 1301 struct blkio_group *blkg;
780 struct blkcg_gq *blkg; 1302 struct hlist_node *n;
781 struct blkg_policy_data *pd, *n; 1303 struct blkio_policy_node *pn;
782 int cnt = 0, ret;
783 bool preloaded;
784
785 if (blkcg_policy_enabled(q, pol))
786 return 0;
787 1304
788 /* preallocations for root blkg */ 1305 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
789 blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); 1306 return -EINVAL;
790 if (!blkg)
791 return -ENOMEM;
792 1307
793 preloaded = !radix_tree_preload(GFP_KERNEL); 1308 spin_lock(&blkio_list_lock);
1309 spin_lock_irq(&blkcg->lock);
1310 blkcg->weight = (unsigned int)val;
794 1311
795 blk_queue_bypass_start(q); 1312 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1313 pn = blkio_policy_search_node(blkcg, blkg->dev,
1314 BLKIO_POLICY_PROP, BLKIO_PROP_weight_device);
1315 if (pn)
1316 continue;
796 1317
797 /* make sure the root blkg exists and count the existing blkgs */ 1318 blkio_update_group_weight(blkg, blkcg->weight);
798 spin_lock_irq(q->queue_lock); 1319 }
1320 spin_unlock_irq(&blkcg->lock);
1321 spin_unlock(&blkio_list_lock);
1322 return 0;
1323}
799 1324
800 rcu_read_lock(); 1325static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
801 blkg = __blkg_lookup_create(&blkcg_root, q, blkg); 1326 struct blkio_cgroup *blkcg;
802 rcu_read_unlock(); 1327 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1328 int name = BLKIOFILE_ATTR(cft->private);
803 1329
804 if (preloaded) 1330 blkcg = cgroup_to_blkio_cgroup(cgrp);
805 radix_tree_preload_end();
806 1331
807 if (IS_ERR(blkg)) { 1332 switch(plid) {
808 ret = PTR_ERR(blkg); 1333 case BLKIO_POLICY_PROP:
809 goto out_unlock; 1334 switch(name) {
1335 case BLKIO_PROP_weight:
1336 return (u64)blkcg->weight;
1337 }
1338 break;
1339 default:
1340 BUG();
810 } 1341 }
811 q->root_blkg = blkg; 1342 return 0;
812 q->root_rl.blkg = blkg; 1343}
813 1344
814 list_for_each_entry(blkg, &q->blkg_list, q_node) 1345static int
815 cnt++; 1346blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1347{
1348 struct blkio_cgroup *blkcg;
1349 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1350 int name = BLKIOFILE_ATTR(cft->private);
816 1351
817 spin_unlock_irq(q->queue_lock); 1352 blkcg = cgroup_to_blkio_cgroup(cgrp);
818 1353
819 /* allocate policy_data for all existing blkgs */ 1354 switch(plid) {
820 while (cnt--) { 1355 case BLKIO_POLICY_PROP:
821 pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); 1356 switch(name) {
822 if (!pd) { 1357 case BLKIO_PROP_weight:
823 ret = -ENOMEM; 1358 return blkio_weight_write(blkcg, val);
824 goto out_free;
825 } 1359 }
826 list_add_tail(&pd->alloc_node, &pds); 1360 break;
1361 default:
1362 BUG();
827 } 1363 }
828 1364
829 /* 1365 return 0;
830 * Install the allocated pds. With @q bypassing, no new blkg 1366}
831 * should have been created while the queue lock was dropped.
832 */
833 spin_lock_irq(q->queue_lock);
834 1367
835 list_for_each_entry(blkg, &q->blkg_list, q_node) { 1368struct cftype blkio_files[] = {
836 if (WARN_ON(list_empty(&pds))) { 1369 {
837 /* umm... this shouldn't happen, just abort */ 1370 .name = "weight_device",
838 ret = -ENOMEM; 1371 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
839 goto out_unlock; 1372 BLKIO_PROP_weight_device),
840 } 1373 .read_seq_string = blkiocg_file_read,
841 pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node); 1374 .write_string = blkiocg_file_write,
842 list_del_init(&pd->alloc_node); 1375 .max_write_len = 256,
1376 },
1377 {
1378 .name = "weight",
1379 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1380 BLKIO_PROP_weight),
1381 .read_u64 = blkiocg_file_read_u64,
1382 .write_u64 = blkiocg_file_write_u64,
1383 },
1384 {
1385 .name = "time",
1386 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1387 BLKIO_PROP_time),
1388 .read_map = blkiocg_file_read_map,
1389 },
1390 {
1391 .name = "sectors",
1392 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1393 BLKIO_PROP_sectors),
1394 .read_map = blkiocg_file_read_map,
1395 },
1396 {
1397 .name = "io_service_bytes",
1398 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1399 BLKIO_PROP_io_service_bytes),
1400 .read_map = blkiocg_file_read_map,
1401 },
1402 {
1403 .name = "io_serviced",
1404 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1405 BLKIO_PROP_io_serviced),
1406 .read_map = blkiocg_file_read_map,
1407 },
1408 {
1409 .name = "io_service_time",
1410 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1411 BLKIO_PROP_io_service_time),
1412 .read_map = blkiocg_file_read_map,
1413 },
1414 {
1415 .name = "io_wait_time",
1416 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1417 BLKIO_PROP_io_wait_time),
1418 .read_map = blkiocg_file_read_map,
1419 },
1420 {
1421 .name = "io_merged",
1422 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1423 BLKIO_PROP_io_merged),
1424 .read_map = blkiocg_file_read_map,
1425 },
1426 {
1427 .name = "io_queued",
1428 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1429 BLKIO_PROP_io_queued),
1430 .read_map = blkiocg_file_read_map,
1431 },
1432 {
1433 .name = "reset_stats",
1434 .write_u64 = blkiocg_reset_stats,
1435 },
1436#ifdef CONFIG_BLK_DEV_THROTTLING
1437 {
1438 .name = "throttle.read_bps_device",
1439 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1440 BLKIO_THROTL_read_bps_device),
1441 .read_seq_string = blkiocg_file_read,
1442 .write_string = blkiocg_file_write,
1443 .max_write_len = 256,
1444 },
843 1445
844 /* grab blkcg lock too while installing @pd on @blkg */ 1446 {
845 spin_lock(&blkg->blkcg->lock); 1447 .name = "throttle.write_bps_device",
1448 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1449 BLKIO_THROTL_write_bps_device),
1450 .read_seq_string = blkiocg_file_read,
1451 .write_string = blkiocg_file_write,
1452 .max_write_len = 256,
1453 },
846 1454
847 blkg->pd[pol->plid] = pd; 1455 {
848 pd->blkg = blkg; 1456 .name = "throttle.read_iops_device",
849 pol->pd_init_fn(blkg); 1457 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1458 BLKIO_THROTL_read_iops_device),
1459 .read_seq_string = blkiocg_file_read,
1460 .write_string = blkiocg_file_write,
1461 .max_write_len = 256,
1462 },
850 1463
851 spin_unlock(&blkg->blkcg->lock); 1464 {
852 } 1465 .name = "throttle.write_iops_device",
1466 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1467 BLKIO_THROTL_write_iops_device),
1468 .read_seq_string = blkiocg_file_read,
1469 .write_string = blkiocg_file_write,
1470 .max_write_len = 256,
1471 },
1472 {
1473 .name = "throttle.io_service_bytes",
1474 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1475 BLKIO_THROTL_io_service_bytes),
1476 .read_map = blkiocg_file_read_map,
1477 },
1478 {
1479 .name = "throttle.io_serviced",
1480 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1481 BLKIO_THROTL_io_serviced),
1482 .read_map = blkiocg_file_read_map,
1483 },
1484#endif /* CONFIG_BLK_DEV_THROTTLING */
853 1485
854 __set_bit(pol->plid, q->blkcg_pols); 1486#ifdef CONFIG_DEBUG_BLK_CGROUP
855 ret = 0; 1487 {
856out_unlock: 1488 .name = "avg_queue_size",
857 spin_unlock_irq(q->queue_lock); 1489 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
858out_free: 1490 BLKIO_PROP_avg_queue_size),
859 blk_queue_bypass_end(q); 1491 .read_map = blkiocg_file_read_map,
860 list_for_each_entry_safe(pd, n, &pds, alloc_node) 1492 },
861 kfree(pd); 1493 {
862 return ret; 1494 .name = "group_wait_time",
1495 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1496 BLKIO_PROP_group_wait_time),
1497 .read_map = blkiocg_file_read_map,
1498 },
1499 {
1500 .name = "idle_time",
1501 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1502 BLKIO_PROP_idle_time),
1503 .read_map = blkiocg_file_read_map,
1504 },
1505 {
1506 .name = "empty_time",
1507 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1508 BLKIO_PROP_empty_time),
1509 .read_map = blkiocg_file_read_map,
1510 },
1511 {
1512 .name = "dequeue",
1513 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1514 BLKIO_PROP_dequeue),
1515 .read_map = blkiocg_file_read_map,
1516 },
1517 {
1518 .name = "unaccounted_time",
1519 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1520 BLKIO_PROP_unaccounted_time),
1521 .read_map = blkiocg_file_read_map,
1522 },
1523#endif
1524};
1525
1526static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1527{
1528 return cgroup_add_files(cgroup, subsys, blkio_files,
1529 ARRAY_SIZE(blkio_files));
863} 1530}
864EXPORT_SYMBOL_GPL(blkcg_activate_policy);
865 1531
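For comparison, the interface being removed in the left-hand column expects a policy to describe itself with a struct blkcg_policy and to enable itself per-queue via blkcg_activate_policy(); in mainline, cfq-iosched and blk-throttle fill these hooks the same way. The fragment below is only a hedged sketch of that calling convention: the "example_" names are hypothetical and the snippet assumes the surrounding kernel and blk-cgroup.h context rather than being a standalone module.

/* Hedged sketch of the removed API's calling convention; all "example_"
 * identifiers are hypothetical. */
static struct blkcg_policy example_policy;

struct example_pd {
	struct blkg_policy_data pd;	/* must be first; pd_size >= sizeof(pd) */
	u64 nr_dispatched;
};

static void example_pd_init(struct blkcg_gq *blkg)
{
	struct example_pd *epd = container_of(blkg_to_pd(blkg, &example_policy),
					      struct example_pd, pd);

	epd->nr_dispatched = 0;
}

static struct blkcg_policy example_policy = {
	.pd_size	= sizeof(struct example_pd),
	.pd_init_fn	= example_pd_init,
};

/* e.g. called from the policy's module init with a queue from the driver */
static int example_enable(struct request_queue *q)
{
	int ret = blkcg_policy_register(&example_policy);

	if (ret)
		return ret;
	return blkcg_activate_policy(q, &example_policy);
}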
866/** 1532static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
867 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
868 * @q: request_queue of interest
869 * @pol: blkcg policy to deactivate
870 *
871 * Deactivate @pol on @q. Follows the same synchronization rules as
872 * blkcg_activate_policy().
873 */
874void blkcg_deactivate_policy(struct request_queue *q,
875 const struct blkcg_policy *pol)
876{ 1533{
877 struct blkcg_gq *blkg; 1534 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
1535 unsigned long flags;
1536 struct blkio_group *blkg;
1537 void *key;
1538 struct blkio_policy_type *blkiop;
1539 struct blkio_policy_node *pn, *pntmp;
878 1540
879 if (!blkcg_policy_enabled(q, pol)) 1541 rcu_read_lock();
880 return; 1542 do {
1543 spin_lock_irqsave(&blkcg->lock, flags);
881 1544
882 blk_queue_bypass_start(q); 1545 if (hlist_empty(&blkcg->blkg_list)) {
883 spin_lock_irq(q->queue_lock); 1546 spin_unlock_irqrestore(&blkcg->lock, flags);
1547 break;
1548 }
884 1549
885 __clear_bit(pol->plid, q->blkcg_pols); 1550 blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
1551 blkcg_node);
1552 key = rcu_dereference(blkg->key);
1553 __blkiocg_del_blkio_group(blkg);
886 1554
887 /* if no policy is left, no need for blkgs - shoot them down */ 1555 spin_unlock_irqrestore(&blkcg->lock, flags);
888 if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS))
889 blkg_destroy_all(q);
890 1556
891 list_for_each_entry(blkg, &q->blkg_list, q_node) { 1557 /*
892 /* grab blkcg lock too while removing @pd from @blkg */ 1558 * This blkio_group is being unlinked as associated cgroup is
893 spin_lock(&blkg->blkcg->lock); 1559 * going away. Let all the IO controlling policies know about
1560 * this event.
1561 */
1562 spin_lock(&blkio_list_lock);
1563 list_for_each_entry(blkiop, &blkio_list, list) {
1564 if (blkiop->plid != blkg->plid)
1565 continue;
1566 blkiop->ops.blkio_unlink_group_fn(key, blkg);
1567 }
1568 spin_unlock(&blkio_list_lock);
1569 } while (1);
894 1570
895 if (pol->pd_exit_fn) 1571 list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
896 pol->pd_exit_fn(blkg); 1572 blkio_policy_delete_node(pn);
1573 kfree(pn);
1574 }
897 1575
898 kfree(blkg->pd[pol->plid]); 1576 free_css_id(&blkio_subsys, &blkcg->css);
899 blkg->pd[pol->plid] = NULL; 1577 rcu_read_unlock();
1578 if (blkcg != &blkio_root_cgroup)
1579 kfree(blkcg);
1580}
900 1581
901 spin_unlock(&blkg->blkcg->lock); 1582static struct cgroup_subsys_state *
1583blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1584{
1585 struct blkio_cgroup *blkcg;
1586 struct cgroup *parent = cgroup->parent;
1587
1588 if (!parent) {
1589 blkcg = &blkio_root_cgroup;
1590 goto done;
902 } 1591 }
903 1592
904 spin_unlock_irq(q->queue_lock); 1593 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
905 blk_queue_bypass_end(q); 1594 if (!blkcg)
1595 return ERR_PTR(-ENOMEM);
1596
1597 blkcg->weight = BLKIO_WEIGHT_DEFAULT;
1598done:
1599 spin_lock_init(&blkcg->lock);
1600 INIT_HLIST_HEAD(&blkcg->blkg_list);
1601
1602 INIT_LIST_HEAD(&blkcg->policy_list);
1603 return &blkcg->css;
906} 1604}
907EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
908 1605
909/** 1606/*
910 * blkcg_policy_register - register a blkcg policy 1607 * We cannot support shared io contexts, as we have no means to support
911 * @pol: blkcg policy to register 1608 * two tasks with the same ioc in two different groups without major rework
912 * 1609 * of the main cic data structures. For now we allow a task to change
913 * Register @pol with blkcg core. Might sleep and @pol may be modified on 1610 * its cgroup only if it's the only owner of its ioc.
914 * successful registration. Returns 0 on success and -errno on failure.
915 */ 1611 */
916int blkcg_policy_register(struct blkcg_policy *pol) 1612static int blkiocg_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
917{ 1613{
918 int i, ret; 1614 struct io_context *ioc;
919 1615 int ret = 0;
920 if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
921 return -EINVAL;
922 1616
923 mutex_lock(&blkcg_pol_mutex); 1617 /* task_lock() is needed to avoid races with exit_io_context() */
1618 task_lock(tsk);
1619 ioc = tsk->io_context;
1620 if (ioc && atomic_read(&ioc->nr_tasks) > 1)
1621 ret = -EINVAL;
1622 task_unlock(tsk);
924 1623
925 /* find an empty slot */
926 ret = -ENOSPC;
927 for (i = 0; i < BLKCG_MAX_POLS; i++)
928 if (!blkcg_policy[i])
929 break;
930 if (i >= BLKCG_MAX_POLS)
931 goto out_unlock;
932
933 /* register and update blkgs */
934 pol->plid = i;
935 blkcg_policy[i] = pol;
936
937 /* everything is in place, add intf files for the new policy */
938 if (pol->cftypes)
939 WARN_ON(cgroup_add_cftypes(&blkio_subsys, pol->cftypes));
940 ret = 0;
941out_unlock:
942 mutex_unlock(&blkcg_pol_mutex);
943 return ret; 1624 return ret;
944} 1625}
945EXPORT_SYMBOL_GPL(blkcg_policy_register);
946 1626
947/** 1627static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
948 * blkcg_policy_unregister - unregister a blkcg policy 1628{
949 * @pol: blkcg policy to unregister 1629 struct io_context *ioc;
950 * 1630
951 * Undo blkcg_policy_register(@pol). Might sleep. 1631 task_lock(tsk);
952 */ 1632 ioc = tsk->io_context;
953void blkcg_policy_unregister(struct blkcg_policy *pol) 1633 if (ioc)
1634 ioc->cgroup_changed = 1;
1635 task_unlock(tsk);
1636}
1637
1638void blkio_policy_register(struct blkio_policy_type *blkiop)
954{ 1639{
955 mutex_lock(&blkcg_pol_mutex); 1640 spin_lock(&blkio_list_lock);
1641 list_add_tail(&blkiop->list, &blkio_list);
1642 spin_unlock(&blkio_list_lock);
1643}
1644EXPORT_SYMBOL_GPL(blkio_policy_register);
956 1645
957 if (WARN_ON(blkcg_policy[pol->plid] != pol)) 1646void blkio_policy_unregister(struct blkio_policy_type *blkiop)
958 goto out_unlock; 1647{
1648 spin_lock(&blkio_list_lock);
1649 list_del_init(&blkiop->list);
1650 spin_unlock(&blkio_list_lock);
1651}
1652EXPORT_SYMBOL_GPL(blkio_policy_unregister);
959 1653
960 /* kill the intf files first */ 1654static int __init init_cgroup_blkio(void)
961 if (pol->cftypes) 1655{
962 cgroup_rm_cftypes(&blkio_subsys, pol->cftypes); 1656 return cgroup_load_subsys(&blkio_subsys);
1657}
963 1658
964 /* unregister and update blkgs */ 1659static void __exit exit_cgroup_blkio(void)
965 blkcg_policy[pol->plid] = NULL; 1660{
966out_unlock: 1661 cgroup_unload_subsys(&blkio_subsys);
967 mutex_unlock(&blkcg_pol_mutex);
968} 1662}
969EXPORT_SYMBOL_GPL(blkcg_policy_unregister); 1663
1664module_init(init_cgroup_blkio);
1665module_exit(exit_cgroup_blkio);
1666MODULE_LICENSE("GPL");
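The interface restored by this patch instead has each policy hand a struct blkio_policy_type with a blkio_policy_ops vtable to blkio_policy_register(). A hedged sketch of that registration shape follows; the callback bodies and the "example_" names are placeholders, not code from this patch, and the module_init()/module_exit() wiring would mirror init_cgroup_blkio()/exit_cgroup_blkio() above.

/* Hedged sketch of blkio_policy_register() usage (see blk-cgroup.h). */
static void example_unlink_group(void *key, struct blkio_group *blkg)
{
	/* the cgroup is going away: drop this policy's use of @blkg;
	 * @key is the opaque queue identifier the policy registered with */
}

static void example_update_weight(void *key, struct blkio_group *blkg,
				  unsigned int weight)
{
	/* react to a new proportional weight for @blkg */
}

static struct blkio_policy_type blkio_policy_example = {
	.ops = {
		.blkio_unlink_group_fn		= example_unlink_group,
		.blkio_update_group_weight_fn	= example_update_weight,
	},
	.plid = BLKIO_POLICY_PROP,
};

static int __init example_init(void)
{
	blkio_policy_register(&blkio_policy_example);
	return 0;
}

static void __exit example_exit(void)
{
	blkio_policy_unregister(&blkio_policy_example);
}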
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 24597309e23..a71d2904ffb 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -15,491 +15,350 @@
15 15
16#include <linux/cgroup.h> 16#include <linux/cgroup.h>
17#include <linux/u64_stats_sync.h> 17#include <linux/u64_stats_sync.h>
18#include <linux/seq_file.h> 18
19#include <linux/radix-tree.h> 19enum blkio_policy_id {
20#include <linux/blkdev.h> 20 BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */
21 BLKIO_POLICY_THROTL, /* Throttling */
22};
21 23
22/* Max limits for throttle policy */ 24/* Max limits for throttle policy */
23#define THROTL_IOPS_MAX UINT_MAX 25#define THROTL_IOPS_MAX UINT_MAX
24 26
25/* CFQ specific, out here for blkcg->cfq_weight */ 27#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
26#define CFQ_WEIGHT_MIN 10 28
27#define CFQ_WEIGHT_MAX 1000 29#ifndef CONFIG_BLK_CGROUP
28#define CFQ_WEIGHT_DEFAULT 500 30/* When blk-cgroup is a module, its subsys_id isn't a compile-time constant */
29 31extern struct cgroup_subsys blkio_subsys;
30#ifdef CONFIG_BLK_CGROUP 32#define blkio_subsys_id blkio_subsys.subsys_id
31 33#endif
32enum blkg_rwstat_type { 34
33 BLKG_RWSTAT_READ, 35enum stat_type {
34 BLKG_RWSTAT_WRITE, 36 /* Total time spent (in ns) between request dispatch to the driver and
35 BLKG_RWSTAT_SYNC, 37 * request completion for IOs doen by this cgroup. This may not be
36 BLKG_RWSTAT_ASYNC, 38 * accurate when NCQ is turned on. */
37 39 BLKIO_STAT_SERVICE_TIME = 0,
38 BLKG_RWSTAT_NR, 40 /* Total time spent waiting in scheduler queue in ns */
39 BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, 41 BLKIO_STAT_WAIT_TIME,
42 /* Number of IOs queued up */
43 BLKIO_STAT_QUEUED,
44 /* All the single valued stats go below this */
45 BLKIO_STAT_TIME,
46#ifdef CONFIG_DEBUG_BLK_CGROUP
47 /* Time not charged to this cgroup */
48 BLKIO_STAT_UNACCOUNTED_TIME,
49 BLKIO_STAT_AVG_QUEUE_SIZE,
50 BLKIO_STAT_IDLE_TIME,
51 BLKIO_STAT_EMPTY_TIME,
52 BLKIO_STAT_GROUP_WAIT_TIME,
53 BLKIO_STAT_DEQUEUE
54#endif
40}; 55};
41 56
42struct blkcg_gq; 57/* Per cpu stats */
43 58enum stat_type_cpu {
44struct blkcg { 59 BLKIO_STAT_CPU_SECTORS,
45 struct cgroup_subsys_state css; 60 /* Total bytes transferred */
46 spinlock_t lock; 61 BLKIO_STAT_CPU_SERVICE_BYTES,
47 62 /* Total IOs serviced, post merge */
48 struct radix_tree_root blkg_tree; 63 BLKIO_STAT_CPU_SERVICED,
49 struct blkcg_gq *blkg_hint; 64 /* Number of IOs merged */
50 struct hlist_head blkg_list; 65 BLKIO_STAT_CPU_MERGED,
51 66 BLKIO_STAT_CPU_NR
52 /* for policies to test whether associated blkcg has changed */
53 uint64_t id;
54
55 /* TODO: per-policy storage in blkcg */
56 unsigned int cfq_weight; /* belongs to cfq */
57}; 67};
58 68
59struct blkg_stat { 69enum stat_sub_type {
60 struct u64_stats_sync syncp; 70 BLKIO_STAT_READ = 0,
61 uint64_t cnt; 71 BLKIO_STAT_WRITE,
72 BLKIO_STAT_SYNC,
73 BLKIO_STAT_ASYNC,
74 BLKIO_STAT_TOTAL
62}; 75};
63 76
64struct blkg_rwstat { 77/* blkg state flags */
65 struct u64_stats_sync syncp; 78enum blkg_state_flags {
66 uint64_t cnt[BLKG_RWSTAT_NR]; 79 BLKG_waiting = 0,
80 BLKG_idling,
81 BLKG_empty,
67}; 82};
68 83
69/* 84/* cgroup files owned by proportional weight policy */
70 * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a 85enum blkcg_file_name_prop {
71 * request_queue (q). This is used by blkcg policies which need to track 86 BLKIO_PROP_weight = 1,
72 * information per blkcg - q pair. 87 BLKIO_PROP_weight_device,
73 * 88 BLKIO_PROP_io_service_bytes,
74 * There can be multiple active blkcg policies and each has its private 89 BLKIO_PROP_io_serviced,
75 * data on each blkg, the size of which is determined by 90 BLKIO_PROP_time,
76 * blkcg_policy->pd_size. blkcg core allocates and frees such areas 91 BLKIO_PROP_sectors,
77 * together with blkg and invokes pd_init/exit_fn() methods. 92 BLKIO_PROP_unaccounted_time,
78 * 93 BLKIO_PROP_io_service_time,
79 * Such private data must embed struct blkg_policy_data (pd) at the 94 BLKIO_PROP_io_wait_time,
80 * beginning and pd_size can't be smaller than pd. 95 BLKIO_PROP_io_merged,
81 */ 96 BLKIO_PROP_io_queued,
82struct blkg_policy_data { 97 BLKIO_PROP_avg_queue_size,
83 /* the blkg this per-policy data belongs to */ 98 BLKIO_PROP_group_wait_time,
84 struct blkcg_gq *blkg; 99 BLKIO_PROP_idle_time,
85 100 BLKIO_PROP_empty_time,
86 /* used during policy activation */ 101 BLKIO_PROP_dequeue,
87 struct list_head alloc_node;
88}; 102};
89 103
90/* association between a blk cgroup and a request queue */ 104/* cgroup files owned by throttle policy */
91struct blkcg_gq { 105enum blkcg_file_name_throtl {
92 /* Pointer to the associated request_queue */ 106 BLKIO_THROTL_read_bps_device,
93 struct request_queue *q; 107 BLKIO_THROTL_write_bps_device,
94 struct list_head q_node; 108 BLKIO_THROTL_read_iops_device,
95 struct hlist_node blkcg_node; 109 BLKIO_THROTL_write_iops_device,
96 struct blkcg *blkcg; 110 BLKIO_THROTL_io_service_bytes,
97 /* request allocation list for this blkcg-q pair */ 111 BLKIO_THROTL_io_serviced,
98 struct request_list rl;
99 /* reference count */
100 int refcnt;
101
102 struct blkg_policy_data *pd[BLKCG_MAX_POLS];
103
104 struct rcu_head rcu_head;
105}; 112};
106 113
107typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); 114struct blkio_cgroup {
108typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); 115 struct cgroup_subsys_state css;
109typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); 116 unsigned int weight;
110 117 spinlock_t lock;
111struct blkcg_policy { 118 struct hlist_head blkg_list;
112 int plid; 119 struct list_head policy_list; /* list of blkio_policy_node */
113 /* policy specific private data size */
114 size_t pd_size;
115 /* cgroup files for the policy */
116 struct cftype *cftypes;
117
118 /* operations */
119 blkcg_pol_init_pd_fn *pd_init_fn;
120 blkcg_pol_exit_pd_fn *pd_exit_fn;
121 blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn;
122}; 120};
123 121
124extern struct blkcg blkcg_root; 122struct blkio_group_stats {
125 123 /* total disk time and nr sectors dispatched by this group */
126struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); 124 uint64_t time;
127struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, 125 uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL];
128 struct request_queue *q); 126#ifdef CONFIG_DEBUG_BLK_CGROUP
129int blkcg_init_queue(struct request_queue *q); 127 /* Time not charged to this cgroup */
130void blkcg_drain_queue(struct request_queue *q); 128 uint64_t unaccounted_time;
131void blkcg_exit_queue(struct request_queue *q); 129
132 130 /* Sum of number of IOs queued across all samples */
133/* Blkio controller policy registration */ 131 uint64_t avg_queue_size_sum;
134int blkcg_policy_register(struct blkcg_policy *pol); 132 /* Count of samples taken for average */
135void blkcg_policy_unregister(struct blkcg_policy *pol); 133 uint64_t avg_queue_size_samples;
136int blkcg_activate_policy(struct request_queue *q, 134 /* How many times this group has been removed from service tree */
137 const struct blkcg_policy *pol); 135 unsigned long dequeue;
138void blkcg_deactivate_policy(struct request_queue *q, 136
139 const struct blkcg_policy *pol); 137 /* Total time spent waiting for it to be assigned a timeslice. */
140 138 uint64_t group_wait_time;
141void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, 139 uint64_t start_group_wait_time;
142 u64 (*prfill)(struct seq_file *, 140
143 struct blkg_policy_data *, int), 141 /* Time spent idling for this blkio_group */
144 const struct blkcg_policy *pol, int data, 142 uint64_t idle_time;
145 bool show_total); 143 uint64_t start_idle_time;
146u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
147u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
148 const struct blkg_rwstat *rwstat);
149u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
150u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
151 int off);
152
153struct blkg_conf_ctx {
154 struct gendisk *disk;
155 struct blkcg_gq *blkg;
156 u64 v;
157};
158
159int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
160 const char *input, struct blkg_conf_ctx *ctx);
161void blkg_conf_finish(struct blkg_conf_ctx *ctx);
162
163
164static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup)
165{
166 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
167 struct blkcg, css);
168}
169
170static inline struct blkcg *task_blkcg(struct task_struct *tsk)
171{
172 return container_of(task_subsys_state(tsk, blkio_subsys_id),
173 struct blkcg, css);
174}
175
176static inline struct blkcg *bio_blkcg(struct bio *bio)
177{
178 if (bio && bio->bi_css)
179 return container_of(bio->bi_css, struct blkcg, css);
180 return task_blkcg(current);
181}
182
183/**
184 * blkg_to_pdata - get policy private data
185 * @blkg: blkg of interest
186 * @pol: policy of interest
187 *
188 * Return pointer to private data associated with the @blkg-@pol pair.
189 */
190static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
191 struct blkcg_policy *pol)
192{
193 return blkg ? blkg->pd[pol->plid] : NULL;
194}
195
196/**
197 * pdata_to_blkg - get blkg associated with policy private data
198 * @pd: policy private data of interest
199 *
200 * @pd is policy private data. Determine the blkg it's associated with.
201 */
202static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
203{
204 return pd ? pd->blkg : NULL;
205}
206
207/**
208 * blkg_path - format cgroup path of blkg
209 * @blkg: blkg of interest
210 * @buf: target buffer
211 * @buflen: target buffer length
212 *
213 * Format the path of the cgroup of @blkg into @buf.
214 */
215static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
216{
217 int ret;
218
219 rcu_read_lock();
220 ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
221 rcu_read_unlock();
222 if (ret)
223 strncpy(buf, "<unavailable>", buflen);
224 return ret;
225}
226
227/**
228 * blkg_get - get a blkg reference
229 * @blkg: blkg to get
230 *
231 * The caller should be holding queue_lock and an existing reference.
232 */
233static inline void blkg_get(struct blkcg_gq *blkg)
234{
235 lockdep_assert_held(blkg->q->queue_lock);
236 WARN_ON_ONCE(!blkg->refcnt);
237 blkg->refcnt++;
238}
239
240void __blkg_release(struct blkcg_gq *blkg);
241
242/**
243 * blkg_put - put a blkg reference
244 * @blkg: blkg to put
245 *
246 * The caller should be holding queue_lock.
247 */
248static inline void blkg_put(struct blkcg_gq *blkg)
249{
250 lockdep_assert_held(blkg->q->queue_lock);
251 WARN_ON_ONCE(blkg->refcnt <= 0);
252 if (!--blkg->refcnt)
253 __blkg_release(blkg);
254}
255
256/**
257 * blk_get_rl - get request_list to use
258 * @q: request_queue of interest
259 * @bio: bio which will be attached to the allocated request (may be %NULL)
260 *
261 * The caller wants to allocate a request from @q to use for @bio. Find
262 * the request_list to use and obtain a reference on it. Should be called
263 * under queue_lock. This function is guaranteed to return non-%NULL
264 * request_list.
265 */
266static inline struct request_list *blk_get_rl(struct request_queue *q,
267 struct bio *bio)
268{
269 struct blkcg *blkcg;
270 struct blkcg_gq *blkg;
271
272 rcu_read_lock();
273
274 blkcg = bio_blkcg(bio);
275
276 /* bypass blkg lookup and use @q->root_rl directly for root */
277 if (blkcg == &blkcg_root)
278 goto root_rl;
279
280 /* 144 /*
281 * Try to use blkg->rl. blkg lookup may fail under memory pressure 145 * Total time when we have requests queued and do not contain the
282 * or if either the blkcg or queue is going away. Fall back to 146 * current active queue.
283 * root_rl in such cases.
284 */ 147 */
285 blkg = blkg_lookup_create(blkcg, q); 148 uint64_t empty_time;
286 if (unlikely(IS_ERR(blkg))) 149 uint64_t start_empty_time;
287 goto root_rl; 150 uint16_t flags;
288 151#endif
289 blkg_get(blkg); 152};
290 rcu_read_unlock();
291 return &blkg->rl;
292root_rl:
293 rcu_read_unlock();
294 return &q->root_rl;
295}
296
297/**
298 * blk_put_rl - put request_list
299 * @rl: request_list to put
300 *
301 * Put the reference acquired by blk_get_rl(). Should be called under
302 * queue_lock.
303 */
304static inline void blk_put_rl(struct request_list *rl)
305{
306 /* root_rl may not have blkg set */
307 if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
308 blkg_put(rl->blkg);
309}
310
311/**
312 * blk_rq_set_rl - associate a request with a request_list
313 * @rq: request of interest
314 * @rl: target request_list
315 *
316 * Associate @rq with @rl so that accounting and freeing can know the
317 * request_list @rq came from.
318 */
319static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl)
320{
321 rq->rl = rl;
322}
323
324/**
325 * blk_rq_rl - return the request_list a request came from
326 * @rq: request of interest
327 *
328 * Return the request_list @rq is allocated from.
329 */
330static inline struct request_list *blk_rq_rl(struct request *rq)
331{
332 return rq->rl;
333}
334
335struct request_list *__blk_queue_next_rl(struct request_list *rl,
336 struct request_queue *q);
337/**
338 * blk_queue_for_each_rl - iterate through all request_lists of a request_queue
339 *
340 * Should be used under queue_lock.
341 */
342#define blk_queue_for_each_rl(rl, q) \
343 for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
344
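blk_queue_for_each_rl() visits q->root_rl and then every per-blkg request_list via __blk_queue_next_rl(). As a hedged sketch of the pattern (the same loop appears verbatim in __blk_drain_queue() further down in this patch), a caller holding q->queue_lock can wake every sleeper like this; the function name is hypothetical.

/* Hedged sketch: wake all tasks waiting for requests on any request_list
 * of @q. Caller holds q->queue_lock, as the comment above requires. */
static void example_wake_all_rl_waiters(struct request_queue *q)
{
	struct request_list *rl;
	int i;

	blk_queue_for_each_rl(rl, q)
		for (i = 0; i < ARRAY_SIZE(rl->wait); i++)
			wake_up_all(&rl->wait[i]);
}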
345/**
346 * blkg_stat_add - add a value to a blkg_stat
347 * @stat: target blkg_stat
348 * @val: value to add
349 *
350 * Add @val to @stat. The caller is responsible for synchronizing calls to
351 * this function.
352 */
353static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
354{
355 u64_stats_update_begin(&stat->syncp);
356 stat->cnt += val;
357 u64_stats_update_end(&stat->syncp);
358}
359
360/**
361 * blkg_stat_read - read the current value of a blkg_stat
362 * @stat: blkg_stat to read
363 *
364 * Read the current value of @stat. This function can be called without
365 * synchroniztion and takes care of u64 atomicity.
366 */
367static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
368{
369 unsigned int start;
370 uint64_t v;
371
372 do {
373 start = u64_stats_fetch_begin(&stat->syncp);
374 v = stat->cnt;
375 } while (u64_stats_fetch_retry(&stat->syncp, start));
376
377 return v;
378}
379
380/**
381 * blkg_stat_reset - reset a blkg_stat
382 * @stat: blkg_stat to reset
383 */
384static inline void blkg_stat_reset(struct blkg_stat *stat)
385{
386 stat->cnt = 0;
387}
388 153
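blkg_stat is a plain 64-bit counter wrapped in u64_stats_sync: writers must be serialized by the caller, readers may run locklessly. A hedged sketch of typical use, with a hypothetical per-policy structure:

/* Hedged sketch; "example_pd" and its members are hypothetical. */
struct example_pd {
	struct blkg_policy_data pd;
	struct blkg_stat	dispatched;	/* requests sent to the driver */
};

/* writer side: the policy supplies its own serialization */
static void example_account_dispatch(struct example_pd *epd)
{
	blkg_stat_add(&epd->dispatched, 1);
}

/* reader side: no lock needed, u64 atomicity is handled internally */
static u64 example_read_dispatched(struct example_pd *epd)
{
	return blkg_stat_read(&epd->dispatched);
}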
389/** 154/* Per cpu blkio group stats */
390 * blkg_rwstat_add - add a value to a blkg_rwstat 155struct blkio_group_stats_cpu {
391 * @rwstat: target blkg_rwstat 156 uint64_t sectors;
392 * @rw: mask of REQ_{WRITE|SYNC} 157 uint64_t stat_arr_cpu[BLKIO_STAT_CPU_NR][BLKIO_STAT_TOTAL];
393 * @val: value to add 158 struct u64_stats_sync syncp;
394 * 159};
395 * Add @val to @rwstat. The counters are chosen according to @rw. The
396 * caller is responsible for synchronizing calls to this function.
397 */
398static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
399 int rw, uint64_t val)
400{
401 u64_stats_update_begin(&rwstat->syncp);
402
403 if (rw & REQ_WRITE)
404 rwstat->cnt[BLKG_RWSTAT_WRITE] += val;
405 else
406 rwstat->cnt[BLKG_RWSTAT_READ] += val;
407 if (rw & REQ_SYNC)
408 rwstat->cnt[BLKG_RWSTAT_SYNC] += val;
409 else
410 rwstat->cnt[BLKG_RWSTAT_ASYNC] += val;
411
412 u64_stats_update_end(&rwstat->syncp);
413}
414 160
415/** 161struct blkio_group {
416 * blkg_rwstat_read - read the current values of a blkg_rwstat 162 /* An rcu protected unique identifier for the group */
417 * @rwstat: blkg_rwstat to read 163 void *key;
418 * 164 struct hlist_node blkcg_node;
419 * Read the current snapshot of @rwstat and return it as the return value. 165 unsigned short blkcg_id;
420 * This function can be called without synchronization and takes care of 166 /* Store cgroup path */
421 * u64 atomicity. 167 char path[128];
422 */ 168 /* The device MKDEV(major, minor), this group has been created for */
423static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) 169 dev_t dev;
424{ 170 /* policy which owns this blk group */
425 unsigned int start; 171 enum blkio_policy_id plid;
426 struct blkg_rwstat tmp; 172
173 /* Need to serialize the stats in the case of reset/update */
174 spinlock_t stats_lock;
175 struct blkio_group_stats stats;
176 /* Per cpu stats pointer */
177 struct blkio_group_stats_cpu __percpu *stats_cpu;
178};
427 179
428 do { 180struct blkio_policy_node {
429 start = u64_stats_fetch_begin(&rwstat->syncp); 181 struct list_head node;
430 tmp = *rwstat; 182 dev_t dev;
431 } while (u64_stats_fetch_retry(&rwstat->syncp, start)); 183 /* This node belongs to max bw policy or proportional weight policy */
184 enum blkio_policy_id plid;
185 /* cgroup file to which this rule belongs */
186 int fileid;
187
188 union {
189 unsigned int weight;
190 /*
191 * Rate read/write in terms of bytes per second
192 * Whether this rate represents read or write is determined
193 * by file type "fileid".
194 */
195 u64 bps;
196 unsigned int iops;
197 } val;
198};
432 199
433 return tmp; 200extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
434} 201 dev_t dev);
202extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg,
203 dev_t dev);
204extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg,
205 dev_t dev);
206extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg,
207 dev_t dev);
208extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg,
209 dev_t dev);
210
211typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
212
213typedef void (blkio_update_group_weight_fn) (void *key,
214 struct blkio_group *blkg, unsigned int weight);
215typedef void (blkio_update_group_read_bps_fn) (void * key,
216 struct blkio_group *blkg, u64 read_bps);
217typedef void (blkio_update_group_write_bps_fn) (void *key,
218 struct blkio_group *blkg, u64 write_bps);
219typedef void (blkio_update_group_read_iops_fn) (void *key,
220 struct blkio_group *blkg, unsigned int read_iops);
221typedef void (blkio_update_group_write_iops_fn) (void *key,
222 struct blkio_group *blkg, unsigned int write_iops);
223
224struct blkio_policy_ops {
225 blkio_unlink_group_fn *blkio_unlink_group_fn;
226 blkio_update_group_weight_fn *blkio_update_group_weight_fn;
227 blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn;
228 blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn;
229 blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn;
230 blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn;
231};
435 232
436/** 233struct blkio_policy_type {
437 * blkg_rwstat_sum - read the total count of a blkg_rwstat 234 struct list_head list;
438 * @rwstat: blkg_rwstat to read 235 struct blkio_policy_ops ops;
439 * 236 enum blkio_policy_id plid;
440 * Return the total count of @rwstat regardless of the IO direction. This 237};
441 * function can be called without synchronization and takes care of u64
442 * atomicity.
443 */
444static inline uint64_t blkg_rwstat_sum(struct blkg_rwstat *rwstat)
445{
446 struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
447 238
448 return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; 239/* Blkio controller policy registration */
449} 240extern void blkio_policy_register(struct blkio_policy_type *);
241extern void blkio_policy_unregister(struct blkio_policy_type *);
450 242
451/** 243static inline char *blkg_path(struct blkio_group *blkg)
452 * blkg_rwstat_reset - reset a blkg_rwstat
453 * @rwstat: blkg_rwstat to reset
454 */
455static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
456{ 244{
457 memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); 245 return blkg->path;
458} 246}
459 247
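blkg_rwstat keeps four counters and picks the READ/WRITE and SYNC/ASYNC buckets from the REQ_* bits passed in, while blkg_rwstat_sum() folds the read and write buckets back into a single total. A hedged usage sketch with a hypothetical stats container:

/* Hedged sketch; "example_stats" is hypothetical. */
struct example_stats {
	struct blkg_rwstat serviced;	/* completed IOs, split by rw/sync */
};

/* @rw_flags carries REQ_WRITE/REQ_SYNC, e.g. a bio's or request's flags */
static void example_account_completion(struct example_stats *st, int rw_flags)
{
	blkg_rwstat_add(&st->serviced, rw_flags, 1);
}

static u64 example_total_serviced(struct example_stats *st)
{
	/* read + write, ignoring the sync/async split */
	return blkg_rwstat_sum(&st->serviced);
}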
460#else /* CONFIG_BLK_CGROUP */ 248#else
461
462struct cgroup;
463struct blkcg;
464 249
465struct blkg_policy_data { 250struct blkio_group {
466}; 251};
467 252
468struct blkcg_gq { 253struct blkio_policy_type {
469}; 254};
470 255
471struct blkcg_policy { 256static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
472}; 257static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
473 258
474static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } 259static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
475static inline int blkcg_init_queue(struct request_queue *q) { return 0; } 260
476static inline void blkcg_drain_queue(struct request_queue *q) { } 261#endif
477static inline void blkcg_exit_queue(struct request_queue *q) { } 262
478static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } 263#define BLKIO_WEIGHT_MIN 10
479static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } 264#define BLKIO_WEIGHT_MAX 1000
480static inline int blkcg_activate_policy(struct request_queue *q, 265#define BLKIO_WEIGHT_DEFAULT 500
481 const struct blkcg_policy *pol) { return 0; } 266
482static inline void blkcg_deactivate_policy(struct request_queue *q, 267#ifdef CONFIG_DEBUG_BLK_CGROUP
483 const struct blkcg_policy *pol) { } 268void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg);
484 269void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
485static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; } 270 unsigned long dequeue);
486static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } 271void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg);
487 272void blkiocg_update_idle_time_stats(struct blkio_group *blkg);
488static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, 273void blkiocg_set_start_empty_time(struct blkio_group *blkg);
489 struct blkcg_policy *pol) { return NULL; } 274
490static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } 275#define BLKG_FLAG_FNS(name) \
491static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } 276static inline void blkio_mark_blkg_##name( \
492static inline void blkg_get(struct blkcg_gq *blkg) { } 277 struct blkio_group_stats *stats) \
493static inline void blkg_put(struct blkcg_gq *blkg) { } 278{ \
494 279 stats->flags |= (1 << BLKG_##name); \
495static inline struct request_list *blk_get_rl(struct request_queue *q, 280} \
496 struct bio *bio) { return &q->root_rl; } 281static inline void blkio_clear_blkg_##name( \
497static inline void blk_put_rl(struct request_list *rl) { } 282 struct blkio_group_stats *stats) \
498static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } 283{ \
499static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } 284 stats->flags &= ~(1 << BLKG_##name); \
500 285} \
501#define blk_queue_for_each_rl(rl, q) \ 286static inline int blkio_blkg_##name(struct blkio_group_stats *stats) \
502 for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) 287{ \
503 288 return (stats->flags & (1 << BLKG_##name)) != 0; \
504#endif /* CONFIG_BLK_CGROUP */ 289} \
505#endif /* _BLK_CGROUP_H */ 290
291BLKG_FLAG_FNS(waiting)
292BLKG_FLAG_FNS(idling)
293BLKG_FLAG_FNS(empty)
294#undef BLKG_FLAG_FNS
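The BLKG_FLAG_FNS() invocations above expand into blkio_mark_blkg_*(), blkio_clear_blkg_*() and blkio_blkg_*() helpers over the flags word in struct blkio_group_stats (all under CONFIG_DEBUG_BLK_CGROUP). A hedged sketch of the waiting-state helpers in use; the timestamp handling is simplified relative to the real accounting:

/* Hedged sketch using the generated helpers; simplified bookkeeping. */
static void example_mark_group_waiting(struct blkio_group_stats *stats,
				       u64 now)
{
	if (blkio_blkg_waiting(stats))		/* already marked */
		return;
	stats->start_group_wait_time = now;
	blkio_mark_blkg_waiting(stats);
}

static void example_end_group_wait(struct blkio_group_stats *stats, u64 now)
{
	if (!blkio_blkg_waiting(stats))
		return;
	stats->group_wait_time += now - stats->start_group_wait_time;
	blkio_clear_blkg_waiting(stats);
}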
295#else
296static inline void blkiocg_update_avg_queue_size_stats(
297 struct blkio_group *blkg) {}
298static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
299 unsigned long dequeue) {}
300static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
301{}
302static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {}
303static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
304#endif
305
306#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
307extern struct blkio_cgroup blkio_root_cgroup;
308extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
309extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk);
310extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
311 struct blkio_group *blkg, void *key, dev_t dev,
312 enum blkio_policy_id plid);
313extern int blkio_alloc_blkg_stats(struct blkio_group *blkg);
314extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
315extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
316 void *key);
317void blkiocg_update_timeslice_used(struct blkio_group *blkg,
318 unsigned long time,
319 unsigned long unaccounted_time);
320void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes,
321 bool direction, bool sync);
322void blkiocg_update_completion_stats(struct blkio_group *blkg,
323 uint64_t start_time, uint64_t io_start_time, bool direction, bool sync);
324void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
325 bool sync);
326void blkiocg_update_io_add_stats(struct blkio_group *blkg,
327 struct blkio_group *curr_blkg, bool direction, bool sync);
328void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
329 bool direction, bool sync);
330#else
331struct cgroup;
332static inline struct blkio_cgroup *
333cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
334static inline struct blkio_cgroup *
335task_blkio_cgroup(struct task_struct *tsk) { return NULL; }
336
337static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
338 struct blkio_group *blkg, void *key, dev_t dev,
339 enum blkio_policy_id plid) {}
340
341static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) { return 0; }
342
343static inline int
344blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
345
346static inline struct blkio_group *
347blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
348static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
349 unsigned long time,
350 unsigned long unaccounted_time)
351{}
352static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
353 uint64_t bytes, bool direction, bool sync) {}
354static inline void blkiocg_update_completion_stats(struct blkio_group *blkg,
355 uint64_t start_time, uint64_t io_start_time, bool direction,
356 bool sync) {}
357static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
358 bool direction, bool sync) {}
359static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg,
360 struct blkio_group *curr_blkg, bool direction, bool sync) {}
361static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
362 bool direction, bool sync) {}
363#endif
364#endif /* _BLK_CGROUP_H */
diff --git a/block/blk-core.c b/block/blk-core.c
index c973249d68c..8fc4ae28a19 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -28,21 +28,17 @@
28#include <linux/task_io_accounting_ops.h> 28#include <linux/task_io_accounting_ops.h>
29#include <linux/fault-inject.h> 29#include <linux/fault-inject.h>
30#include <linux/list_sort.h> 30#include <linux/list_sort.h>
31#include <linux/delay.h>
32#include <linux/ratelimit.h>
33 31
34#define CREATE_TRACE_POINTS 32#define CREATE_TRACE_POINTS
35#include <trace/events/block.h> 33#include <trace/events/block.h>
36 34
37#include "blk.h" 35#include "blk.h"
38#include "blk-cgroup.h"
39 36
40EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); 37EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
41EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); 38EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
42EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); 39EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
43EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
44 40
45DEFINE_IDA(blk_queue_ida); 41static int __make_request(struct request_queue *q, struct bio *bio);
46 42
47/* 43/*
48 * For the allocated request tables 44 * For the allocated request tables
@@ -220,13 +216,12 @@ static void blk_delay_work(struct work_struct *work)
220 * Description: 216 * Description:
221 * Sometimes queueing needs to be postponed for a little while, to allow 217 * Sometimes queueing needs to be postponed for a little while, to allow
222 * resources to come back. This function will make sure that queueing is 218 * resources to come back. This function will make sure that queueing is
223 * restarted around the specified time. Queue lock must be held. 219 * restarted around the specified time.
224 */ 220 */
225void blk_delay_queue(struct request_queue *q, unsigned long msecs) 221void blk_delay_queue(struct request_queue *q, unsigned long msecs)
226{ 222{
227 if (likely(!blk_queue_dead(q))) 223 queue_delayed_work(kblockd_workqueue, &q->delay_work,
228 queue_delayed_work(kblockd_workqueue, &q->delay_work, 224 msecs_to_jiffies(msecs));
229 msecs_to_jiffies(msecs));
230} 225}
231EXPORT_SYMBOL(blk_delay_queue); 226EXPORT_SYMBOL(blk_delay_queue);
232 227
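A driver's request_fn can use blk_delay_queue() exactly as the comment describes: when a transient resource shortage is hit, leave the work on the queue and ask for a re-run a few milliseconds later. A hedged sketch follows; example_dev_busy() and the 3 ms delay are illustrative assumptions, not taken from this patch.

/* Hedged sketch of the back-off pattern described above. */
static bool example_dev_busy(void *driver_data)
{
	return false;	/* placeholder for a real resource check */
}

static void example_request_fn(struct request_queue *q)
{
	struct request *rq;

	while ((rq = blk_peek_request(q)) != NULL) {
		if (example_dev_busy(q->queuedata)) {
			/* leave @rq queued and retry in ~3ms */
			blk_delay_queue(q, 3);
			return;
		}
		blk_start_request(rq);
		/* ... hand @rq to the hardware; completed immediately here */
		__blk_end_request_all(rq, 0);
	}
}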
@@ -264,7 +259,7 @@ EXPORT_SYMBOL(blk_start_queue);
264 **/ 259 **/
265void blk_stop_queue(struct request_queue *q) 260void blk_stop_queue(struct request_queue *q)
266{ 261{
267 cancel_delayed_work(&q->delay_work); 262 __cancel_delayed_work(&q->delay_work);
268 queue_flag_set(QUEUE_FLAG_STOPPED, q); 263 queue_flag_set(QUEUE_FLAG_STOPPED, q);
269} 264}
270EXPORT_SYMBOL(blk_stop_queue); 265EXPORT_SYMBOL(blk_stop_queue);
@@ -284,7 +279,7 @@ EXPORT_SYMBOL(blk_stop_queue);
284 * 279 *
285 * This function does not cancel any asynchronous activity arising 280 * This function does not cancel any asynchronous activity arising
286 * out of elevator or throttling code. That would require elevaotor_exit() 281 * out of elevator or throttling code. That would require elevaotor_exit()
287 * and blkcg_exit_queue() to be called with queue lock initialized. 282 * and blk_throtl_exit() to be called with queue lock initialized.
288 * 283 *
289 */ 284 */
290void blk_sync_queue(struct request_queue *q) 285void blk_sync_queue(struct request_queue *q)
@@ -295,34 +290,6 @@ void blk_sync_queue(struct request_queue *q)
295EXPORT_SYMBOL(blk_sync_queue); 290EXPORT_SYMBOL(blk_sync_queue);
296 291
297/** 292/**
298 * __blk_run_queue_uncond - run a queue whether or not it has been stopped
299 * @q: The queue to run
300 *
301 * Description:
302 * Invoke request handling on a queue if there are any pending requests.
303 * May be used to restart request handling after a request has completed.
304 * This variant runs the queue whether or not the queue has been
305 * stopped. Must be called with the queue lock held and interrupts
306 * disabled. See also @blk_run_queue.
307 */
308inline void __blk_run_queue_uncond(struct request_queue *q)
309{
310 if (unlikely(blk_queue_dead(q)))
311 return;
312
313 /*
314 * Some request_fn implementations, e.g. scsi_request_fn(), unlock
315 * the queue lock internally. As a result multiple threads may be
316 * running such a request function concurrently. Keep track of the
317 * number of active request_fn invocations such that blk_drain_queue()
318 * can wait until all these request_fn calls have finished.
319 */
320 q->request_fn_active++;
321 q->request_fn(q);
322 q->request_fn_active--;
323}
324
325/**
326 * __blk_run_queue - run a single device queue 293 * __blk_run_queue - run a single device queue
327 * @q: The queue to run 294 * @q: The queue to run
328 * 295 *
@@ -335,7 +302,7 @@ void __blk_run_queue(struct request_queue *q)
335 if (unlikely(blk_queue_stopped(q))) 302 if (unlikely(blk_queue_stopped(q)))
336 return; 303 return;
337 304
338 __blk_run_queue_uncond(q); 305 q->request_fn(q);
339} 306}
340EXPORT_SYMBOL(__blk_run_queue); 307EXPORT_SYMBOL(__blk_run_queue);
341 308
@@ -345,12 +312,14 @@ EXPORT_SYMBOL(__blk_run_queue);
345 * 312 *
346 * Description: 313 * Description:
347 * Tells kblockd to perform the equivalent of @blk_run_queue on behalf 314 * Tells kblockd to perform the equivalent of @blk_run_queue on behalf
348 * of us. The caller must hold the queue lock. 315 * of us.
349 */ 316 */
350void blk_run_queue_async(struct request_queue *q) 317void blk_run_queue_async(struct request_queue *q)
351{ 318{
352 if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q))) 319 if (likely(!blk_queue_stopped(q))) {
353 mod_delayed_work(kblockd_workqueue, &q->delay_work, 0); 320 __cancel_delayed_work(&q->delay_work);
321 queue_delayed_work(kblockd_workqueue, &q->delay_work, 0);
322 }
354} 323}
355EXPORT_SYMBOL(blk_run_queue_async); 324EXPORT_SYMBOL(blk_run_queue_async);
356 325
@@ -378,219 +347,59 @@ void blk_put_queue(struct request_queue *q)
378} 347}
379EXPORT_SYMBOL(blk_put_queue); 348EXPORT_SYMBOL(blk_put_queue);
380 349
381/** 350/*
382 * __blk_drain_queue - drain requests from request_queue 351 * Note: If a driver supplied the queue lock, it is disconnected
383 * @q: queue to drain 352 * by this function. The actual state of the lock doesn't matter
384 * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV 353 * here as the request_queue isn't accessible after this point
385 * 354 * (QUEUE_FLAG_DEAD is set) and no other requests will be queued.
386 * Drain requests from @q. If @drain_all is set, all requests are drained.
387 * If not, only ELVPRIV requests are drained. The caller is responsible
388 * for ensuring that no new requests which need to be drained are queued.
389 */
390static void __blk_drain_queue(struct request_queue *q, bool drain_all)
391 __releases(q->queue_lock)
392 __acquires(q->queue_lock)
393{
394 int i;
395
396 lockdep_assert_held(q->queue_lock);
397
398 while (true) {
399 bool drain = false;
400
401 /*
402 * The caller might be trying to drain @q before its
403 * elevator is initialized.
404 */
405 if (q->elevator)
406 elv_drain_elevator(q);
407
408 blkcg_drain_queue(q);
409
410 /*
411 * This function might be called on a queue which failed
 412 * driver init after queue creation or is not fully
413 * active yet. Some drivers (e.g. fd and loop) get unhappy
414 * in such cases. Kick queue iff dispatch queue has
415 * something on it and @q has request_fn set.
416 */
417 if (!list_empty(&q->queue_head) && q->request_fn)
418 __blk_run_queue(q);
419
420 drain |= q->nr_rqs_elvpriv;
421 drain |= q->request_fn_active;
422
423 /*
424 * Unfortunately, requests are queued at and tracked from
425 * multiple places and there's no single counter which can
426 * be drained. Check all the queues and counters.
427 */
428 if (drain_all) {
429 drain |= !list_empty(&q->queue_head);
430 for (i = 0; i < 2; i++) {
431 drain |= q->nr_rqs[i];
432 drain |= q->in_flight[i];
433 drain |= !list_empty(&q->flush_queue[i]);
434 }
435 }
436
437 if (!drain)
438 break;
439
440 spin_unlock_irq(q->queue_lock);
441
442 msleep(10);
443
444 spin_lock_irq(q->queue_lock);
445 }
446
447 /*
448 * With queue marked dead, any woken up waiter will fail the
449 * allocation path, so the wakeup chaining is lost and we're
450 * left with hung waiters. We need to wake up those waiters.
451 */
452 if (q->request_fn) {
453 struct request_list *rl;
454
455 blk_queue_for_each_rl(rl, q)
456 for (i = 0; i < ARRAY_SIZE(rl->wait); i++)
457 wake_up_all(&rl->wait[i]);
458 }
459}
460
461/**
462 * blk_queue_bypass_start - enter queue bypass mode
463 * @q: queue of interest
464 *
465 * In bypass mode, only the dispatch FIFO queue of @q is used. This
466 * function makes @q enter bypass mode and drains all requests which were
467 * throttled or issued before. On return, it's guaranteed that no request
 468 * is being throttled or has ELVPRIV set, and blk_queue_bypass() returns %true
 469 * inside the queue lock or RCU read lock.
470 */
471void blk_queue_bypass_start(struct request_queue *q)
472{
473 bool drain;
474
475 spin_lock_irq(q->queue_lock);
476 drain = !q->bypass_depth++;
477 queue_flag_set(QUEUE_FLAG_BYPASS, q);
478 spin_unlock_irq(q->queue_lock);
479
480 if (drain) {
481 spin_lock_irq(q->queue_lock);
482 __blk_drain_queue(q, false);
483 spin_unlock_irq(q->queue_lock);
484
485 /* ensure blk_queue_bypass() is %true inside RCU read lock */
486 synchronize_rcu();
487 }
488}
489EXPORT_SYMBOL_GPL(blk_queue_bypass_start);
490
491/**
492 * blk_queue_bypass_end - leave queue bypass mode
493 * @q: queue of interest
494 *
495 * Leave bypass mode and restore the normal queueing behavior.
496 */
497void blk_queue_bypass_end(struct request_queue *q)
498{
499 spin_lock_irq(q->queue_lock);
500 if (!--q->bypass_depth)
501 queue_flag_clear(QUEUE_FLAG_BYPASS, q);
502 WARN_ON_ONCE(q->bypass_depth < 0);
503 spin_unlock_irq(q->queue_lock);
504}
505EXPORT_SYMBOL_GPL(blk_queue_bypass_end);
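A rough sketch of the intended pairing (an assumption, not code from this patch): callers such as elevator-switch or cgroup-policy code bracket their update with bypass start/end so no ELVPRIV or throttled request is in flight while they work.

static void my_reconfigure_queue(struct request_queue *q)
{
        blk_queue_bypass_start(q);      /* drain; only the dispatch FIFO is used */
        /* ... safe to tear down or replace per-elevator/per-blkcg state here ... */
        blk_queue_bypass_end(q);        /* restore normal queueing */
}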
506
507/**
508 * blk_cleanup_queue - shutdown a request queue
509 * @q: request queue to shutdown
510 *
511 * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and
512 * put it. All future requests will be failed immediately with -ENODEV.
513 */ 355 */
514void blk_cleanup_queue(struct request_queue *q) 356void blk_cleanup_queue(struct request_queue *q)
515{ 357{
516 spinlock_t *lock = q->queue_lock;
517
518 /* mark @q DYING, no new request or merges will be allowed afterwards */
519 mutex_lock(&q->sysfs_lock);
520 queue_flag_set_unlocked(QUEUE_FLAG_DYING, q);
521 spin_lock_irq(lock);
522
523 /*
524 * A dying queue is permanently in bypass mode till released. Note
525 * that, unlike blk_queue_bypass_start(), we aren't performing
526 * synchronize_rcu() after entering bypass mode to avoid the delay
527 * as some drivers create and destroy a lot of queues while
528 * probing. This is still safe because blk_release_queue() will be
529 * called only after the queue refcnt drops to zero and nothing,
530 * RCU or not, would be traversing the queue by then.
531 */
532 q->bypass_depth++;
533 queue_flag_set(QUEUE_FLAG_BYPASS, q);
534
535 queue_flag_set(QUEUE_FLAG_NOMERGES, q);
536 queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
537 queue_flag_set(QUEUE_FLAG_DYING, q);
538 spin_unlock_irq(lock);
539 mutex_unlock(&q->sysfs_lock);
540
541 /* 358 /*
542 * Drain all requests queued before DYING marking. Set DEAD flag to 359 * We know we have process context here, so we can be a little
543 * prevent that q->request_fn() gets invoked after draining finished. 360 * cautious and ensure that pending block actions on this device
361 * are done before moving on. Going into this function, we should
362 * not have processes doing IO to this device.
544 */ 363 */
545 spin_lock_irq(lock); 364 blk_sync_queue(q);
546 __blk_drain_queue(q, true);
547 queue_flag_set(QUEUE_FLAG_DEAD, q);
548 spin_unlock_irq(lock);
549 365
550 /* @q won't process any more request, flush async actions */
551 del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); 366 del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
552 blk_sync_queue(q); 367 mutex_lock(&q->sysfs_lock);
368 queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
369 mutex_unlock(&q->sysfs_lock);
553 370
554 spin_lock_irq(lock);
555 if (q->queue_lock != &q->__queue_lock) 371 if (q->queue_lock != &q->__queue_lock)
556 q->queue_lock = &q->__queue_lock; 372 q->queue_lock = &q->__queue_lock;
557 spin_unlock_irq(lock);
558 373
559 /* @q is and will stay empty, shutdown and put */
560 blk_put_queue(q); 374 blk_put_queue(q);
561} 375}
562EXPORT_SYMBOL(blk_cleanup_queue); 376EXPORT_SYMBOL(blk_cleanup_queue);
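As a hedged illustration of where this is called from (my_dev and its fields are hypothetical): a driver's removal path usually unregisters the disk first so no new bios arrive, then tears the queue down.

static void my_driver_remove(struct my_dev *dev)
{
        del_gendisk(dev->disk);         /* stop new I/O from userspace */
        blk_cleanup_queue(dev->queue);  /* drain and drop our queue reference */
        put_disk(dev->disk);
}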
563 377
564int blk_init_rl(struct request_list *rl, struct request_queue *q, 378static int blk_init_free_list(struct request_queue *q)
565 gfp_t gfp_mask)
566{ 379{
380 struct request_list *rl = &q->rq;
381
567 if (unlikely(rl->rq_pool)) 382 if (unlikely(rl->rq_pool))
568 return 0; 383 return 0;
569 384
570 rl->q = q;
571 rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; 385 rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
572 rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; 386 rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
387 rl->elvpriv = 0;
573 init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); 388 init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
574 init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]); 389 init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
575 390
576 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, 391 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
577 mempool_free_slab, request_cachep, 392 mempool_free_slab, request_cachep, q->node);
578 gfp_mask, q->node); 393
579 if (!rl->rq_pool) 394 if (!rl->rq_pool)
580 return -ENOMEM; 395 return -ENOMEM;
581 396
582 return 0; 397 return 0;
583} 398}
584 399
585void blk_exit_rl(struct request_list *rl)
586{
587 if (rl->rq_pool)
588 mempool_destroy(rl->rq_pool);
589}
590
591struct request_queue *blk_alloc_queue(gfp_t gfp_mask) 400struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
592{ 401{
593 return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE); 402 return blk_alloc_queue_node(gfp_mask, -1);
594} 403}
595EXPORT_SYMBOL(blk_alloc_queue); 404EXPORT_SYMBOL(blk_alloc_queue);
596 405
@@ -604,10 +413,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
604 if (!q) 413 if (!q)
605 return NULL; 414 return NULL;
606 415
607 q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
608 if (q->id < 0)
609 goto fail_q;
610
611 q->backing_dev_info.ra_pages = 416 q->backing_dev_info.ra_pages =
612 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 417 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
613 q->backing_dev_info.state = 0; 418 q->backing_dev_info.state = 0;
@@ -616,18 +421,20 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
616 q->node = node_id; 421 q->node = node_id;
617 422
618 err = bdi_init(&q->backing_dev_info); 423 err = bdi_init(&q->backing_dev_info);
619 if (err) 424 if (err) {
620 goto fail_id; 425 kmem_cache_free(blk_requestq_cachep, q);
426 return NULL;
427 }
428
429 if (blk_throtl_init(q)) {
430 kmem_cache_free(blk_requestq_cachep, q);
431 return NULL;
432 }
621 433
622 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, 434 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
623 laptop_mode_timer_fn, (unsigned long) q); 435 laptop_mode_timer_fn, (unsigned long) q);
624 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); 436 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
625 INIT_LIST_HEAD(&q->queue_head);
626 INIT_LIST_HEAD(&q->timeout_list); 437 INIT_LIST_HEAD(&q->timeout_list);
627 INIT_LIST_HEAD(&q->icq_list);
628#ifdef CONFIG_BLK_CGROUP
629 INIT_LIST_HEAD(&q->blkg_list);
630#endif
631 INIT_LIST_HEAD(&q->flush_queue[0]); 438 INIT_LIST_HEAD(&q->flush_queue[0]);
632 INIT_LIST_HEAD(&q->flush_queue[1]); 439 INIT_LIST_HEAD(&q->flush_queue[1]);
633 INIT_LIST_HEAD(&q->flush_data_in_flight); 440 INIT_LIST_HEAD(&q->flush_data_in_flight);
@@ -644,25 +451,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
644 */ 451 */
645 q->queue_lock = &q->__queue_lock; 452 q->queue_lock = &q->__queue_lock;
646 453
647 /*
648 * A queue starts its life with bypass turned on to avoid
649 * unnecessary bypass on/off overhead and nasty surprises during
650 * init. The initial bypass will be finished when the queue is
651 * registered by blk_register_queue().
652 */
653 q->bypass_depth = 1;
654 __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
655
656 if (blkcg_init_queue(q))
657 goto fail_id;
658
659 return q; 454 return q;
660
661fail_id:
662 ida_simple_remove(&blk_queue_ida, q->id);
663fail_q:
664 kmem_cache_free(blk_requestq_cachep, q);
665 return NULL;
666} 455}
667EXPORT_SYMBOL(blk_alloc_queue_node); 456EXPORT_SYMBOL(blk_alloc_queue_node);
668 457
@@ -701,7 +490,7 @@ EXPORT_SYMBOL(blk_alloc_queue_node);
701 490
702struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) 491struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
703{ 492{
704 return blk_init_queue_node(rfn, lock, NUMA_NO_NODE); 493 return blk_init_queue_node(rfn, lock, -1);
705} 494}
706EXPORT_SYMBOL(blk_init_queue); 495EXPORT_SYMBOL(blk_init_queue);
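A minimal sketch of the classic request_fn-style setup this API supports (all my_* names are hypothetical; a real driver would program hardware instead of completing requests immediately).

static DEFINE_SPINLOCK(my_queue_lock);

/* called by the block layer with my_queue_lock held */
static void my_request_fn(struct request_queue *q)
{
        struct request *rq;

        while ((rq = blk_fetch_request(q)) != NULL)
                __blk_end_request_all(rq, 0);   /* complete immediately */
}

static struct request_queue *my_create_queue(void)
{
        return blk_init_queue(my_request_fn, &my_queue_lock);
}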
707 496
@@ -729,13 +518,13 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
729 if (!q) 518 if (!q)
730 return NULL; 519 return NULL;
731 520
732 if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) 521 if (blk_init_free_list(q))
733 return NULL; 522 return NULL;
734 523
735 q->request_fn = rfn; 524 q->request_fn = rfn;
736 q->prep_rq_fn = NULL; 525 q->prep_rq_fn = NULL;
737 q->unprep_rq_fn = NULL; 526 q->unprep_rq_fn = NULL;
738 q->queue_flags |= QUEUE_FLAG_DEFAULT; 527 q->queue_flags = QUEUE_FLAG_DEFAULT;
739 528
740 /* Override internal queue lock with supplied lock pointer */ 529 /* Override internal queue lock with supplied lock pointer */
741 if (lock) 530 if (lock)
@@ -744,37 +533,61 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
744 /* 533 /*
745 * This also sets hw/phys segments, boundary and size 534 * This also sets hw/phys segments, boundary and size
746 */ 535 */
747 blk_queue_make_request(q, blk_queue_bio); 536 blk_queue_make_request(q, __make_request);
748 537
749 q->sg_reserved_size = INT_MAX; 538 q->sg_reserved_size = INT_MAX;
750 539
751 /* init elevator */ 540 /*
752 if (elevator_init(q, NULL)) 541 * all done
753 return NULL; 542 */
754 return q; 543 if (!elevator_init(q, NULL)) {
544 blk_queue_congestion_threshold(q);
545 return q;
546 }
547
548 return NULL;
755} 549}
756EXPORT_SYMBOL(blk_init_allocated_queue); 550EXPORT_SYMBOL(blk_init_allocated_queue);
757 551
758bool blk_get_queue(struct request_queue *q) 552int blk_get_queue(struct request_queue *q)
759{ 553{
760 if (likely(!blk_queue_dying(q))) { 554 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
761 __blk_get_queue(q); 555 kobject_get(&q->kobj);
762 return true; 556 return 0;
763 } 557 }
764 558
765 return false; 559 return 1;
766} 560}
767EXPORT_SYMBOL(blk_get_queue); 561EXPORT_SYMBOL(blk_get_queue);
768 562
769static inline void blk_free_request(struct request_list *rl, struct request *rq) 563static inline void blk_free_request(struct request_queue *q, struct request *rq)
770{ 564{
771 if (rq->cmd_flags & REQ_ELVPRIV) { 565 if (rq->cmd_flags & REQ_ELVPRIV)
772 elv_put_request(rl->q, rq); 566 elv_put_request(q, rq);
773 if (rq->elv.icq) 567 mempool_free(rq, q->rq.rq_pool);
774 put_io_context(rq->elv.icq->ioc); 568}
569
570static struct request *
571blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask)
572{
573 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
574
575 if (!rq)
576 return NULL;
577
578 blk_rq_init(q, rq);
579
580 rq->cmd_flags = flags | REQ_ALLOCED;
581
582 if (priv) {
583 if (unlikely(elv_set_request(q, rq, gfp_mask))) {
584 mempool_free(rq, q->rq.rq_pool);
585 return NULL;
586 }
587 rq->cmd_flags |= REQ_ELVPRIV;
775 } 588 }
776 589
777 mempool_free(rq, rl->rq_pool); 590 return rq;
778} 591}
779 592
780/* 593/*
@@ -811,23 +624,18 @@ static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
811 ioc->last_waited = jiffies; 624 ioc->last_waited = jiffies;
812} 625}
813 626
814static void __freed_request(struct request_list *rl, int sync) 627static void __freed_request(struct request_queue *q, int sync)
815{ 628{
816 struct request_queue *q = rl->q; 629 struct request_list *rl = &q->rq;
817 630
818 /* 631 if (rl->count[sync] < queue_congestion_off_threshold(q))
819 * bdi isn't aware of blkcg yet. As all async IOs end up root
820 * blkcg anyway, just use root blkcg state.
821 */
822 if (rl == &q->root_rl &&
823 rl->count[sync] < queue_congestion_off_threshold(q))
824 blk_clear_queue_congested(q, sync); 632 blk_clear_queue_congested(q, sync);
825 633
826 if (rl->count[sync] + 1 <= q->nr_requests) { 634 if (rl->count[sync] + 1 <= q->nr_requests) {
827 if (waitqueue_active(&rl->wait[sync])) 635 if (waitqueue_active(&rl->wait[sync]))
828 wake_up(&rl->wait[sync]); 636 wake_up(&rl->wait[sync]);
829 637
830 blk_clear_rl_full(rl, sync); 638 blk_clear_queue_full(q, sync);
831 } 639 }
832} 640}
833 641
@@ -835,20 +643,18 @@ static void __freed_request(struct request_list *rl, int sync)
835 * A request has just been released. Account for it, update the full and 643 * A request has just been released. Account for it, update the full and
836 * congestion status, wake up any waiters. Called under q->queue_lock. 644 * congestion status, wake up any waiters. Called under q->queue_lock.
837 */ 645 */
838static void freed_request(struct request_list *rl, unsigned int flags) 646static void freed_request(struct request_queue *q, int sync, int priv)
839{ 647{
840 struct request_queue *q = rl->q; 648 struct request_list *rl = &q->rq;
841 int sync = rw_is_sync(flags);
842 649
843 q->nr_rqs[sync]--;
844 rl->count[sync]--; 650 rl->count[sync]--;
845 if (flags & REQ_ELVPRIV) 651 if (priv)
846 q->nr_rqs_elvpriv--; 652 rl->elvpriv--;
847 653
848 __freed_request(rl, sync); 654 __freed_request(q, sync);
849 655
850 if (unlikely(rl->starved[sync ^ 1])) 656 if (unlikely(rl->starved[sync ^ 1]))
851 __freed_request(rl, sync ^ 1); 657 __freed_request(q, sync ^ 1);
852} 658}
853 659
854/* 660/*
@@ -870,49 +676,19 @@ static bool blk_rq_should_init_elevator(struct bio *bio)
870 return true; 676 return true;
871} 677}
872 678
873/** 679/*
874 * rq_ioc - determine io_context for request allocation 680 * Get a free request, queue_lock must be held.
875 * @bio: request being allocated is for this bio (can be %NULL) 681 * Returns NULL on failure, with queue_lock held.
876 * 682 * Returns !NULL on success, with queue_lock *not held*.
877 * Determine io_context to use for request allocation for @bio. May return
878 * %NULL if %current->io_context doesn't exist.
879 */
880static struct io_context *rq_ioc(struct bio *bio)
881{
882#ifdef CONFIG_BLK_CGROUP
883 if (bio && bio->bi_ioc)
884 return bio->bi_ioc;
885#endif
886 return current->io_context;
887}
888
889/**
890 * __get_request - get a free request
891 * @rl: request list to allocate from
892 * @rw_flags: RW and SYNC flags
893 * @bio: bio to allocate request for (can be %NULL)
894 * @gfp_mask: allocation mask
895 *
896 * Get a free request from @q. This function may fail under memory
897 * pressure or if @q is dead.
898 *
 900 * Must be called with @q->queue_lock held and,
900 * Returns %NULL on failure, with @q->queue_lock held.
901 * Returns !%NULL on success, with @q->queue_lock *not held*.
902 */ 683 */
903static struct request *__get_request(struct request_list *rl, int rw_flags, 684static struct request *get_request(struct request_queue *q, int rw_flags,
904 struct bio *bio, gfp_t gfp_mask) 685 struct bio *bio, gfp_t gfp_mask)
905{ 686{
906 struct request_queue *q = rl->q; 687 struct request *rq = NULL;
907 struct request *rq; 688 struct request_list *rl = &q->rq;
908 struct elevator_type *et = q->elevator->type; 689 struct io_context *ioc = NULL;
909 struct io_context *ioc = rq_ioc(bio);
910 struct io_cq *icq = NULL;
911 const bool is_sync = rw_is_sync(rw_flags) != 0; 690 const bool is_sync = rw_is_sync(rw_flags) != 0;
912 int may_queue; 691 int may_queue, priv = 0;
913
914 if (unlikely(blk_queue_dying(q)))
915 return NULL;
916 692
917 may_queue = elv_may_queue(q, rw_flags); 693 may_queue = elv_may_queue(q, rw_flags);
918 if (may_queue == ELV_MQUEUE_NO) 694 if (may_queue == ELV_MQUEUE_NO)
@@ -920,15 +696,16 @@ static struct request *__get_request(struct request_list *rl, int rw_flags,
920 696
921 if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) { 697 if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
922 if (rl->count[is_sync]+1 >= q->nr_requests) { 698 if (rl->count[is_sync]+1 >= q->nr_requests) {
699 ioc = current_io_context(GFP_ATOMIC, q->node);
923 /* 700 /*
924 * The queue will fill after this allocation, so set 701 * The queue will fill after this allocation, so set
925 * it as full, and mark this process as "batching". 702 * it as full, and mark this process as "batching".
926 * This process will be allowed to complete a batch of 703 * This process will be allowed to complete a batch of
927 * requests, others will be blocked. 704 * requests, others will be blocked.
928 */ 705 */
929 if (!blk_rl_full(rl, is_sync)) { 706 if (!blk_queue_full(q, is_sync)) {
930 ioc_set_batching(q, ioc); 707 ioc_set_batching(q, ioc);
931 blk_set_rl_full(rl, is_sync); 708 blk_set_queue_full(q, is_sync);
932 } else { 709 } else {
933 if (may_queue != ELV_MQUEUE_MUST 710 if (may_queue != ELV_MQUEUE_MUST
934 && !ioc_batching(q, ioc)) { 711 && !ioc_batching(q, ioc)) {
@@ -937,16 +714,11 @@ static struct request *__get_request(struct request_list *rl, int rw_flags,
937 * process is not a "batcher", and not 714 * process is not a "batcher", and not
938 * exempted by the IO scheduler 715 * exempted by the IO scheduler
939 */ 716 */
940 return NULL; 717 goto out;
941 } 718 }
942 } 719 }
943 } 720 }
944 /* 721 blk_set_queue_congested(q, is_sync);
945 * bdi isn't aware of blkcg yet. As all async IOs end up
946 * root blkcg anyway, just use root blkcg state.
947 */
948 if (rl == &q->root_rl)
949 blk_set_queue_congested(q, is_sync);
950 } 722 }
951 723
952 /* 724 /*
@@ -955,60 +727,47 @@ static struct request *__get_request(struct request_list *rl, int rw_flags,
955 * allocated with any setting of ->nr_requests 727 * allocated with any setting of ->nr_requests
956 */ 728 */
957 if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) 729 if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
958 return NULL; 730 goto out;
959 731
960 q->nr_rqs[is_sync]++;
961 rl->count[is_sync]++; 732 rl->count[is_sync]++;
962 rl->starved[is_sync] = 0; 733 rl->starved[is_sync] = 0;
963 734
964 /* 735 if (blk_rq_should_init_elevator(bio)) {
965 * Decide whether the new request will be managed by elevator. If 736 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
966 * so, mark @rw_flags and increment elvpriv. Non-zero elvpriv will 737 if (priv)
967 * prevent the current elevator from being destroyed until the new 738 rl->elvpriv++;
968 * request is freed. This guarantees icq's won't be destroyed and
969 * makes creating new ones safe.
970 *
971 * Also, lookup icq while holding queue_lock. If it doesn't exist,
972 * it will be created after releasing queue_lock.
973 */
974 if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) {
975 rw_flags |= REQ_ELVPRIV;
976 q->nr_rqs_elvpriv++;
977 if (et->icq_cache && ioc)
978 icq = ioc_lookup_icq(ioc, q);
979 } 739 }
980 740
981 if (blk_queue_io_stat(q)) 741 if (blk_queue_io_stat(q))
982 rw_flags |= REQ_IO_STAT; 742 rw_flags |= REQ_IO_STAT;
983 spin_unlock_irq(q->queue_lock); 743 spin_unlock_irq(q->queue_lock);
984 744
985 /* allocate and init request */ 745 rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
986 rq = mempool_alloc(rl->rq_pool, gfp_mask); 746 if (unlikely(!rq)) {
987 if (!rq) 747 /*
988 goto fail_alloc; 748 * Allocation failed presumably due to memory. Undo anything
989 749 * we might have messed up.
990 blk_rq_init(q, rq); 750 *
991 blk_rq_set_rl(rq, rl); 751 * Allocating task should really be put onto the front of the
992 rq->cmd_flags = rw_flags | REQ_ALLOCED; 752 * wait queue, but this is pretty rare.
993 753 */
994 /* init elvpriv */ 754 spin_lock_irq(q->queue_lock);
995 if (rw_flags & REQ_ELVPRIV) { 755 freed_request(q, is_sync, priv);
996 if (unlikely(et->icq_cache && !icq)) {
997 if (ioc)
998 icq = ioc_create_icq(ioc, q, gfp_mask);
999 if (!icq)
1000 goto fail_elvpriv;
1001 }
1002 756
1003 rq->elv.icq = icq; 757 /*
1004 if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) 758 * in the very unlikely event that allocation failed and no
1005 goto fail_elvpriv; 759 * requests for this direction was pending, mark us starved
760 * so that freeing of a request in the other direction will
 761 * notice us. Another possible fix would be to split the
762 * rq mempool into READ and WRITE
763 */
764rq_starved:
765 if (unlikely(rl->count[is_sync] == 0))
766 rl->starved[is_sync] = 1;
1006 767
1007 /* @rq->elv.icq holds io_context until @rq is freed */ 768 goto out;
1008 if (icq)
1009 get_io_context(icq->ioc);
1010 } 769 }
1011out: 770
1012 /* 771 /*
1013 * ioc may be NULL here, and ioc_batching will be false. That's 772 * ioc may be NULL here, and ioc_batching will be false. That's
1014 * OK, if the queue is under the request limit then requests need 773 * OK, if the queue is under the request limit then requests need
@@ -1019,118 +778,71 @@ out:
1019 ioc->nr_batch_requests--; 778 ioc->nr_batch_requests--;
1020 779
1021 trace_block_getrq(q, bio, rw_flags & 1); 780 trace_block_getrq(q, bio, rw_flags & 1);
781out:
1022 return rq; 782 return rq;
1023
1024fail_elvpriv:
1025 /*
1026 * elvpriv init failed. ioc, icq and elvpriv aren't mempool backed
1027 * and may fail indefinitely under memory pressure and thus
1028 * shouldn't stall IO. Treat this request as !elvpriv. This will
1029 * disturb iosched and blkcg but weird is bettern than dead.
1030 */
1031 printk_ratelimited(KERN_WARNING "%s: request aux data allocation failed, iosched may be disturbed\n",
1032 dev_name(q->backing_dev_info.dev));
1033
1034 rq->cmd_flags &= ~REQ_ELVPRIV;
1035 rq->elv.icq = NULL;
1036
1037 spin_lock_irq(q->queue_lock);
1038 q->nr_rqs_elvpriv--;
1039 spin_unlock_irq(q->queue_lock);
1040 goto out;
1041
1042fail_alloc:
1043 /*
1044 * Allocation failed presumably due to memory. Undo anything we
1045 * might have messed up.
1046 *
1047 * Allocating task should really be put onto the front of the wait
1048 * queue, but this is pretty rare.
1049 */
1050 spin_lock_irq(q->queue_lock);
1051 freed_request(rl, rw_flags);
1052
1053 /*
1054 * in the very unlikely event that allocation failed and no
1055 * requests for this direction were pending, mark us starved so that
1056 * freeing of a request in the other direction will notice
1057 * us. Another possible fix would be to split the rq mempool into
1058 * READ and WRITE
1059 */
1060rq_starved:
1061 if (unlikely(rl->count[is_sync] == 0))
1062 rl->starved[is_sync] = 1;
1063 return NULL;
1064} 783}
1065 784
1066/** 785/*
1067 * get_request - get a free request 786 * No available requests for this queue, wait for some requests to become
1068 * @q: request_queue to allocate request from 787 * available.
1069 * @rw_flags: RW and SYNC flags 788 *
1070 * @bio: bio to allocate request for (can be %NULL) 789 * Called with q->queue_lock held, and returns with it unlocked.
1071 * @gfp_mask: allocation mask
1072 *
1073 * Get a free request from @q. If %__GFP_WAIT is set in @gfp_mask, this
1074 * function keeps retrying under memory pressure and fails iff @q is dead.
1075 *
1076 * Must be called with @q->queue_lock held and,
1077 * Returns %NULL on failure, with @q->queue_lock held.
1078 * Returns !%NULL on success, with @q->queue_lock *not held*.
1079 */ 790 */
1080static struct request *get_request(struct request_queue *q, int rw_flags, 791static struct request *get_request_wait(struct request_queue *q, int rw_flags,
1081 struct bio *bio, gfp_t gfp_mask) 792 struct bio *bio)
1082{ 793{
1083 const bool is_sync = rw_is_sync(rw_flags) != 0; 794 const bool is_sync = rw_is_sync(rw_flags) != 0;
1084 DEFINE_WAIT(wait);
1085 struct request_list *rl;
1086 struct request *rq; 795 struct request *rq;
1087 796
1088 rl = blk_get_rl(q, bio); /* transferred to @rq on success */ 797 rq = get_request(q, rw_flags, bio, GFP_NOIO);
1089retry: 798 while (!rq) {
1090 rq = __get_request(rl, rw_flags, bio, gfp_mask); 799 DEFINE_WAIT(wait);
1091 if (rq) 800 struct io_context *ioc;
1092 return rq; 801 struct request_list *rl = &q->rq;
1093 802
1094 if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dying(q))) { 803 prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
1095 blk_put_rl(rl); 804 TASK_UNINTERRUPTIBLE);
1096 return NULL;
1097 }
1098 805
1099 /* wait on @rl and retry */ 806 trace_block_sleeprq(q, bio, rw_flags & 1);
1100 prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
1101 TASK_UNINTERRUPTIBLE);
1102 807
1103 trace_block_sleeprq(q, bio, rw_flags & 1); 808 spin_unlock_irq(q->queue_lock);
809 io_schedule();
1104 810
1105 spin_unlock_irq(q->queue_lock); 811 /*
1106 io_schedule(); 812 * After sleeping, we become a "batching" process and
813 * will be able to allocate at least one request, and
814 * up to a big batch of them for a small period time.
815 * See ioc_batching, ioc_set_batching
816 */
817 ioc = current_io_context(GFP_NOIO, q->node);
818 ioc_set_batching(q, ioc);
1107 819
1108 /* 820 spin_lock_irq(q->queue_lock);
1109 * After sleeping, we become a "batching" process and will be able 821 finish_wait(&rl->wait[is_sync], &wait);
1110 * to allocate at least one request, and up to a big batch of them
1111 * for a small period of time. See ioc_batching, ioc_set_batching
1112 */
1113 ioc_set_batching(q, current->io_context);
1114 822
1115 spin_lock_irq(q->queue_lock); 823 rq = get_request(q, rw_flags, bio, GFP_NOIO);
1116 finish_wait(&rl->wait[is_sync], &wait); 824 };
1117 825
1118 goto retry; 826 return rq;
1119} 827}
1120 828
1121struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) 829struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
1122{ 830{
1123 struct request *rq; 831 struct request *rq;
1124 832
1125 BUG_ON(rw != READ && rw != WRITE); 833 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
834 return NULL;
1126 835
1127 /* create ioc upfront */ 836 BUG_ON(rw != READ && rw != WRITE);
1128 create_io_context(gfp_mask, q->node);
1129 837
1130 spin_lock_irq(q->queue_lock); 838 spin_lock_irq(q->queue_lock);
1131 rq = get_request(q, rw, NULL, gfp_mask); 839 if (gfp_mask & __GFP_WAIT) {
1132 if (!rq) 840 rq = get_request_wait(q, rw, NULL);
1133 spin_unlock_irq(q->queue_lock); 841 } else {
842 rq = get_request(q, rw, NULL, gfp_mask);
843 if (!rq)
844 spin_unlock_irq(q->queue_lock);
845 }
1134 /* q->queue_lock is unlocked at this point */ 846 /* q->queue_lock is unlocked at this point */
1135 847
1136 return rq; 848 return rq;
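A hedged usage sketch for blk_get_request() (my_send_special() is hypothetical, and a real caller would also map a data buffer before executing the request): allocate a pass-through request, mark it special, and fire it synchronously.

static int my_send_special(struct request_queue *q)
{
        struct request *rq = blk_get_request(q, WRITE, GFP_KERNEL);
        int err;

        if (!rq)
                return -ENOMEM;
        rq->cmd_type = REQ_TYPE_SPECIAL;
        err = blk_execute_rq(q, NULL, rq, 0);   /* insert, run the queue, wait */
        blk_put_request(rq);
        return err;
}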
@@ -1224,6 +936,54 @@ static void add_acct_request(struct request_queue *q, struct request *rq,
1224 __elv_add_request(q, rq, where); 936 __elv_add_request(q, rq, where);
1225} 937}
1226 938
939/**
940 * blk_insert_request - insert a special request into a request queue
941 * @q: request queue where request should be inserted
942 * @rq: request to be inserted
943 * @at_head: insert request at head or tail of queue
944 * @data: private data
945 *
946 * Description:
947 * Many block devices need to execute commands asynchronously, so they don't
948 * block the whole kernel from preemption during request execution. This is
 949 * accomplished normally by inserting artificial requests tagged as
 950 * REQ_TYPE_SPECIAL into the corresponding request queue, and letting them
951 * be scheduled for actual execution by the request queue.
952 *
 953 * We have the option of inserting at the head or the tail of the queue.
954 * Typically we use the tail for new ioctls and so forth. We use the head
955 * of the queue for things like a QUEUE_FULL message from a device, or a
956 * host that is unable to accept a particular command.
957 */
958void blk_insert_request(struct request_queue *q, struct request *rq,
959 int at_head, void *data)
960{
961 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
962 unsigned long flags;
963
964 /*
965 * tell I/O scheduler that this isn't a regular read/write (ie it
966 * must not attempt merges on this) and that it acts as a soft
967 * barrier
968 */
969 rq->cmd_type = REQ_TYPE_SPECIAL;
970
971 rq->special = data;
972
973 spin_lock_irqsave(q->queue_lock, flags);
974
975 /*
976 * If command is tagged, release the tag
977 */
978 if (blk_rq_tagged(rq))
979 blk_queue_end_tag(q, rq);
980
981 add_acct_request(q, rq, where);
982 __blk_run_queue(q);
983 spin_unlock_irqrestore(q->queue_lock, flags);
984}
985EXPORT_SYMBOL(blk_insert_request);
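For illustration only (rq is assumed to have been prepared earlier and my_dev is hypothetical driver data): pushing such a special request to the head of the queue, e.g. after a QUEUE_FULL-style condition as the comment above describes.

static void my_requeue_at_head(struct request_queue *q, struct request *rq,
                               struct my_dev *dev)
{
        blk_insert_request(q, rq, 1 /* at_head */, dev);
}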
986
1227static void part_round_stats_single(int cpu, struct hd_struct *part, 987static void part_round_stats_single(int cpu, struct hd_struct *part,
1228 unsigned long now) 988 unsigned long now)
1229{ 989{
@@ -1284,15 +1044,14 @@ void __blk_put_request(struct request_queue *q, struct request *req)
1284 * it didn't come out of our reserved rq pools 1044 * it didn't come out of our reserved rq pools
1285 */ 1045 */
1286 if (req->cmd_flags & REQ_ALLOCED) { 1046 if (req->cmd_flags & REQ_ALLOCED) {
1287 unsigned int flags = req->cmd_flags; 1047 int is_sync = rq_is_sync(req) != 0;
1288 struct request_list *rl = blk_rq_rl(req); 1048 int priv = req->cmd_flags & REQ_ELVPRIV;
1289 1049
1290 BUG_ON(!list_empty(&req->queuelist)); 1050 BUG_ON(!list_empty(&req->queuelist));
1291 BUG_ON(!hlist_unhashed(&req->hash)); 1051 BUG_ON(!hlist_unhashed(&req->hash));
1292 1052
1293 blk_free_request(rl, req); 1053 blk_free_request(q, req);
1294 freed_request(rl, flags); 1054 freed_request(q, is_sync, priv);
1295 blk_put_rl(rl);
1296 } 1055 }
1297} 1056}
1298EXPORT_SYMBOL_GPL(__blk_put_request); 1057EXPORT_SYMBOL_GPL(__blk_put_request);
@@ -1359,6 +1118,7 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
1359 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); 1118 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1360 1119
1361 drive_stat_acct(req, 0); 1120 drive_stat_acct(req, 0);
1121 elv_bio_merged(q, req, bio);
1362 return true; 1122 return true;
1363} 1123}
1364 1124
@@ -1389,34 +1149,22 @@ static bool bio_attempt_front_merge(struct request_queue *q,
1389 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); 1149 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1390 1150
1391 drive_stat_acct(req, 0); 1151 drive_stat_acct(req, 0);
1152 elv_bio_merged(q, req, bio);
1392 return true; 1153 return true;
1393} 1154}
1394 1155
1395/** 1156/*
1396 * attempt_plug_merge - try to merge with %current's plugged list 1157 * Attempts to merge with the plugged list in the current process. Returns
1397 * @q: request_queue new bio is being queued at 1158 * true if merge was successful, otherwise false.
1398 * @bio: new bio being queued
1399 * @request_count: out parameter for number of traversed plugged requests
1400 *
1401 * Determine whether @bio being queued on @q can be merged with a request
1402 * on %current's plugged list. Returns %true if merge was successful,
1403 * otherwise %false.
1404 *
1405 * Plugging coalesces IOs from the same issuer for the same purpose without
1406 * going through @q->queue_lock. As such it's more of an issuing mechanism
1407 * than scheduling, and the request, while it may have elvpriv data, is not
1408 * added on the elevator at this point. In addition, we don't have
1409 * reliable access to the elevator outside queue lock. Only check basic
1410 * merging parameters without querying the elevator.
1411 */ 1159 */
1412static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, 1160static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q,
1413 unsigned int *request_count) 1161 struct bio *bio, unsigned int *request_count)
1414{ 1162{
1415 struct blk_plug *plug; 1163 struct blk_plug *plug;
1416 struct request *rq; 1164 struct request *rq;
1417 bool ret = false; 1165 bool ret = false;
1418 1166
1419 plug = current->plug; 1167 plug = tsk->plug;
1420 if (!plug) 1168 if (!plug)
1421 goto out; 1169 goto out;
1422 *request_count = 0; 1170 *request_count = 0;
@@ -1424,13 +1172,12 @@ static bool attempt_plug_merge(struct request_queue *q, struct bio *bio,
1424 list_for_each_entry_reverse(rq, &plug->list, queuelist) { 1172 list_for_each_entry_reverse(rq, &plug->list, queuelist) {
1425 int el_ret; 1173 int el_ret;
1426 1174
1427 if (rq->q == q) 1175 (*request_count)++;
1428 (*request_count)++;
1429 1176
1430 if (rq->q != q || !blk_rq_merge_ok(rq, bio)) 1177 if (rq->q != q)
1431 continue; 1178 continue;
1432 1179
1433 el_ret = blk_try_merge(rq, bio); 1180 el_ret = elv_try_merge(rq, bio);
1434 if (el_ret == ELEVATOR_BACK_MERGE) { 1181 if (el_ret == ELEVATOR_BACK_MERGE) {
1435 ret = bio_attempt_back_merge(q, rq, bio); 1182 ret = bio_attempt_back_merge(q, rq, bio);
1436 if (ret) 1183 if (ret)
@@ -1447,6 +1194,7 @@ out:
1447 1194
1448void init_request_from_bio(struct request *req, struct bio *bio) 1195void init_request_from_bio(struct request *req, struct bio *bio)
1449{ 1196{
1197 req->cpu = bio->bi_comp_cpu;
1450 req->cmd_type = REQ_TYPE_FS; 1198 req->cmd_type = REQ_TYPE_FS;
1451 1199
1452 req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK; 1200 req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK;
@@ -1459,7 +1207,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
1459 blk_rq_bio_prep(req->q, req, bio); 1207 blk_rq_bio_prep(req->q, req, bio);
1460} 1208}
1461 1209
1462void blk_queue_bio(struct request_queue *q, struct bio *bio) 1210static int __make_request(struct request_queue *q, struct bio *bio)
1463{ 1211{
1464 const bool sync = !!(bio->bi_rw & REQ_SYNC); 1212 const bool sync = !!(bio->bi_rw & REQ_SYNC);
1465 struct blk_plug *plug; 1213 struct blk_plug *plug;
@@ -1484,22 +1232,20 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio)
1484 * Check if we can merge with the plugged list before grabbing 1232 * Check if we can merge with the plugged list before grabbing
1485 * any locks. 1233 * any locks.
1486 */ 1234 */
1487 if (attempt_plug_merge(q, bio, &request_count)) 1235 if (attempt_plug_merge(current, q, bio, &request_count))
1488 return; 1236 goto out;
1489 1237
1490 spin_lock_irq(q->queue_lock); 1238 spin_lock_irq(q->queue_lock);
1491 1239
1492 el_ret = elv_merge(q, &req, bio); 1240 el_ret = elv_merge(q, &req, bio);
1493 if (el_ret == ELEVATOR_BACK_MERGE) { 1241 if (el_ret == ELEVATOR_BACK_MERGE) {
1494 if (bio_attempt_back_merge(q, req, bio)) { 1242 if (bio_attempt_back_merge(q, req, bio)) {
1495 elv_bio_merged(q, req, bio);
1496 if (!attempt_back_merge(q, req)) 1243 if (!attempt_back_merge(q, req))
1497 elv_merged_request(q, req, el_ret); 1244 elv_merged_request(q, req, el_ret);
1498 goto out_unlock; 1245 goto out_unlock;
1499 } 1246 }
1500 } else if (el_ret == ELEVATOR_FRONT_MERGE) { 1247 } else if (el_ret == ELEVATOR_FRONT_MERGE) {
1501 if (bio_attempt_front_merge(q, req, bio)) { 1248 if (bio_attempt_front_merge(q, req, bio)) {
1502 elv_bio_merged(q, req, bio);
1503 if (!attempt_front_merge(q, req)) 1249 if (!attempt_front_merge(q, req))
1504 elv_merged_request(q, req, el_ret); 1250 elv_merged_request(q, req, el_ret);
1505 goto out_unlock; 1251 goto out_unlock;
@@ -1520,11 +1266,7 @@ get_rq:
1520 * Grab a free request. This might sleep but cannot fail. 1266 * Grab a free request. This might sleep but cannot fail.
1521 * Returns with the queue unlocked. 1267 * Returns with the queue unlocked.
1522 */ 1268 */
1523 req = get_request(q, rw_flags, bio, GFP_NOIO); 1269 req = get_request_wait(q, rw_flags, bio);
1524 if (unlikely(!req)) {
1525 bio_endio(bio, -ENODEV); /* @q is dead */
1526 goto out_unlock;
1527 }
1528 1270
1529 /* 1271 /*
1530 * After dropping the lock and possibly sleeping here, our request 1272 * After dropping the lock and possibly sleeping here, our request
@@ -1534,7 +1276,8 @@ get_rq:
1534 */ 1276 */
1535 init_request_from_bio(req, bio); 1277 init_request_from_bio(req, bio);
1536 1278
1537 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) 1279 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
1280 bio_flagged(bio, BIO_CPU_AFFINE))
1538 req->cpu = raw_smp_processor_id(); 1281 req->cpu = raw_smp_processor_id();
1539 1282
1540 plug = current->plug; 1283 plug = current->plug;
@@ -1547,19 +1290,15 @@ get_rq:
1547 */ 1290 */
1548 if (list_empty(&plug->list)) 1291 if (list_empty(&plug->list))
1549 trace_block_plug(q); 1292 trace_block_plug(q);
1550 else { 1293 else if (!plug->should_sort) {
1551 if (!plug->should_sort) { 1294 struct request *__rq;
1552 struct request *__rq;
1553 1295
1554 __rq = list_entry_rq(plug->list.prev); 1296 __rq = list_entry_rq(plug->list.prev);
1555 if (__rq->q != q) 1297 if (__rq->q != q)
1556 plug->should_sort = 1; 1298 plug->should_sort = 1;
1557 }
1558 if (request_count >= BLK_MAX_REQUEST_COUNT) {
1559 blk_flush_plug_list(plug, false);
1560 trace_block_plug(q);
1561 }
1562 } 1299 }
1300 if (request_count >= BLK_MAX_REQUEST_COUNT)
1301 blk_flush_plug_list(plug, false);
1563 list_add_tail(&req->queuelist, &plug->list); 1302 list_add_tail(&req->queuelist, &plug->list);
1564 drive_stat_acct(req, 1); 1303 drive_stat_acct(req, 1);
1565 } else { 1304 } else {
@@ -1569,8 +1308,9 @@ get_rq:
1569out_unlock: 1308out_unlock:
1570 spin_unlock_irq(q->queue_lock); 1309 spin_unlock_irq(q->queue_lock);
1571 } 1310 }
1311out:
1312 return 0;
1572} 1313}
1573EXPORT_SYMBOL_GPL(blk_queue_bio); /* for device mapper only */
1574 1314
1575/* 1315/*
1576 * If bio->bi_dev is a partition, remap the location 1316 * If bio->bi_dev is a partition, remap the location
@@ -1669,147 +1409,165 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
1669 return 0; 1409 return 0;
1670} 1410}
1671 1411
1672static noinline_for_stack bool 1412/**
1673generic_make_request_checks(struct bio *bio) 1413 * generic_make_request - hand a buffer to its device driver for I/O
1414 * @bio: The bio describing the location in memory and on the device.
1415 *
1416 * generic_make_request() is used to make I/O requests of block
1417 * devices. It is passed a &struct bio, which describes the I/O that needs
1418 * to be done.
1419 *
1420 * generic_make_request() does not return any status. The
1421 * success/failure status of the request, along with notification of
1422 * completion, is delivered asynchronously through the bio->bi_end_io
1423 * function described (one day) elsewhere.
1424 *
1425 * The caller of generic_make_request must make sure that bi_io_vec
1426 * are set to describe the memory buffer, and that bi_dev and bi_sector are
1427 * set to describe the device address, and the
1428 * bi_end_io and optionally bi_private are set to describe how
1429 * completion notification should be signaled.
1430 *
1431 * generic_make_request and the drivers it calls may use bi_next if this
1432 * bio happens to be merged with someone else, and may change bi_dev and
1433 * bi_sector for remaps as it sees fit. So the values of these fields
1434 * should NOT be depended on after the call to generic_make_request.
1435 */
1436static inline void __generic_make_request(struct bio *bio)
1674{ 1437{
1675 struct request_queue *q; 1438 struct request_queue *q;
1676 int nr_sectors = bio_sectors(bio); 1439 sector_t old_sector;
1440 int ret, nr_sectors = bio_sectors(bio);
1441 dev_t old_dev;
1677 int err = -EIO; 1442 int err = -EIO;
1678 char b[BDEVNAME_SIZE];
1679 struct hd_struct *part;
1680 1443
1681 might_sleep(); 1444 might_sleep();
1682 1445
1683 if (bio_check_eod(bio, nr_sectors)) 1446 if (bio_check_eod(bio, nr_sectors))
1684 goto end_io; 1447 goto end_io;
1685 1448
1686 q = bdev_get_queue(bio->bi_bdev); 1449 /*
1687 if (unlikely(!q)) { 1450 * Resolve the mapping until finished. (drivers are
1688 printk(KERN_ERR 1451 * still free to implement/resolve their own stacking
1689 "generic_make_request: Trying to access " 1452 * by explicitly returning 0)
1690 "nonexistent block-device %s (%Lu)\n", 1453 *
1691 bdevname(bio->bi_bdev, b), 1454 * NOTE: we don't repeat the blk_size check for each new device.
1692 (long long) bio->bi_sector); 1455 * Stacking drivers are expected to know what they are doing.
1693 goto end_io; 1456 */
1694 } 1457 old_sector = -1;
1458 old_dev = 0;
1459 do {
1460 char b[BDEVNAME_SIZE];
1461 struct hd_struct *part;
1695 1462
1696 if (likely(bio_is_rw(bio) && 1463 q = bdev_get_queue(bio->bi_bdev);
1697 nr_sectors > queue_max_hw_sectors(q))) { 1464 if (unlikely(!q)) {
1698 printk(KERN_ERR "bio too big device %s (%u > %u)\n", 1465 printk(KERN_ERR
1699 bdevname(bio->bi_bdev, b), 1466 "generic_make_request: Trying to access "
1700 bio_sectors(bio), 1467 "nonexistent block-device %s (%Lu)\n",
1701 queue_max_hw_sectors(q)); 1468 bdevname(bio->bi_bdev, b),
1702 goto end_io; 1469 (long long) bio->bi_sector);
1703 } 1470 goto end_io;
1471 }
1704 1472
1705 part = bio->bi_bdev->bd_part; 1473 if (unlikely(!(bio->bi_rw & REQ_DISCARD) &&
1706 if (should_fail_request(part, bio->bi_size) || 1474 nr_sectors > queue_max_hw_sectors(q))) {
1707 should_fail_request(&part_to_disk(part)->part0, 1475 printk(KERN_ERR "bio too big device %s (%u > %u)\n",
1708 bio->bi_size)) 1476 bdevname(bio->bi_bdev, b),
1709 goto end_io; 1477 bio_sectors(bio),
1478 queue_max_hw_sectors(q));
1479 goto end_io;
1480 }
1710 1481
1711 /* 1482 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
1712 * If this device has partitions, remap block n 1483 goto end_io;
1713 * of partition p to block n+start(p) of the disk.
1714 */
1715 blk_partition_remap(bio);
1716 1484
1717 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) 1485 part = bio->bi_bdev->bd_part;
1718 goto end_io; 1486 if (should_fail_request(part, bio->bi_size) ||
1487 should_fail_request(&part_to_disk(part)->part0,
1488 bio->bi_size))
1489 goto end_io;
1719 1490
1720 if (bio_check_eod(bio, nr_sectors)) 1491 /*
1721 goto end_io; 1492 * If this device has partitions, remap block n
1493 * of partition p to block n+start(p) of the disk.
1494 */
1495 blk_partition_remap(bio);
1722 1496
1723 /* 1497 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
1724 * Filter flush bio's early so that make_request based 1498 goto end_io;
1725 * drivers without flush support don't have to worry 1499
1726 * about them. 1500 if (old_sector != -1)
1727 */ 1501 trace_block_bio_remap(q, bio, old_dev, old_sector);
1728 if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) { 1502
1729 bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA); 1503 old_sector = bio->bi_sector;
1730 if (!nr_sectors) { 1504 old_dev = bio->bi_bdev->bd_dev;
1731 err = 0; 1505
1506 if (bio_check_eod(bio, nr_sectors))
1732 goto end_io; 1507 goto end_io;
1508
1509 /*
1510 * Filter flush bio's early so that make_request based
1511 * drivers without flush support don't have to worry
1512 * about them.
1513 */
1514 if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
1515 bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
1516 if (!nr_sectors) {
1517 err = 0;
1518 goto end_io;
1519 }
1733 } 1520 }
1734 }
1735 1521
1736 if ((bio->bi_rw & REQ_DISCARD) && 1522 if ((bio->bi_rw & REQ_DISCARD) &&
1737 (!blk_queue_discard(q) || 1523 (!blk_queue_discard(q) ||
1738 ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) { 1524 ((bio->bi_rw & REQ_SECURE) &&
1739 err = -EOPNOTSUPP; 1525 !blk_queue_secdiscard(q)))) {
1740 goto end_io; 1526 err = -EOPNOTSUPP;
1741 } 1527 goto end_io;
1528 }
1742 1529
1743 if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) { 1530 if (blk_throtl_bio(q, &bio))
1744 err = -EOPNOTSUPP; 1531 goto end_io;
1745 goto end_io;
1746 }
1747 1532
1748 /* 1533 /*
1749 * Various block parts want %current->io_context and lazy ioc 1534 * If bio = NULL, bio has been throttled and will be submitted
1750 * allocation ends up trading a lot of pain for a small amount of 1535 * later.
1751 * memory. Just allocate it upfront. This may fail and block 1536 */
1752 * layer knows how to live with it. 1537 if (!bio)
1753 */ 1538 break;
1754 create_io_context(GFP_ATOMIC, q->node);
1755 1539
1756 if (blk_throtl_bio(q, bio)) 1540 trace_block_bio_queue(q, bio);
1757 return false; /* throttled, will be resubmitted later */
1758 1541
1759 trace_block_bio_queue(q, bio); 1542 ret = q->make_request_fn(q, bio);
1760 return true; 1543 } while (ret);
1544
1545 return;
1761 1546
1762end_io: 1547end_io:
1763 bio_endio(bio, err); 1548 bio_endio(bio, err);
1764 return false;
1765} 1549}
1766 1550
1767/** 1551/*
1768 * generic_make_request - hand a buffer to its device driver for I/O 1552 * We only want one ->make_request_fn to be active at a time,
1769 * @bio: The bio describing the location in memory and on the device. 1553 * else stack usage with stacked devices could be a problem.
1770 * 1554 * So use current->bio_list to keep a list of requests
1771 * generic_make_request() is used to make I/O requests of block 1555 * submited by a make_request_fn function.
1772 * devices. It is passed a &struct bio, which describes the I/O that needs 1556 * current->bio_list is also used as a flag to say if
1773 * to be done. 1557 * generic_make_request is currently active in this task or not.
1774 * 1558 * If it is NULL, then no make_request is active. If it is non-NULL,
1775 * generic_make_request() does not return any status. The 1559 * then a make_request is active, and new requests should be added
1776 * success/failure status of the request, along with notification of 1560 * at the tail
1777 * completion, is delivered asynchronously through the bio->bi_end_io
1778 * function described (one day) elsewhere.
1779 *
1780 * The caller of generic_make_request must make sure that bi_io_vec
1781 * are set to describe the memory buffer, and that bi_dev and bi_sector are
1782 * set to describe the device address, and the
1783 * bi_end_io and optionally bi_private are set to describe how
1784 * completion notification should be signaled.
1785 *
1786 * generic_make_request and the drivers it calls may use bi_next if this
1787 * bio happens to be merged with someone else, and may resubmit the bio to
1788 * a lower device by calling into generic_make_request recursively, which
1789 * means the bio should NOT be touched after the call to ->make_request_fn.
1790 */ 1561 */
1791void generic_make_request(struct bio *bio) 1562void generic_make_request(struct bio *bio)
1792{ 1563{
1793 struct bio_list bio_list_on_stack; 1564 struct bio_list bio_list_on_stack;
1794 1565
1795 if (!generic_make_request_checks(bio))
1796 return;
1797
1798 /*
1799 * We only want one ->make_request_fn to be active at a time, else
1800 * stack usage with stacked devices could be a problem. So use
1801 * current->bio_list to keep a list of requests submitted by a
1802 * make_request_fn function. current->bio_list is also used as a
1803 * flag to say if generic_make_request is currently active in this
1804 * task or not. If it is NULL, then no make_request is active. If
1805 * it is non-NULL, then a make_request is active, and new requests
1806 * should be added at the tail
1807 */
1808 if (current->bio_list) { 1566 if (current->bio_list) {
1567 /* make_request is active */
1809 bio_list_add(current->bio_list, bio); 1568 bio_list_add(current->bio_list, bio);
1810 return; 1569 return;
1811 } 1570 }
1812
1813 /* following loop may be a bit non-obvious, and so deserves some 1571 /* following loop may be a bit non-obvious, and so deserves some
1814 * explanation. 1572 * explanation.
1815 * Before entering the loop, bio->bi_next is NULL (as all callers 1573 * Before entering the loop, bio->bi_next is NULL (as all callers
@@ -1817,21 +1575,22 @@ void generic_make_request(struct bio *bio)
1817 * We pretend that we have just taken it off a longer list, so 1575 * We pretend that we have just taken it off a longer list, so
1818 * we assign bio_list to a pointer to the bio_list_on_stack, 1576 * we assign bio_list to a pointer to the bio_list_on_stack,
1819 * thus initialising the bio_list of new bios to be 1577 * thus initialising the bio_list of new bios to be
1820 * added. ->make_request() may indeed add some more bios 1578 * added. __generic_make_request may indeed add some more bios
1821 * through a recursive call to generic_make_request. If it 1579 * through a recursive call to generic_make_request. If it
1822 * did, we find a non-NULL value in bio_list and re-enter the loop 1580 * did, we find a non-NULL value in bio_list and re-enter the loop
1823 * from the top. In this case we really did just take the bio 1581 * from the top. In this case we really did just take the bio
1824 * of the top of the list (no pretending) and so remove it from 1582 * of the top of the list (no pretending) and so remove it from
1825 * bio_list, and call into ->make_request() again. 1583 * bio_list, and call into __generic_make_request again.
1584 *
1585 * The loop was structured like this to make only one call to
1586 * __generic_make_request (which is important as it is large and
1587 * inlined) and to keep the structure simple.
1826 */ 1588 */
1827 BUG_ON(bio->bi_next); 1589 BUG_ON(bio->bi_next);
1828 bio_list_init(&bio_list_on_stack); 1590 bio_list_init(&bio_list_on_stack);
1829 current->bio_list = &bio_list_on_stack; 1591 current->bio_list = &bio_list_on_stack;
1830 do { 1592 do {
1831 struct request_queue *q = bdev_get_queue(bio->bi_bdev); 1593 __generic_make_request(bio);
1832
1833 q->make_request_fn(q, bio);
1834
1835 bio = bio_list_pop(current->bio_list); 1594 bio = bio_list_pop(current->bio_list);
1836 } while (bio); 1595 } while (bio);
1837 current->bio_list = NULL; /* deactivate */ 1596 current->bio_list = NULL; /* deactivate */
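A small caller sketch (an assumption, not part of this patch): building a bio the way the kerneldoc above requires, with bi_bdev, bi_sector and bi_end_io set before submission, then handing it off via submit_bio(), which fills in bi_rw and ends up in generic_make_request(). The my_* names are hypothetical.

static void my_end_io(struct bio *bio, int err)
{
        if (err)
                pr_err("my_read_page: I/O error %d\n", err);
        bio_put(bio);
}

static void my_read_page(struct block_device *bdev, struct page *page,
                         sector_t sector)
{
        struct bio *bio = bio_alloc(GFP_NOIO, 1);

        if (!bio)
                return;
        bio->bi_bdev = bdev;
        bio->bi_sector = sector;
        bio->bi_end_io = my_end_io;
        bio_add_page(bio, page, PAGE_SIZE, 0);
        submit_bio(READ, bio);
}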
@@ -1850,20 +1609,15 @@ EXPORT_SYMBOL(generic_make_request);
1850 */ 1609 */
1851void submit_bio(int rw, struct bio *bio) 1610void submit_bio(int rw, struct bio *bio)
1852{ 1611{
1612 int count = bio_sectors(bio);
1613
1853 bio->bi_rw |= rw; 1614 bio->bi_rw |= rw;
1854 1615
1855 /* 1616 /*
1856 * If it's a regular read/write or a barrier with data attached, 1617 * If it's a regular read/write or a barrier with data attached,
1857 * go through the normal accounting stuff before submission. 1618 * go through the normal accounting stuff before submission.
1858 */ 1619 */
1859 if (bio_has_data(bio)) { 1620 if (bio_has_data(bio) && !(rw & REQ_DISCARD)) {
1860 unsigned int count;
1861
1862 if (unlikely(rw & REQ_WRITE_SAME))
1863 count = bdev_logical_block_size(bio->bi_bdev) >> 9;
1864 else
1865 count = bio_sectors(bio);
1866
1867 if (rw & WRITE) { 1621 if (rw & WRITE) {
1868 count_vm_events(PGPGOUT, count); 1622 count_vm_events(PGPGOUT, count);
1869 } else { 1623 } else {
@@ -1909,10 +1663,11 @@ EXPORT_SYMBOL(submit_bio);
1909 */ 1663 */
1910int blk_rq_check_limits(struct request_queue *q, struct request *rq) 1664int blk_rq_check_limits(struct request_queue *q, struct request *rq)
1911{ 1665{
1912 if (!rq_mergeable(rq)) 1666 if (rq->cmd_flags & REQ_DISCARD)
1913 return 0; 1667 return 0;
1914 1668
1915 if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, rq->cmd_flags)) { 1669 if (blk_rq_sectors(rq) > queue_max_sectors(q) ||
1670 blk_rq_bytes(rq) > queue_max_hw_sectors(q) << 9) {
1916 printk(KERN_ERR "%s: over max size limit.\n", __func__); 1671 printk(KERN_ERR "%s: over max size limit.\n", __func__);
1917 return -EIO; 1672 return -EIO;
1918 } 1673 }
@@ -1951,10 +1706,6 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
1951 return -EIO; 1706 return -EIO;
1952 1707
1953 spin_lock_irqsave(q->queue_lock, flags); 1708 spin_lock_irqsave(q->queue_lock, flags);
1954 if (unlikely(blk_queue_dying(q))) {
1955 spin_unlock_irqrestore(q->queue_lock, flags);
1956 return -ENODEV;
1957 }
1958 1709
1959 /* 1710 /*
1960 * Submitting request must be dequeued before calling this function 1711 * Submitting request must be dequeued before calling this function
@@ -2296,11 +2047,9 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
2296 error_type = "I/O"; 2047 error_type = "I/O";
2297 break; 2048 break;
2298 } 2049 }
2299 printk_ratelimited(KERN_ERR "end_request: %s error, dev %s, sector %llu\n", 2050 printk(KERN_ERR "end_request: %s error, dev %s, sector %llu\n",
2300 error_type, req->rq_disk ? 2051 error_type, req->rq_disk ? req->rq_disk->disk_name : "?",
2301 req->rq_disk->disk_name : "?", 2052 (unsigned long long)blk_rq_pos(req));
2302 (unsigned long long)blk_rq_pos(req));
2303
2304 } 2053 }
2305 2054
2306 blk_account_io_completion(req, nr_bytes); 2055 blk_account_io_completion(req, nr_bytes);
@@ -2384,7 +2133,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
2384 req->buffer = bio_data(req->bio); 2133 req->buffer = bio_data(req->bio);
2385 2134
2386 /* update sector only for requests with clear definition of sector */ 2135 /* update sector only for requests with clear definition of sector */
2387 if (req->cmd_type == REQ_TYPE_FS) 2136 if (req->cmd_type == REQ_TYPE_FS || (req->cmd_flags & REQ_DISCARD))
2388 req->__sector += total_bytes >> 9; 2137 req->__sector += total_bytes >> 9;
2389 2138
2390 /* mixed attributes always follow the first bio */ 2139 /* mixed attributes always follow the first bio */
@@ -2825,10 +2574,16 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
2825 blk_rq_init(NULL, rq); 2574 blk_rq_init(NULL, rq);
2826 2575
2827 __rq_for_each_bio(bio_src, rq_src) { 2576 __rq_for_each_bio(bio_src, rq_src) {
2828 bio = bio_clone_bioset(bio_src, gfp_mask, bs); 2577 bio = bio_alloc_bioset(gfp_mask, bio_src->bi_max_vecs, bs);
2829 if (!bio) 2578 if (!bio)
2830 goto free_and_out; 2579 goto free_and_out;
2831 2580
2581 __bio_clone(bio, bio_src);
2582
2583 if (bio_integrity(bio_src) &&
2584 bio_integrity_clone(bio, bio_src, gfp_mask, bs))
2585 goto free_and_out;
2586
2832 if (bio_ctr && bio_ctr(bio, bio_src, data)) 2587 if (bio_ctr && bio_ctr(bio, bio_src, data))
2833 goto free_and_out; 2588 goto free_and_out;
2834 2589
@@ -2845,7 +2600,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
2845 2600
2846free_and_out: 2601free_and_out:
2847 if (bio) 2602 if (bio)
2848 bio_put(bio); 2603 bio_free(bio, bs);
2849 blk_rq_unprep_clone(rq); 2604 blk_rq_unprep_clone(rq);
2850 2605
2851 return -ENOMEM; 2606 return -ENOMEM;
@@ -2867,20 +2622,6 @@ EXPORT_SYMBOL(kblockd_schedule_delayed_work);
2867 2622
2868#define PLUG_MAGIC 0x91827364 2623#define PLUG_MAGIC 0x91827364
2869 2624
2870/**
2871 * blk_start_plug - initialize blk_plug and track it inside the task_struct
2872 * @plug: The &struct blk_plug that needs to be initialized
2873 *
2874 * Description:
2875 * Tracking blk_plug inside the task_struct will help with auto-flushing the
2876 * pending I/O should the task end up blocking between blk_start_plug() and
2877 * blk_finish_plug(). This is important from a performance perspective, but
2878 * also ensures that we don't deadlock. For instance, if the task is blocking
2879 * for a memory allocation, memory reclaim could end up wanting to free a
2880 * page belonging to that request that is currently residing in our private
2881 * plug. By flushing the pending I/O when the process goes to sleep, we avoid
2882 * this kind of deadlock.
2883 */
2884void blk_start_plug(struct blk_plug *plug) 2625void blk_start_plug(struct blk_plug *plug)
2885{ 2626{
2886 struct task_struct *tsk = current; 2627 struct task_struct *tsk = current;
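The kerneldoc being removed above describes the plugging contract; as a hedged sketch of typical use (my_submit_batch() is hypothetical), a submitter brackets a burst of bios so they are flushed to the queue in one go.

static void my_submit_batch(struct bio **bios, int nr)
{
        struct blk_plug plug;
        int i;

        blk_start_plug(&plug);
        for (i = 0; i < nr; i++)
                submit_bio(WRITE, bios[i]);     /* held in the per-task plug */
        blk_finish_plug(&plug);                 /* flush everything to the queue(s) */
}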
@@ -2909,8 +2650,7 @@ static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
2909 struct request *rqa = container_of(a, struct request, queuelist); 2650 struct request *rqa = container_of(a, struct request, queuelist);
2910 struct request *rqb = container_of(b, struct request, queuelist); 2651 struct request *rqb = container_of(b, struct request, queuelist);
2911 2652
2912 return !(rqa->q < rqb->q || 2653 return !(rqa->q <= rqb->q);
2913 (rqa->q == rqb->q && blk_rq_pos(rqa) < blk_rq_pos(rqb)));
2914} 2654}
2915 2655
2916/* 2656/*
@@ -2925,55 +2665,39 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
2925{ 2665{
2926 trace_block_unplug(q, depth, !from_schedule); 2666 trace_block_unplug(q, depth, !from_schedule);
2927 2667
2928 if (from_schedule) 2668 /*
2669 * If we are punting this to kblockd, then we can safely drop
2670 * the queue_lock before waking kblockd (which needs to take
2671 * this lock).
2672 */
2673 if (from_schedule) {
2674 spin_unlock(q->queue_lock);
2929 blk_run_queue_async(q); 2675 blk_run_queue_async(q);
2930 else 2676 } else {
2931 __blk_run_queue(q); 2677 __blk_run_queue(q);
2932 spin_unlock(q->queue_lock); 2678 spin_unlock(q->queue_lock);
2679 }
2680
2933} 2681}
2934 2682
2935static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule) 2683static void flush_plug_callbacks(struct blk_plug *plug)
2936{ 2684{
2937 LIST_HEAD(callbacks); 2685 LIST_HEAD(callbacks);
2938 2686
2939 while (!list_empty(&plug->cb_list)) { 2687 if (list_empty(&plug->cb_list))
2940 list_splice_init(&plug->cb_list, &callbacks); 2688 return;
2689
2690 list_splice_init(&plug->cb_list, &callbacks);
2941 2691
2942 while (!list_empty(&callbacks)) { 2692 while (!list_empty(&callbacks)) {
2943 struct blk_plug_cb *cb = list_first_entry(&callbacks, 2693 struct blk_plug_cb *cb = list_first_entry(&callbacks,
2944 struct blk_plug_cb, 2694 struct blk_plug_cb,
2945 list); 2695 list);
2946 list_del(&cb->list); 2696 list_del(&cb->list);
2947 cb->callback(cb, from_schedule); 2697 cb->callback(cb);
2948 }
2949 } 2698 }
2950} 2699}
2951 2700
2952struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data,
2953 int size)
2954{
2955 struct blk_plug *plug = current->plug;
2956 struct blk_plug_cb *cb;
2957
2958 if (!plug)
2959 return NULL;
2960
2961 list_for_each_entry(cb, &plug->cb_list, list)
2962 if (cb->callback == unplug && cb->data == data)
2963 return cb;
2964
2965 /* Not currently on the callback list */
2966 BUG_ON(size < sizeof(*cb));
2967 cb = kzalloc(size, GFP_ATOMIC);
2968 if (cb) {
2969 cb->data = data;
2970 cb->callback = unplug;
2971 list_add(&cb->list, &plug->cb_list);
2972 }
2973 return cb;
2974}
2975EXPORT_SYMBOL(blk_check_plugged);
2976
2977void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) 2701void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
2978{ 2702{
2979 struct request_queue *q; 2703 struct request_queue *q;
@@ -2984,7 +2708,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
2984 2708
2985 BUG_ON(plug->magic != PLUG_MAGIC); 2709 BUG_ON(plug->magic != PLUG_MAGIC);
2986 2710
2987 flush_plug_callbacks(plug, from_schedule); 2711 flush_plug_callbacks(plug);
2988 if (list_empty(&plug->list)) 2712 if (list_empty(&plug->list))
2989 return; 2713 return;
2990 2714
@@ -3017,15 +2741,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3017 depth = 0; 2741 depth = 0;
3018 spin_lock(q->queue_lock); 2742 spin_lock(q->queue_lock);
3019 } 2743 }
3020
3021 /*
3022 * Short-circuit if @q is dead
3023 */
3024 if (unlikely(blk_queue_dying(q))) {
3025 __blk_end_request_all(rq, -ENODEV);
3026 continue;
3027 }
3028
3029 /* 2744 /*
3030 * rq is already accounted, so use raw insert 2745 * rq is already accounted, so use raw insert
3031 */ 2746 */
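Among the blk-core.c changes above, the plug flush path loses its two-key sort: the removed upstream plug_rq_cmp() orders plugged requests by queue and then by starting sector, while the restored comparator only groups them by queue pointer. Below is a minimal, purely illustrative userspace sketch of that two-key ordering; the toy_* names are hypothetical, the kernel compares the struct request_queue pointers themselves, and it uses list_sort() rather than qsort().

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins for struct request_queue / struct request. */
struct toy_queue { int id; };
struct toy_request {
        struct toy_queue *q;            /* owning queue */
        unsigned long long pos;         /* starting sector, as blk_rq_pos() */
};

/*
 * Two-key ordering in the spirit of the removed upstream plug_rq_cmp():
 * group requests by queue first, then by starting sector within a queue.
 * (Comparing ids keeps the sketch well-defined; the kernel compares the
 * queue pointers directly.)
 */
static int toy_plug_rq_cmp(const void *a, const void *b)
{
        const struct toy_request *ra = a, *rb = b;

        if (ra->q->id != rb->q->id)
                return ra->q->id < rb->q->id ? -1 : 1;
        if (ra->pos != rb->pos)
                return ra->pos < rb->pos ? -1 : 1;
        return 0;
}

int main(void)
{
        struct toy_queue q0 = { 0 }, q1 = { 1 };
        struct toy_request reqs[] = {
                { &q1, 2048 }, { &q0, 4096 }, { &q1, 8 }, { &q0, 512 },
        };
        size_t i, n = sizeof(reqs) / sizeof(reqs[0]);

        qsort(reqs, n, sizeof(reqs[0]), toy_plug_rq_cmp);
        for (i = 0; i < n; i++)
                printf("queue %d, sector %llu\n", reqs[i].q->id, reqs[i].pos);
        return 0;
}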
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 74638ec234c..a1ebceb332f 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -43,42 +43,29 @@ static void blk_end_sync_rq(struct request *rq, int error)
43 * Description: 43 * Description:
44 * Insert a fully prepared request at the back of the I/O scheduler queue 44 * Insert a fully prepared request at the back of the I/O scheduler queue
45 * for execution. Don't wait for completion. 45 * for execution. Don't wait for completion.
46 *
47 * Note:
48 * This function will invoke @done directly if the queue is dead.
49 */ 46 */
50void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, 47void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
51 struct request *rq, int at_head, 48 struct request *rq, int at_head,
52 rq_end_io_fn *done) 49 rq_end_io_fn *done)
53{ 50{
54 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 51 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
55 bool is_pm_resume;
56
57 WARN_ON(irqs_disabled());
58
59 rq->rq_disk = bd_disk;
60 rq->end_io = done;
61 /*
62 * need to check this before __blk_run_queue(), because rq can
63 * be freed before that returns.
64 */
65 is_pm_resume = rq->cmd_type == REQ_TYPE_PM_RESUME;
66
67 spin_lock_irq(q->queue_lock);
68 52
69 if (unlikely(blk_queue_dying(q))) { 53 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
70 rq->errors = -ENXIO; 54 rq->errors = -ENXIO;
71 if (rq->end_io) 55 if (rq->end_io)
72 rq->end_io(rq, rq->errors); 56 rq->end_io(rq, rq->errors);
73 spin_unlock_irq(q->queue_lock);
74 return; 57 return;
75 } 58 }
76 59
60 rq->rq_disk = bd_disk;
61 rq->end_io = done;
62 WARN_ON(irqs_disabled());
63 spin_lock_irq(q->queue_lock);
77 __elv_add_request(q, rq, where); 64 __elv_add_request(q, rq, where);
78 __blk_run_queue(q); 65 __blk_run_queue(q);
79 /* the queue is stopped so it won't be run */ 66 /* the queue is stopped so it won't be run */
80 if (is_pm_resume) 67 if (rq->cmd_type == REQ_TYPE_PM_RESUME)
81 __blk_run_queue_uncond(q); 68 q->request_fn(q);
82 spin_unlock_irq(q->queue_lock); 69 spin_unlock_irq(q->queue_lock);
83} 70}
84EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); 71EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
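The blk_execute_rq_nowait() hunk swaps the upstream blk_queue_dying() check, which completes the request with -ENXIO while holding the queue lock, for the older QUEUE_FLAG_DEAD test done before taking the lock. A rough sketch of that fail-fast shape, with hypothetical toy_* types and none of the kernel locking or elevator code:

#include <stdio.h>
#include <stdbool.h>

struct toy_request {
        int errors;
        void (*end_io)(struct toy_request *rq, int error);
};

struct toy_queue {
        bool dead;                      /* queue is being torn down */
};

/*
 * Shape of blk_execute_rq_nowait() as restored above: if the queue is
 * already dead, complete the request immediately with an error instead
 * of inserting it into the elevator.
 */
static void toy_execute_nowait(struct toy_queue *q, struct toy_request *rq,
                               void (*done)(struct toy_request *, int))
{
        rq->end_io = done;
        if (q->dead) {
                rq->errors = -6;        /* stand-in for -ENXIO */
                if (rq->end_io)
                        rq->end_io(rq, rq->errors);
                return;
        }
        /* ...__elv_add_request() + __blk_run_queue() would go here... */
        printf("request inserted\n");
}

static void report(struct toy_request *rq, int error)
{
        printf("completed early, error %d\n", error);
}

int main(void)
{
        struct toy_queue dead_q = { .dead = true }, live_q = { .dead = false };
        struct toy_request rq = { 0 };

        toy_execute_nowait(&dead_q, &rq, report);       /* completes with error */
        toy_execute_nowait(&live_q, &rq, report);       /* "inserted" */
        return 0;
}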
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index da2a818c3a9..129b9e209a3 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -24,7 +24,6 @@
24#include <linux/mempool.h> 24#include <linux/mempool.h>
25#include <linux/bio.h> 25#include <linux/bio.h>
26#include <linux/scatterlist.h> 26#include <linux/scatterlist.h>
27#include <linux/export.h>
28#include <linux/slab.h> 27#include <linux/slab.h>
29 28
30#include "blk.h" 29#include "blk.h"
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index fab4cdd3f7b..6f9bbd97865 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -16,185 +16,52 @@
16 */ 16 */
17static struct kmem_cache *iocontext_cachep; 17static struct kmem_cache *iocontext_cachep;
18 18
19/** 19static void cfq_dtor(struct io_context *ioc)
20 * get_io_context - increment reference count to io_context
21 * @ioc: io_context to get
22 *
23 * Increment reference count to @ioc.
24 */
25void get_io_context(struct io_context *ioc)
26{
27 BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
28 atomic_long_inc(&ioc->refcount);
29}
30EXPORT_SYMBOL(get_io_context);
31
32static void icq_free_icq_rcu(struct rcu_head *head)
33{
34 struct io_cq *icq = container_of(head, struct io_cq, __rcu_head);
35
36 kmem_cache_free(icq->__rcu_icq_cache, icq);
37}
38
39/* Exit an icq. Called with both ioc and q locked. */
40static void ioc_exit_icq(struct io_cq *icq)
41{
42 struct elevator_type *et = icq->q->elevator->type;
43
44 if (icq->flags & ICQ_EXITED)
45 return;
46
47 if (et->ops.elevator_exit_icq_fn)
48 et->ops.elevator_exit_icq_fn(icq);
49
50 icq->flags |= ICQ_EXITED;
51}
52
53/* Release an icq. Called with both ioc and q locked. */
54static void ioc_destroy_icq(struct io_cq *icq)
55{
56 struct io_context *ioc = icq->ioc;
57 struct request_queue *q = icq->q;
58 struct elevator_type *et = q->elevator->type;
59
60 lockdep_assert_held(&ioc->lock);
61 lockdep_assert_held(q->queue_lock);
62
63 radix_tree_delete(&ioc->icq_tree, icq->q->id);
64 hlist_del_init(&icq->ioc_node);
65 list_del_init(&icq->q_node);
66
67 /*
68 * Both setting lookup hint to and clearing it from @icq are done
69 * under queue_lock. If it's not pointing to @icq now, it never
70 * will. Hint assignment itself can race safely.
71 */
72 if (rcu_dereference_raw(ioc->icq_hint) == icq)
73 rcu_assign_pointer(ioc->icq_hint, NULL);
74
75 ioc_exit_icq(icq);
76
77 /*
78 * @icq->q might have gone away by the time RCU callback runs
79 * making it impossible to determine icq_cache. Record it in @icq.
80 */
81 icq->__rcu_icq_cache = et->icq_cache;
82 call_rcu(&icq->__rcu_head, icq_free_icq_rcu);
83}
84
85/*
86 * Slow path for ioc release in put_io_context(). Performs double-lock
87 * dancing to unlink all icq's and then frees ioc.
88 */
89static void ioc_release_fn(struct work_struct *work)
90{ 20{
91 struct io_context *ioc = container_of(work, struct io_context, 21 if (!hlist_empty(&ioc->cic_list)) {
92 release_work); 22 struct cfq_io_context *cic;
93 unsigned long flags;
94 23
95 /* 24 cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context,
96 * Exiting icq may call into put_io_context() through elevator 25 cic_list);
97 * which will trigger lockdep warning. The ioc's are guaranteed to 26 cic->dtor(ioc);
98 * be different, use a different locking subclass here. Use
99 * irqsave variant as there's no spin_lock_irq_nested().
100 */
101 spin_lock_irqsave_nested(&ioc->lock, flags, 1);
102
103 while (!hlist_empty(&ioc->icq_list)) {
104 struct io_cq *icq = hlist_entry(ioc->icq_list.first,
105 struct io_cq, ioc_node);
106 struct request_queue *q = icq->q;
107
108 if (spin_trylock(q->queue_lock)) {
109 ioc_destroy_icq(icq);
110 spin_unlock(q->queue_lock);
111 } else {
112 spin_unlock_irqrestore(&ioc->lock, flags);
113 cpu_relax();
114 spin_lock_irqsave_nested(&ioc->lock, flags, 1);
115 }
116 } 27 }
117
118 spin_unlock_irqrestore(&ioc->lock, flags);
119
120 kmem_cache_free(iocontext_cachep, ioc);
121} 28}
122 29
123/** 30/*
124 * put_io_context - put a reference of io_context 31 * IO Context helper functions. put_io_context() returns 1 if there are no
125 * @ioc: io_context to put 32 * more users of this io context, 0 otherwise.
126 *
127 * Decrement reference count of @ioc and release it if the count reaches
128 * zero.
129 */ 33 */
130void put_io_context(struct io_context *ioc) 34int put_io_context(struct io_context *ioc)
131{ 35{
132 unsigned long flags;
133 bool free_ioc = false;
134
135 if (ioc == NULL) 36 if (ioc == NULL)
136 return; 37 return 1;
137 38
138 BUG_ON(atomic_long_read(&ioc->refcount) <= 0); 39 BUG_ON(atomic_long_read(&ioc->refcount) == 0);
139 40
140 /*
141 * Releasing ioc requires reverse order double locking and we may
142 * already be holding a queue_lock. Do it asynchronously from wq.
143 */
144 if (atomic_long_dec_and_test(&ioc->refcount)) { 41 if (atomic_long_dec_and_test(&ioc->refcount)) {
145 spin_lock_irqsave(&ioc->lock, flags); 42 rcu_read_lock();
146 if (!hlist_empty(&ioc->icq_list)) 43 cfq_dtor(ioc);
147 schedule_work(&ioc->release_work); 44 rcu_read_unlock();
148 else
149 free_ioc = true;
150 spin_unlock_irqrestore(&ioc->lock, flags);
151 }
152 45
153 if (free_ioc)
154 kmem_cache_free(iocontext_cachep, ioc); 46 kmem_cache_free(iocontext_cachep, ioc);
47 return 1;
48 }
49 return 0;
155} 50}
156EXPORT_SYMBOL(put_io_context); 51EXPORT_SYMBOL(put_io_context);
157 52
158/** 53static void cfq_exit(struct io_context *ioc)
159 * put_io_context_active - put active reference on ioc
160 * @ioc: ioc of interest
161 *
162 * Undo get_io_context_active(). If active reference reaches zero after
163 * put, @ioc can never issue further IOs and ioscheds are notified.
164 */
165void put_io_context_active(struct io_context *ioc)
166{ 54{
167 struct hlist_node *n; 55 rcu_read_lock();
168 unsigned long flags;
169 struct io_cq *icq;
170 56
171 if (!atomic_dec_and_test(&ioc->active_ref)) { 57 if (!hlist_empty(&ioc->cic_list)) {
172 put_io_context(ioc); 58 struct cfq_io_context *cic;
173 return;
174 }
175 59
176 /* 60 cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context,
177 * Need ioc lock to walk icq_list and q lock to exit icq. Perform 61 cic_list);
178 * reverse double locking. Read comment in ioc_release_fn() for 62 cic->exit(ioc);
179 * explanation on the nested locking annotation.
180 */
181retry:
182 spin_lock_irqsave_nested(&ioc->lock, flags, 1);
183 hlist_for_each_entry(icq, n, &ioc->icq_list, ioc_node) {
184 if (icq->flags & ICQ_EXITED)
185 continue;
186 if (spin_trylock(icq->q->queue_lock)) {
187 ioc_exit_icq(icq);
188 spin_unlock(icq->q->queue_lock);
189 } else {
190 spin_unlock_irqrestore(&ioc->lock, flags);
191 cpu_relax();
192 goto retry;
193 }
194 } 63 }
195 spin_unlock_irqrestore(&ioc->lock, flags); 64 rcu_read_unlock();
196
197 put_io_context(ioc);
198} 65}
199 66
200/* Called by the exiting task */ 67/* Called by the exiting task */
@@ -207,197 +74,86 @@ void exit_io_context(struct task_struct *task)
207 task->io_context = NULL; 74 task->io_context = NULL;
208 task_unlock(task); 75 task_unlock(task);
209 76
210 atomic_dec(&ioc->nr_tasks); 77 if (atomic_dec_and_test(&ioc->nr_tasks))
211 put_io_context_active(ioc); 78 cfq_exit(ioc);
212}
213 79
214/** 80 put_io_context(ioc);
215 * ioc_clear_queue - break any ioc association with the specified queue
216 * @q: request_queue being cleared
217 *
218 * Walk @q->icq_list and exit all io_cq's. Must be called with @q locked.
219 */
220void ioc_clear_queue(struct request_queue *q)
221{
222 lockdep_assert_held(q->queue_lock);
223
224 while (!list_empty(&q->icq_list)) {
225 struct io_cq *icq = list_entry(q->icq_list.next,
226 struct io_cq, q_node);
227 struct io_context *ioc = icq->ioc;
228
229 spin_lock(&ioc->lock);
230 ioc_destroy_icq(icq);
231 spin_unlock(&ioc->lock);
232 }
233} 81}
234 82
235int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) 83struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
236{ 84{
237 struct io_context *ioc; 85 struct io_context *ioc;
238 int ret;
239
240 ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
241 node);
242 if (unlikely(!ioc))
243 return -ENOMEM;
244
245 /* initialize */
246 atomic_long_set(&ioc->refcount, 1);
247 atomic_set(&ioc->nr_tasks, 1);
248 atomic_set(&ioc->active_ref, 1);
249 spin_lock_init(&ioc->lock);
250 INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
251 INIT_HLIST_HEAD(&ioc->icq_list);
252 INIT_WORK(&ioc->release_work, ioc_release_fn);
253 86
254 /* 87 ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
255 * Try to install. ioc shouldn't be installed if someone else 88 if (ioc) {
256 * already did or @task, which isn't %current, is exiting. Note 89 atomic_long_set(&ioc->refcount, 1);
257 * that we need to allow ioc creation on exiting %current as exit 90 atomic_set(&ioc->nr_tasks, 1);
258 * path may issue IOs from e.g. exit_files(). The exit path is 91 spin_lock_init(&ioc->lock);
259 * responsible for not issuing IO after exit_io_context(). 92 ioc->ioprio_changed = 0;
260 */ 93 ioc->ioprio = 0;
261 task_lock(task); 94 ioc->last_waited = 0; /* doesn't matter... */
262 if (!task->io_context && 95 ioc->nr_batch_requests = 0; /* because this is 0 */
263 (task == current || !(task->flags & PF_EXITING))) 96 INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH);
264 task->io_context = ioc; 97 INIT_HLIST_HEAD(&ioc->cic_list);
265 else 98 ioc->ioc_data = NULL;
266 kmem_cache_free(iocontext_cachep, ioc); 99#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
267 100 ioc->cgroup_changed = 0;
268 ret = task->io_context ? 0 : -EBUSY; 101#endif
269 102 }
270 task_unlock(task);
271 103
272 return ret; 104 return ioc;
273} 105}
274 106
275/** 107/*
276 * get_task_io_context - get io_context of a task 108 * If the current task has no IO context then create one and initialise it.
277 * @task: task of interest 109 * Otherwise, return its existing IO context.
278 * @gfp_flags: allocation flags, used if allocation is necessary
279 * @node: allocation node, used if allocation is necessary
280 *
281 * Return io_context of @task. If it doesn't exist, it is created with
282 * @gfp_flags and @node. The returned io_context has its reference count
283 * incremented.
284 * 110 *
285 * This function always goes through task_lock() and it's better to use 111 * This returned IO context doesn't have a specifically elevated refcount,
286 * %current->io_context + get_io_context() for %current. 112 * but since the current task itself holds a reference, the context can be
113 * used in general code, so long as it stays within `current` context.
287 */ 114 */
288struct io_context *get_task_io_context(struct task_struct *task, 115struct io_context *current_io_context(gfp_t gfp_flags, int node)
289 gfp_t gfp_flags, int node)
290{ 116{
291 struct io_context *ioc; 117 struct task_struct *tsk = current;
292 118 struct io_context *ret;
293 might_sleep_if(gfp_flags & __GFP_WAIT); 119
294 120 ret = tsk->io_context;
295 do { 121 if (likely(ret))
296 task_lock(task); 122 return ret;
297 ioc = task->io_context; 123
298 if (likely(ioc)) { 124 ret = alloc_io_context(gfp_flags, node);
299 get_io_context(ioc); 125 if (ret) {
300 task_unlock(task); 126 /* make sure set_task_ioprio() sees the settings above */
301 return ioc; 127 smp_wmb();
302 } 128 tsk->io_context = ret;
303 task_unlock(task); 129 }
304 } while (!create_task_io_context(task, gfp_flags, node));
305 130
306 return NULL; 131 return ret;
307} 132}
308EXPORT_SYMBOL(get_task_io_context);
309 133
310/** 134/*
311 * ioc_lookup_icq - lookup io_cq from ioc 135 * If the current task has no IO context then create one and initialise it.
312 * @ioc: the associated io_context 136 * If it does have a context, take a ref on it.
313 * @q: the associated request_queue
314 * 137 *
315 * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called 138 * This is always called in the context of the task which submitted the I/O.
316 * with @q->queue_lock held.
317 */ 139 */
318struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q) 140struct io_context *get_io_context(gfp_t gfp_flags, int node)
319{ 141{
320 struct io_cq *icq; 142 struct io_context *ioc = NULL;
321
322 lockdep_assert_held(q->queue_lock);
323 143
324 /* 144 /*
325 * icq's are indexed from @ioc using radix tree and hint pointer, 145 * Check for unlikely race with exiting task. ioc ref count is
326 * both of which are protected with RCU. All removals are done 146 * zero when ioc is being detached.
327 * holding both q and ioc locks, and we're holding q lock - if we
328 * find a icq which points to us, it's guaranteed to be valid.
329 */ 147 */
330 rcu_read_lock(); 148 do {
331 icq = rcu_dereference(ioc->icq_hint); 149 ioc = current_io_context(gfp_flags, node);
332 if (icq && icq->q == q) 150 if (unlikely(!ioc))
333 goto out; 151 break;
334 152 } while (!atomic_long_inc_not_zero(&ioc->refcount));
335 icq = radix_tree_lookup(&ioc->icq_tree, q->id);
336 if (icq && icq->q == q)
337 rcu_assign_pointer(ioc->icq_hint, icq); /* allowed to race */
338 else
339 icq = NULL;
340out:
341 rcu_read_unlock();
342 return icq;
343}
344EXPORT_SYMBOL(ioc_lookup_icq);
345
346/**
347 * ioc_create_icq - create and link io_cq
348 * @ioc: io_context of interest
349 * @q: request_queue of interest
350 * @gfp_mask: allocation mask
351 *
352 * Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they
353 * will be created using @gfp_mask.
354 *
355 * The caller is responsible for ensuring @ioc won't go away and @q is
356 * alive and will stay alive until this function returns.
357 */
358struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
359 gfp_t gfp_mask)
360{
361 struct elevator_type *et = q->elevator->type;
362 struct io_cq *icq;
363
364 /* allocate stuff */
365 icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
366 q->node);
367 if (!icq)
368 return NULL;
369
370 if (radix_tree_preload(gfp_mask) < 0) {
371 kmem_cache_free(et->icq_cache, icq);
372 return NULL;
373 }
374
375 icq->ioc = ioc;
376 icq->q = q;
377 INIT_LIST_HEAD(&icq->q_node);
378 INIT_HLIST_NODE(&icq->ioc_node);
379
380 /* lock both q and ioc and try to link @icq */
381 spin_lock_irq(q->queue_lock);
382 spin_lock(&ioc->lock);
383
384 if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
385 hlist_add_head(&icq->ioc_node, &ioc->icq_list);
386 list_add(&icq->q_node, &q->icq_list);
387 if (et->ops.elevator_init_icq_fn)
388 et->ops.elevator_init_icq_fn(icq);
389 } else {
390 kmem_cache_free(et->icq_cache, icq);
391 icq = ioc_lookup_icq(ioc, q);
392 if (!icq)
393 printk(KERN_ERR "cfq: icq link failed!\n");
394 }
395 153
396 spin_unlock(&ioc->lock); 154 return ioc;
397 spin_unlock_irq(q->queue_lock);
398 radix_tree_preload_end();
399 return icq;
400} 155}
156EXPORT_SYMBOL(get_io_context);
401 157
402static int __init blk_ioc_init(void) 158static int __init blk_ioc_init(void)
403{ 159{
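The blk-ioc.c hunks replace the upstream icq-based io_context teardown with the older cfq_io_context scheme, and put_io_context() goes back to returning whether the final reference was dropped. As a purely illustrative userspace sketch of that refcount contract (C11 atomics, hypothetical toy_* names, none of the kernel locking or per-queue state):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy io_context: only the reference count matters for this sketch. */
struct toy_ioc {
        atomic_long refcount;
};

static struct toy_ioc *toy_ioc_alloc(void)
{
        struct toy_ioc *ioc = malloc(sizeof(*ioc));

        if (ioc)
                atomic_init(&ioc->refcount, 1);
        return ioc;
}

static void toy_get_ioc(struct toy_ioc *ioc)
{
        atomic_fetch_add(&ioc->refcount, 1);
}

/*
 * Returns 1 when the final reference was dropped and the object freed,
 * 0 otherwise, which is the contract the restored put_io_context()
 * documents in its comment above.
 */
static int toy_put_ioc(struct toy_ioc *ioc)
{
        if (!ioc)
                return 1;
        if (atomic_fetch_sub(&ioc->refcount, 1) == 1) {
                /* last reference: tear down per-queue state, then free */
                free(ioc);
                return 1;
        }
        return 0;
}

int main(void)
{
        struct toy_ioc *ioc = toy_ioc_alloc();

        toy_get_ioc(ioc);                               /* second user */
        printf("first put frees: %d\n", toy_put_ioc(ioc));      /* 0 */
        printf("second put frees: %d\n", toy_put_ioc(ioc));     /* 1 */
        return 0;
}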
diff --git a/block/blk-lib.c b/block/blk-lib.c
index b3a1f2b70b3..2b461b496a7 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -43,12 +43,10 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
43 DECLARE_COMPLETION_ONSTACK(wait); 43 DECLARE_COMPLETION_ONSTACK(wait);
44 struct request_queue *q = bdev_get_queue(bdev); 44 struct request_queue *q = bdev_get_queue(bdev);
45 int type = REQ_WRITE | REQ_DISCARD; 45 int type = REQ_WRITE | REQ_DISCARD;
46 sector_t max_discard_sectors; 46 unsigned int max_discard_sectors;
47 sector_t granularity, alignment;
48 struct bio_batch bb; 47 struct bio_batch bb;
49 struct bio *bio; 48 struct bio *bio;
50 int ret = 0; 49 int ret = 0;
51 struct blk_plug plug;
52 50
53 if (!q) 51 if (!q)
54 return -ENXIO; 52 return -ENXIO;
@@ -56,21 +54,18 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
56 if (!blk_queue_discard(q)) 54 if (!blk_queue_discard(q))
57 return -EOPNOTSUPP; 55 return -EOPNOTSUPP;
58 56
59 /* Zero-sector (unknown) and one-sector granularities are the same. */
60 granularity = max(q->limits.discard_granularity >> 9, 1U);
61 alignment = bdev_discard_alignment(bdev) >> 9;
62 alignment = sector_div(alignment, granularity);
63
64 /* 57 /*
65 * Ensure that max_discard_sectors is of the proper 58 * Ensure that max_discard_sectors is of the proper
66 * granularity, so that requests stay aligned after a split. 59 * granularity
67 */ 60 */
68 max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); 61 max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
69 sector_div(max_discard_sectors, granularity);
70 max_discard_sectors *= granularity;
71 if (unlikely(!max_discard_sectors)) { 62 if (unlikely(!max_discard_sectors)) {
72 /* Avoid infinite loop below. Being cautious never hurts. */ 63 /* Avoid infinite loop below. Being cautious never hurts. */
73 return -EOPNOTSUPP; 64 return -EOPNOTSUPP;
65 } else if (q->limits.discard_granularity) {
66 unsigned int disc_sects = q->limits.discard_granularity >> 9;
67
68 max_discard_sectors &= ~(disc_sects - 1);
74 } 69 }
75 70
76 if (flags & BLKDEV_DISCARD_SECURE) { 71 if (flags & BLKDEV_DISCARD_SECURE) {
@@ -83,119 +78,29 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
83 bb.flags = 1 << BIO_UPTODATE; 78 bb.flags = 1 << BIO_UPTODATE;
84 bb.wait = &wait; 79 bb.wait = &wait;
85 80
86 blk_start_plug(&plug);
87 while (nr_sects) { 81 while (nr_sects) {
88 unsigned int req_sects;
89 sector_t end_sect, tmp;
90
91 bio = bio_alloc(gfp_mask, 1); 82 bio = bio_alloc(gfp_mask, 1);
92 if (!bio) { 83 if (!bio) {
93 ret = -ENOMEM; 84 ret = -ENOMEM;
94 break; 85 break;
95 } 86 }
96 87
97 req_sects = min_t(sector_t, nr_sects, max_discard_sectors);
98
99 /*
100 * If splitting a request, and the next starting sector would be
101 * misaligned, stop the discard at the previous aligned sector.
102 */
103 end_sect = sector + req_sects;
104 tmp = end_sect;
105 if (req_sects < nr_sects &&
106 sector_div(tmp, granularity) != alignment) {
107 end_sect = end_sect - alignment;
108 sector_div(end_sect, granularity);
109 end_sect = end_sect * granularity + alignment;
110 req_sects = end_sect - sector;
111 }
112
113 bio->bi_sector = sector; 88 bio->bi_sector = sector;
114 bio->bi_end_io = bio_batch_end_io; 89 bio->bi_end_io = bio_batch_end_io;
115 bio->bi_bdev = bdev; 90 bio->bi_bdev = bdev;
116 bio->bi_private = &bb; 91 bio->bi_private = &bb;
117 92
118 bio->bi_size = req_sects << 9; 93 if (nr_sects > max_discard_sectors) {
119 nr_sects -= req_sects; 94 bio->bi_size = max_discard_sectors << 9;
120 sector = end_sect; 95 nr_sects -= max_discard_sectors;
121 96 sector += max_discard_sectors;
122 atomic_inc(&bb.done);
123 submit_bio(type, bio);
124 }
125 blk_finish_plug(&plug);
126
127 /* Wait for bios in-flight */
128 if (!atomic_dec_and_test(&bb.done))
129 wait_for_completion(&wait);
130
131 if (!test_bit(BIO_UPTODATE, &bb.flags))
132 ret = -EIO;
133
134 return ret;
135}
136EXPORT_SYMBOL(blkdev_issue_discard);
137
138/**
139 * blkdev_issue_write_same - queue a write same operation
140 * @bdev: target blockdev
141 * @sector: start sector
142 * @nr_sects: number of sectors to write
143 * @gfp_mask: memory allocation flags (for bio_alloc)
144 * @page: page containing data to write
145 *
146 * Description:
147 * Issue a write same request for the sectors in question.
148 */
149int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
150 sector_t nr_sects, gfp_t gfp_mask,
151 struct page *page)
152{
153 DECLARE_COMPLETION_ONSTACK(wait);
154 struct request_queue *q = bdev_get_queue(bdev);
155 unsigned int max_write_same_sectors;
156 struct bio_batch bb;
157 struct bio *bio;
158 int ret = 0;
159
160 if (!q)
161 return -ENXIO;
162
163 max_write_same_sectors = q->limits.max_write_same_sectors;
164
165 if (max_write_same_sectors == 0)
166 return -EOPNOTSUPP;
167
168 atomic_set(&bb.done, 1);
169 bb.flags = 1 << BIO_UPTODATE;
170 bb.wait = &wait;
171
172 while (nr_sects) {
173 bio = bio_alloc(gfp_mask, 1);
174 if (!bio) {
175 ret = -ENOMEM;
176 break;
177 }
178
179 bio->bi_sector = sector;
180 bio->bi_end_io = bio_batch_end_io;
181 bio->bi_bdev = bdev;
182 bio->bi_private = &bb;
183 bio->bi_vcnt = 1;
184 bio->bi_io_vec->bv_page = page;
185 bio->bi_io_vec->bv_offset = 0;
186 bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev);
187
188 if (nr_sects > max_write_same_sectors) {
189 bio->bi_size = max_write_same_sectors << 9;
190 nr_sects -= max_write_same_sectors;
191 sector += max_write_same_sectors;
192 } else { 97 } else {
193 bio->bi_size = nr_sects << 9; 98 bio->bi_size = nr_sects << 9;
194 nr_sects = 0; 99 nr_sects = 0;
195 } 100 }
196 101
197 atomic_inc(&bb.done); 102 atomic_inc(&bb.done);
198 submit_bio(REQ_WRITE | REQ_WRITE_SAME, bio); 103 submit_bio(type, bio);
199 } 104 }
200 105
201 /* Wait for bios in-flight */ 106 /* Wait for bios in-flight */
@@ -203,11 +108,11 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
203 wait_for_completion(&wait); 108 wait_for_completion(&wait);
204 109
205 if (!test_bit(BIO_UPTODATE, &bb.flags)) 110 if (!test_bit(BIO_UPTODATE, &bb.flags))
206 ret = -ENOTSUPP; 111 ret = -EIO;
207 112
208 return ret; 113 return ret;
209} 114}
210EXPORT_SYMBOL(blkdev_issue_write_same); 115EXPORT_SYMBOL(blkdev_issue_discard);
211 116
212/** 117/**
213 * blkdev_issue_zeroout - generate number of zero filed write bios 118 * blkdev_issue_zeroout - generate number of zero filed write bios
@@ -220,7 +125,7 @@ EXPORT_SYMBOL(blkdev_issue_write_same);
220 * Generate and issue number of bios with zerofiled pages. 125 * Generate and issue number of bios with zerofiled pages.
221 */ 126 */
222 127
223int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, 128int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
224 sector_t nr_sects, gfp_t gfp_mask) 129 sector_t nr_sects, gfp_t gfp_mask)
225{ 130{
226 int ret; 131 int ret;
@@ -270,32 +175,4 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
270 175
271 return ret; 176 return ret;
272} 177}
273
274/**
275 * blkdev_issue_zeroout - zero-fill a block range
276 * @bdev: blockdev to write
277 * @sector: start sector
278 * @nr_sects: number of sectors to write
279 * @gfp_mask: memory allocation flags (for bio_alloc)
280 *
281 * Description:
282 * Generate and issue number of bios with zerofiled pages.
283 */
284
285int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
286 sector_t nr_sects, gfp_t gfp_mask)
287{
288 if (bdev_write_same(bdev)) {
289 unsigned char bdn[BDEVNAME_SIZE];
290
291 if (!blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
292 ZERO_PAGE(0)))
293 return 0;
294
295 bdevname(bdev, bdn);
296 pr_err("%s: WRITE SAME failed. Manually zeroing.\n", bdn);
297 }
298
299 return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask);
300}
301EXPORT_SYMBOL(blkdev_issue_zeroout); 178EXPORT_SYMBOL(blkdev_issue_zeroout);
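The blkdev_issue_discard() hunk drops the granularity/alignment-aware splitting (and the plug around the submit loop) in favour of masking max_discard_sectors down to a multiple of the discard granularity, which assumes the granularity is a power of two; the removed code used sector_div() and so coped with arbitrary granularities. It also removes blkdev_issue_write_same() entirely. A trivial standalone comparison of the two roundings, for illustration only:

#include <stdio.h>

/*
 * Round 'sectors' down to a multiple of a power-of-two granularity,
 * as the restored blkdev_issue_discard() does with the &= ~(g - 1) mask.
 */
static unsigned int round_down_pow2(unsigned int sectors, unsigned int gran)
{
        return sectors & ~(gran - 1);
}

/*
 * Round down to an arbitrary (not necessarily power-of-two) granularity,
 * roughly what the removed sector_div()-based code achieved.
 */
static unsigned long long round_down_any(unsigned long long sectors,
                                         unsigned long long gran)
{
        return (sectors / gran) * gran;
}

int main(void)
{
        /* 65535 sectors capped against 8-sector and 24-sector granules. */
        printf("%u\n", round_down_pow2(65535, 8));              /* 65528 */
        printf("%llu\n", round_down_any(65535ULL, 24ULL));      /* 65520 */
        return 0;
}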
diff --git a/block/blk-map.c b/block/blk-map.c
index 623e1cd4cff..164cd005970 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -311,7 +311,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
311 if (IS_ERR(bio)) 311 if (IS_ERR(bio))
312 return PTR_ERR(bio); 312 return PTR_ERR(bio);
313 313
314 if (!reading) 314 if (rq_data_dir(rq) == WRITE)
315 bio->bi_rw |= REQ_WRITE; 315 bio->bi_rw |= REQ_WRITE;
316 316
317 if (do_copy) 317 if (do_copy)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 936a110de0b..cfcc37cb222 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -110,49 +110,6 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
110 return 0; 110 return 0;
111} 111}
112 112
113static void
114__blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
115 struct scatterlist *sglist, struct bio_vec **bvprv,
116 struct scatterlist **sg, int *nsegs, int *cluster)
117{
118
119 int nbytes = bvec->bv_len;
120
121 if (*bvprv && *cluster) {
122 if ((*sg)->length + nbytes > queue_max_segment_size(q))
123 goto new_segment;
124
125 if (!BIOVEC_PHYS_MERGEABLE(*bvprv, bvec))
126 goto new_segment;
127 if (!BIOVEC_SEG_BOUNDARY(q, *bvprv, bvec))
128 goto new_segment;
129
130 (*sg)->length += nbytes;
131 } else {
132new_segment:
133 if (!*sg)
134 *sg = sglist;
135 else {
136 /*
137 * If the driver previously mapped a shorter
138 * list, we could see a termination bit
139 * prematurely unless it fully inits the sg
140 * table on each mapping. We KNOW that there
141 * must be more entries here or the driver
142 * would be buggy, so force clear the
143 * termination bit to avoid doing a full
144 * sg_init_table() in drivers for each command.
145 */
146 (*sg)->page_link &= ~0x02;
147 *sg = sg_next(*sg);
148 }
149
150 sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
151 (*nsegs)++;
152 }
153 *bvprv = bvec;
154}
155
156/* 113/*
157 * map a request to scatterlist, return number of sg entries setup. Caller 114 * map a request to scatterlist, return number of sg entries setup. Caller
158 * must make sure sg can hold rq->nr_phys_segments entries 115 * must make sure sg can hold rq->nr_phys_segments entries
@@ -174,8 +131,41 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
174 bvprv = NULL; 131 bvprv = NULL;
175 sg = NULL; 132 sg = NULL;
176 rq_for_each_segment(bvec, rq, iter) { 133 rq_for_each_segment(bvec, rq, iter) {
177 __blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg, 134 int nbytes = bvec->bv_len;
178 &nsegs, &cluster); 135
136 if (bvprv && cluster) {
137 if (sg->length + nbytes > queue_max_segment_size(q))
138 goto new_segment;
139
140 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
141 goto new_segment;
142 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
143 goto new_segment;
144
145 sg->length += nbytes;
146 } else {
147new_segment:
148 if (!sg)
149 sg = sglist;
150 else {
151 /*
152 * If the driver previously mapped a shorter
153 * list, we could see a termination bit
154 * prematurely unless it fully inits the sg
155 * table on each mapping. We KNOW that there
156 * must be more entries here or the driver
157 * would be buggy, so force clear the
158 * termination bit to avoid doing a full
159 * sg_init_table() in drivers for each command.
160 */
161 sg->page_link &= ~0x02;
162 sg = sg_next(sg);
163 }
164
165 sg_set_page(sg, bvec->bv_page, nbytes, bvec->bv_offset);
166 nsegs++;
167 }
168 bvprv = bvec;
179 } /* segments in rq */ 169 } /* segments in rq */
180 170
181 171
@@ -209,43 +199,6 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
209} 199}
210EXPORT_SYMBOL(blk_rq_map_sg); 200EXPORT_SYMBOL(blk_rq_map_sg);
211 201
212/**
213 * blk_bio_map_sg - map a bio to a scatterlist
214 * @q: request_queue in question
215 * @bio: bio being mapped
216 * @sglist: scatterlist being mapped
217 *
218 * Note:
219 * Caller must make sure sg can hold bio->bi_phys_segments entries
220 *
221 * Will return the number of sg entries setup
222 */
223int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
224 struct scatterlist *sglist)
225{
226 struct bio_vec *bvec, *bvprv;
227 struct scatterlist *sg;
228 int nsegs, cluster;
229 unsigned long i;
230
231 nsegs = 0;
232 cluster = blk_queue_cluster(q);
233
234 bvprv = NULL;
235 sg = NULL;
236 bio_for_each_segment(bvec, bio, i) {
237 __blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg,
238 &nsegs, &cluster);
239 } /* segments in bio */
240
241 if (sg)
242 sg_mark_end(sg);
243
244 BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments);
245 return nsegs;
246}
247EXPORT_SYMBOL(blk_bio_map_sg);
248
249static inline int ll_new_hw_segment(struct request_queue *q, 202static inline int ll_new_hw_segment(struct request_queue *q,
250 struct request *req, 203 struct request *req,
251 struct bio *bio) 204 struct bio *bio)
@@ -275,8 +228,14 @@ no_merge:
275int ll_back_merge_fn(struct request_queue *q, struct request *req, 228int ll_back_merge_fn(struct request_queue *q, struct request *req,
276 struct bio *bio) 229 struct bio *bio)
277{ 230{
278 if (blk_rq_sectors(req) + bio_sectors(bio) > 231 unsigned short max_sectors;
279 blk_rq_get_max_sectors(req)) { 232
233 if (unlikely(req->cmd_type == REQ_TYPE_BLOCK_PC))
234 max_sectors = queue_max_hw_sectors(q);
235 else
236 max_sectors = queue_max_sectors(q);
237
238 if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) {
280 req->cmd_flags |= REQ_NOMERGE; 239 req->cmd_flags |= REQ_NOMERGE;
281 if (req == q->last_merge) 240 if (req == q->last_merge)
282 q->last_merge = NULL; 241 q->last_merge = NULL;
@@ -293,8 +252,15 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
293int ll_front_merge_fn(struct request_queue *q, struct request *req, 252int ll_front_merge_fn(struct request_queue *q, struct request *req,
294 struct bio *bio) 253 struct bio *bio)
295{ 254{
296 if (blk_rq_sectors(req) + bio_sectors(bio) > 255 unsigned short max_sectors;
297 blk_rq_get_max_sectors(req)) { 256
257 if (unlikely(req->cmd_type == REQ_TYPE_BLOCK_PC))
258 max_sectors = queue_max_hw_sectors(q);
259 else
260 max_sectors = queue_max_sectors(q);
261
262
263 if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) {
298 req->cmd_flags |= REQ_NOMERGE; 264 req->cmd_flags |= REQ_NOMERGE;
299 if (req == q->last_merge) 265 if (req == q->last_merge)
300 q->last_merge = NULL; 266 q->last_merge = NULL;
@@ -325,8 +291,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
325 /* 291 /*
326 * Will it become too large? 292 * Will it become too large?
327 */ 293 */
328 if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > 294 if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > queue_max_sectors(q))
329 blk_rq_get_max_sectors(req))
330 return 0; 295 return 0;
331 296
332 total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; 297 total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
@@ -405,7 +370,16 @@ static int attempt_merge(struct request_queue *q, struct request *req,
405 if (!rq_mergeable(req) || !rq_mergeable(next)) 370 if (!rq_mergeable(req) || !rq_mergeable(next))
406 return 0; 371 return 0;
407 372
408 if (!blk_check_merge_flags(req->cmd_flags, next->cmd_flags)) 373 /*
374 * Don't merge file system requests and discard requests
375 */
376 if ((req->cmd_flags & REQ_DISCARD) != (next->cmd_flags & REQ_DISCARD))
377 return 0;
378
379 /*
380 * Don't merge discard requests and secure discard requests
381 */
382 if ((req->cmd_flags & REQ_SECURE) != (next->cmd_flags & REQ_SECURE))
409 return 0; 383 return 0;
410 384
411 /* 385 /*
@@ -419,10 +393,6 @@ static int attempt_merge(struct request_queue *q, struct request *req,
419 || next->special) 393 || next->special)
420 return 0; 394 return 0;
421 395
422 if (req->cmd_flags & REQ_WRITE_SAME &&
423 !blk_write_same_mergeable(req->bio, next->bio))
424 return 0;
425
426 /* 396 /*
427 * If we are allowed to merge, then append bio list 397 * If we are allowed to merge, then append bio list
428 * from next to rq and release next. merge_requests_fn 398 * from next to rq and release next. merge_requests_fn
@@ -501,40 +471,3 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
501{ 471{
502 return attempt_merge(q, rq, next); 472 return attempt_merge(q, rq, next);
503} 473}
504
505bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
506{
507 if (!rq_mergeable(rq) || !bio_mergeable(bio))
508 return false;
509
510 if (!blk_check_merge_flags(rq->cmd_flags, bio->bi_rw))
511 return false;
512
513 /* different data direction or already started, don't merge */
514 if (bio_data_dir(bio) != rq_data_dir(rq))
515 return false;
516
517 /* must be same device and not a special request */
518 if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special)
519 return false;
520
521 /* only merge integrity protected bio into ditto rq */
522 if (bio_integrity(bio) != blk_integrity_rq(rq))
523 return false;
524
525 /* must be using the same buffer */
526 if (rq->cmd_flags & REQ_WRITE_SAME &&
527 !blk_write_same_mergeable(rq->bio, bio))
528 return false;
529
530 return true;
531}
532
533int blk_try_merge(struct request *rq, struct bio *bio)
534{
535 if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_sector)
536 return ELEVATOR_BACK_MERGE;
537 else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_sector)
538 return ELEVATOR_FRONT_MERGE;
539 return ELEVATOR_NO_MERGE;
540}
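The blk-merge.c hunks fold the factored-out __blk_segment_map_sg() (and the bio-level blk_bio_map_sg()) back into blk_rq_map_sg(), but the coalescing rule itself is unchanged: extend the current scatterlist entry while the next bio_vec is contiguous with it and the merged length stays under the queue's segment-size limit. A stripped-down sketch of just that rule, ignoring the BIOVEC_PHYS_MERGEABLE / BIOVEC_SEG_BOUNDARY checks and the sg table handling; extent and MAX_SEGMENT are hypothetical stand-ins:

#include <stdio.h>

/* A byte extent standing in for a bio_vec's (page, offset, len). */
struct extent {
        unsigned long start;
        unsigned long len;
};

#define MAX_SEGMENT 4096UL      /* stand-in for queue_max_segment_size() */

/*
 * Coalesce adjacent extents into "segments" the way the rq_for_each_segment
 * loop above grows sg->length: merge only if the previous extent ends exactly
 * where the next begins and the merged length stays under the cap.
 */
static int map_segments(const struct extent *ev, int n, struct extent *seg)
{
        int i, nsegs = 0;

        for (i = 0; i < n; i++) {
                if (nsegs &&
                    seg[nsegs - 1].start + seg[nsegs - 1].len == ev[i].start &&
                    seg[nsegs - 1].len + ev[i].len <= MAX_SEGMENT) {
                        seg[nsegs - 1].len += ev[i].len;        /* extend */
                } else {
                        seg[nsegs] = ev[i];                     /* new segment */
                        nsegs++;
                }
        }
        return nsegs;
}

int main(void)
{
        struct extent ev[] = {
                { 0x1000, 512 }, { 0x1200, 512 },       /* contiguous: merge */
                { 0x8000, 512 },                        /* gap: new segment */
        };
        struct extent seg[3];
        int i, n = map_segments(ev, 3, seg);

        for (i = 0; i < n; i++)
                printf("seg %d: start=0x%lx len=%lu\n", i, seg[i].start, seg[i].len);
        return 0;
}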
diff --git a/block/blk-settings.c b/block/blk-settings.c
index c50ecf0ea3b..fa1eb0449a0 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -104,7 +104,9 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
104 * @lim: the queue_limits structure to reset 104 * @lim: the queue_limits structure to reset
105 * 105 *
106 * Description: 106 * Description:
107 * Returns a queue_limit struct to its default state. 107 * Returns a queue_limit struct to its default state. Can be used by
108 * stacking drivers like DM that stage table swaps and reuse an
109 * existing device queue.
108 */ 110 */
109void blk_set_default_limits(struct queue_limits *lim) 111void blk_set_default_limits(struct queue_limits *lim)
110{ 112{
@@ -112,13 +114,13 @@ void blk_set_default_limits(struct queue_limits *lim)
112 lim->max_integrity_segments = 0; 114 lim->max_integrity_segments = 0;
113 lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; 115 lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
114 lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; 116 lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
115 lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; 117 lim->max_sectors = BLK_DEF_MAX_SECTORS;
116 lim->max_write_same_sectors = 0; 118 lim->max_hw_sectors = INT_MAX;
117 lim->max_discard_sectors = 0; 119 lim->max_discard_sectors = 0;
118 lim->discard_granularity = 0; 120 lim->discard_granularity = 0;
119 lim->discard_alignment = 0; 121 lim->discard_alignment = 0;
120 lim->discard_misaligned = 0; 122 lim->discard_misaligned = 0;
121 lim->discard_zeroes_data = 0; 123 lim->discard_zeroes_data = 1;
122 lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; 124 lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
123 lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); 125 lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
124 lim->alignment_offset = 0; 126 lim->alignment_offset = 0;
@@ -129,27 +131,6 @@ void blk_set_default_limits(struct queue_limits *lim)
129EXPORT_SYMBOL(blk_set_default_limits); 131EXPORT_SYMBOL(blk_set_default_limits);
130 132
131/** 133/**
132 * blk_set_stacking_limits - set default limits for stacking devices
133 * @lim: the queue_limits structure to reset
134 *
135 * Description:
136 * Returns a queue_limit struct to its default state. Should be used
137 * by stacking drivers like DM that have no internal limits.
138 */
139void blk_set_stacking_limits(struct queue_limits *lim)
140{
141 blk_set_default_limits(lim);
142
143 /* Inherit limits from component devices */
144 lim->discard_zeroes_data = 1;
145 lim->max_segments = USHRT_MAX;
146 lim->max_hw_sectors = UINT_MAX;
147 lim->max_sectors = UINT_MAX;
148 lim->max_write_same_sectors = UINT_MAX;
149}
150EXPORT_SYMBOL(blk_set_stacking_limits);
151
152/**
153 * blk_queue_make_request - define an alternate make_request function for a device 134 * blk_queue_make_request - define an alternate make_request function for a device
154 * @q: the request queue for the device to be affected 135 * @q: the request queue for the device to be affected
155 * @mfn: the alternate make_request function 136 * @mfn: the alternate make_request function
@@ -184,6 +165,8 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
184 q->nr_batching = BLK_BATCH_REQ; 165 q->nr_batching = BLK_BATCH_REQ;
185 166
186 blk_set_default_limits(&q->limits); 167 blk_set_default_limits(&q->limits);
168 blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
169 q->limits.discard_zeroes_data = 0;
187 170
188 /* 171 /*
189 * by default assume old behaviour and bounce for any highmem page 172 * by default assume old behaviour and bounce for any highmem page
@@ -288,18 +271,6 @@ void blk_queue_max_discard_sectors(struct request_queue *q,
288EXPORT_SYMBOL(blk_queue_max_discard_sectors); 271EXPORT_SYMBOL(blk_queue_max_discard_sectors);
289 272
290/** 273/**
291 * blk_queue_max_write_same_sectors - set max sectors for a single write same
292 * @q: the request queue for the device
293 * @max_write_same_sectors: maximum number of sectors to write per command
294 **/
295void blk_queue_max_write_same_sectors(struct request_queue *q,
296 unsigned int max_write_same_sectors)
297{
298 q->limits.max_write_same_sectors = max_write_same_sectors;
299}
300EXPORT_SYMBOL(blk_queue_max_write_same_sectors);
301
302/**
303 * blk_queue_max_segments - set max hw segments for a request for this queue 274 * blk_queue_max_segments - set max hw segments for a request for this queue
304 * @q: the request queue for the device 275 * @q: the request queue for the device
305 * @max_segments: max number of segments 276 * @max_segments: max number of segments
@@ -524,8 +495,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
524 495
525 t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); 496 t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
526 t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); 497 t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
527 t->max_write_same_sectors = min(t->max_write_same_sectors,
528 b->max_write_same_sectors);
529 t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); 498 t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
530 499
531 t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, 500 t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
@@ -611,7 +580,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
611 bottom = b->discard_granularity + alignment; 580 bottom = b->discard_granularity + alignment;
612 581
613 /* Verify that top and bottom intervals line up */ 582 /* Verify that top and bottom intervals line up */
614 if ((max(top, bottom) % min(top, bottom)) != 0) 583 if (max(top, bottom) & (min(top, bottom) - 1))
615 t->discard_misaligned = 1; 584 t->discard_misaligned = 1;
616 } 585 }
617 586
@@ -619,8 +588,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
619 b->max_discard_sectors); 588 b->max_discard_sectors);
620 t->discard_granularity = max(t->discard_granularity, 589 t->discard_granularity = max(t->discard_granularity,
621 b->discard_granularity); 590 b->discard_granularity);
622 t->discard_alignment = lcm(t->discard_alignment, alignment) % 591 t->discard_alignment = lcm(t->discard_alignment, alignment) &
623 t->discard_granularity; 592 (t->discard_granularity - 1);
624 } 593 }
625 594
626 return ret; 595 return ret;
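In blk_stack_limits(), the diff replaces the modulo-based discard alignment arithmetic with mask arithmetic (& (granularity - 1)), which is only equivalent when the smaller granularity is a power of two. A small standalone illustration of where the two misalignment checks diverge; the sector counts are chosen arbitrarily and this is not the kernel helper:

#include <stdio.h>

/* "Do the top and bottom discard intervals line up?" using a true modulo,
 * as in the removed upstream check. */
static int misaligned_mod(unsigned int top, unsigned int bottom)
{
        unsigned int hi = top > bottom ? top : bottom;
        unsigned int lo = top > bottom ? bottom : top;

        return (hi % lo) != 0;
}

/* Same question using the restored mask form, valid only for power-of-two
 * granularities. */
static int misaligned_mask(unsigned int top, unsigned int bottom)
{
        unsigned int hi = top > bottom ? top : bottom;
        unsigned int lo = top > bottom ? bottom : top;

        return (hi & (lo - 1)) != 0;
}

int main(void)
{
        /* Power-of-two granularity: both agree (prints "0 0"). */
        printf("%d %d\n", misaligned_mod(4096, 512), misaligned_mask(4096, 512));
        /* Non-power-of-two granularity: the mask check misses the
         * misalignment (prints "1 0"). */
        printf("%d %d\n", misaligned_mod(1024, 768), misaligned_mask(1024, 768));
        return 0;
}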
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 467c8de8864..1366a89d8e6 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -8,7 +8,6 @@
8#include <linux/blkdev.h> 8#include <linux/blkdev.h>
9#include <linux/interrupt.h> 9#include <linux/interrupt.h>
10#include <linux/cpu.h> 10#include <linux/cpu.h>
11#include <linux/sched.h>
12 11
13#include "blk.h" 12#include "blk.h"
14 13
@@ -104,10 +103,9 @@ static struct notifier_block __cpuinitdata blk_cpu_notifier = {
104 103
105void __blk_complete_request(struct request *req) 104void __blk_complete_request(struct request *req)
106{ 105{
107 int ccpu, cpu; 106 int ccpu, cpu, group_cpu = NR_CPUS;
108 struct request_queue *q = req->q; 107 struct request_queue *q = req->q;
109 unsigned long flags; 108 unsigned long flags;
110 bool shared = false;
111 109
112 BUG_ON(!q->softirq_done_fn); 110 BUG_ON(!q->softirq_done_fn);
113 111
@@ -119,20 +117,22 @@ void __blk_complete_request(struct request *req)
119 */ 117 */
120 if (req->cpu != -1) { 118 if (req->cpu != -1) {
121 ccpu = req->cpu; 119 ccpu = req->cpu;
122 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) 120 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) {
123 shared = cpus_share_cache(cpu, ccpu); 121 ccpu = blk_cpu_to_group(ccpu);
122 group_cpu = blk_cpu_to_group(cpu);
123 }
124 } else 124 } else
125 ccpu = cpu; 125 ccpu = cpu;
126 126
127 /* 127 /*
128 * If current CPU and requested CPU share a cache, run the softirq on 128 * If current CPU and requested CPU are in the same group, running
129 * the current CPU. One might concern this is just like 129 * softirq in current CPU. One might concern this is just like
130 * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is 130 * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is
131 * running in interrupt handler, and currently I/O controller doesn't 131 * running in interrupt handler, and currently I/O controller doesn't
132 * support multiple interrupts, so current CPU is unique actually. This 132 * support multiple interrupts, so current CPU is unique actually. This
133 * avoids IPI sending from current CPU to the first CPU of a group. 133 * avoids IPI sending from current CPU to the first CPU of a group.
134 */ 134 */
135 if (ccpu == cpu || shared) { 135 if (ccpu == cpu || ccpu == group_cpu) {
136 struct list_head *list; 136 struct list_head *list;
137do_local: 137do_local:
138 list = &__get_cpu_var(blk_cpu_done); 138 list = &__get_cpu_var(blk_cpu_done);
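__blk_complete_request() goes back to comparing blk_cpu_to_group() results rather than using cpus_share_cache() when deciding whether to run the completion softirq locally or raise it on the submitting CPU. A condensed, purely illustrative decision helper; the signature is hypothetical, same_group stands in for either the cache-sharing or CPU-group test, and force_requested_cpu for QUEUE_FLAG_SAME_FORCE:

#include <stdio.h>
#include <stdbool.h>

/*
 * Run the completion locally when no CPU was requested, when the requested
 * CPU is the current one, or when the two CPUs are considered "close" and
 * strict placement was not forced; otherwise raise the softirq on the
 * requested CPU via IPI.
 */
static bool complete_locally(int cur_cpu, int req_cpu, bool same_group,
                             bool force_requested_cpu)
{
        if (req_cpu < 0 || req_cpu == cur_cpu)
                return true;
        if (!force_requested_cpu && same_group)
                return true;
        return false;
}

int main(void)
{
        printf("%d\n", complete_locally(0, -1, false, false));  /* 1: no preference */
        printf("%d\n", complete_locally(0, 2, true, false));    /* 1: same group */
        printf("%d\n", complete_locally(0, 2, true, true));     /* 0: SAME_FORCE set */
        return 0;
}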
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 788147797a7..60fda88c57f 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -9,7 +9,6 @@
9#include <linux/blktrace_api.h> 9#include <linux/blktrace_api.h>
10 10
11#include "blk.h" 11#include "blk.h"
12#include "blk-cgroup.h"
13 12
14struct queue_sysfs_entry { 13struct queue_sysfs_entry {
15 struct attribute attr; 14 struct attribute attr;
@@ -26,15 +25,9 @@ queue_var_show(unsigned long var, char *page)
26static ssize_t 25static ssize_t
27queue_var_store(unsigned long *var, const char *page, size_t count) 26queue_var_store(unsigned long *var, const char *page, size_t count)
28{ 27{
29 int err; 28 char *p = (char *) page;
30 unsigned long v;
31
32 err = strict_strtoul(page, 10, &v);
33 if (err || v > UINT_MAX)
34 return -EINVAL;
35
36 *var = v;
37 29
30 *var = simple_strtoul(p, &p, 10);
38 return count; 31 return count;
39} 32}
40 33
@@ -46,7 +39,7 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page)
46static ssize_t 39static ssize_t
47queue_requests_store(struct request_queue *q, const char *page, size_t count) 40queue_requests_store(struct request_queue *q, const char *page, size_t count)
48{ 41{
49 struct request_list *rl; 42 struct request_list *rl = &q->rq;
50 unsigned long nr; 43 unsigned long nr;
51 int ret; 44 int ret;
52 45
@@ -54,9 +47,6 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
54 return -EINVAL; 47 return -EINVAL;
55 48
56 ret = queue_var_store(&nr, page, count); 49 ret = queue_var_store(&nr, page, count);
57 if (ret < 0)
58 return ret;
59
60 if (nr < BLKDEV_MIN_RQ) 50 if (nr < BLKDEV_MIN_RQ)
61 nr = BLKDEV_MIN_RQ; 51 nr = BLKDEV_MIN_RQ;
62 52
@@ -64,9 +54,6 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
64 q->nr_requests = nr; 54 q->nr_requests = nr;
65 blk_queue_congestion_threshold(q); 55 blk_queue_congestion_threshold(q);
66 56
67 /* congestion isn't cgroup aware and follows root blkcg for now */
68 rl = &q->root_rl;
69
70 if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) 57 if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
71 blk_set_queue_congested(q, BLK_RW_SYNC); 58 blk_set_queue_congested(q, BLK_RW_SYNC);
72 else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) 59 else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
@@ -77,22 +64,19 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
77 else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) 64 else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
78 blk_clear_queue_congested(q, BLK_RW_ASYNC); 65 blk_clear_queue_congested(q, BLK_RW_ASYNC);
79 66
80 blk_queue_for_each_rl(rl, q) { 67 if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
81 if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { 68 blk_set_queue_full(q, BLK_RW_SYNC);
82 blk_set_rl_full(rl, BLK_RW_SYNC); 69 } else {
83 } else { 70 blk_clear_queue_full(q, BLK_RW_SYNC);
84 blk_clear_rl_full(rl, BLK_RW_SYNC); 71 wake_up(&rl->wait[BLK_RW_SYNC]);
85 wake_up(&rl->wait[BLK_RW_SYNC]);
86 }
87
88 if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
89 blk_set_rl_full(rl, BLK_RW_ASYNC);
90 } else {
91 blk_clear_rl_full(rl, BLK_RW_ASYNC);
92 wake_up(&rl->wait[BLK_RW_ASYNC]);
93 }
94 } 72 }
95 73
74 if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
75 blk_set_queue_full(q, BLK_RW_ASYNC);
76 } else {
77 blk_clear_queue_full(q, BLK_RW_ASYNC);
78 wake_up(&rl->wait[BLK_RW_ASYNC]);
79 }
96 spin_unlock_irq(q->queue_lock); 80 spin_unlock_irq(q->queue_lock);
97 return ret; 81 return ret;
98} 82}
@@ -111,9 +95,6 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count)
111 unsigned long ra_kb; 95 unsigned long ra_kb;
112 ssize_t ret = queue_var_store(&ra_kb, page, count); 96 ssize_t ret = queue_var_store(&ra_kb, page, count);
113 97
114 if (ret < 0)
115 return ret;
116
117 q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10); 98 q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10);
118 99
119 return ret; 100 return ret;
@@ -180,13 +161,6 @@ static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *pag
180 return queue_var_show(queue_discard_zeroes_data(q), page); 161 return queue_var_show(queue_discard_zeroes_data(q), page);
181} 162}
182 163
183static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
184{
185 return sprintf(page, "%llu\n",
186 (unsigned long long)q->limits.max_write_same_sectors << 9);
187}
188
189
190static ssize_t 164static ssize_t
191queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) 165queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
192{ 166{
@@ -195,9 +169,6 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
195 page_kb = 1 << (PAGE_CACHE_SHIFT - 10); 169 page_kb = 1 << (PAGE_CACHE_SHIFT - 10);
196 ssize_t ret = queue_var_store(&max_sectors_kb, page, count); 170 ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
197 171
198 if (ret < 0)
199 return ret;
200
201 if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) 172 if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
202 return -EINVAL; 173 return -EINVAL;
203 174
@@ -258,9 +229,6 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
258 unsigned long nm; 229 unsigned long nm;
259 ssize_t ret = queue_var_store(&nm, page, count); 230 ssize_t ret = queue_var_store(&nm, page, count);
260 231
261 if (ret < 0)
262 return ret;
263
264 spin_lock_irq(q->queue_lock); 232 spin_lock_irq(q->queue_lock);
265 queue_flag_clear(QUEUE_FLAG_NOMERGES, q); 233 queue_flag_clear(QUEUE_FLAG_NOMERGES, q);
266 queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); 234 queue_flag_clear(QUEUE_FLAG_NOXMERGES, q);
@@ -289,9 +257,6 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
289 unsigned long val; 257 unsigned long val;
290 258
291 ret = queue_var_store(&val, page, count); 259 ret = queue_var_store(&val, page, count);
292 if (ret < 0)
293 return ret;
294
295 spin_lock_irq(q->queue_lock); 260 spin_lock_irq(q->queue_lock);
296 if (val == 2) { 261 if (val == 2) {
297 queue_flag_set(QUEUE_FLAG_SAME_COMP, q); 262 queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
@@ -392,11 +357,6 @@ static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
392 .show = queue_discard_zeroes_data_show, 357 .show = queue_discard_zeroes_data_show,
393}; 358};
394 359
395static struct queue_sysfs_entry queue_write_same_max_entry = {
396 .attr = {.name = "write_same_max_bytes", .mode = S_IRUGO },
397 .show = queue_write_same_max_show,
398};
399
400static struct queue_sysfs_entry queue_nonrot_entry = { 360static struct queue_sysfs_entry queue_nonrot_entry = {
401 .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, 361 .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
402 .show = queue_show_nonrot, 362 .show = queue_show_nonrot,
@@ -444,7 +404,6 @@ static struct attribute *default_attrs[] = {
444 &queue_discard_granularity_entry.attr, 404 &queue_discard_granularity_entry.attr,
445 &queue_discard_max_entry.attr, 405 &queue_discard_max_entry.attr,
446 &queue_discard_zeroes_data_entry.attr, 406 &queue_discard_zeroes_data_entry.attr,
447 &queue_write_same_max_entry.attr,
448 &queue_nonrot_entry.attr, 407 &queue_nonrot_entry.attr,
449 &queue_nomerges_entry.attr, 408 &queue_nomerges_entry.attr,
450 &queue_rq_affinity_entry.attr, 409 &queue_rq_affinity_entry.attr,
@@ -466,7 +425,7 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
466 if (!entry->show) 425 if (!entry->show)
467 return -EIO; 426 return -EIO;
468 mutex_lock(&q->sysfs_lock); 427 mutex_lock(&q->sysfs_lock);
469 if (blk_queue_dying(q)) { 428 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
470 mutex_unlock(&q->sysfs_lock); 429 mutex_unlock(&q->sysfs_lock);
471 return -ENOENT; 430 return -ENOENT;
472 } 431 }
@@ -488,7 +447,7 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
488 447
489 q = container_of(kobj, struct request_queue, kobj); 448 q = container_of(kobj, struct request_queue, kobj);
490 mutex_lock(&q->sysfs_lock); 449 mutex_lock(&q->sysfs_lock);
491 if (blk_queue_dying(q)) { 450 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
492 mutex_unlock(&q->sysfs_lock); 451 mutex_unlock(&q->sysfs_lock);
493 return -ENOENT; 452 return -ENOENT;
494 } 453 }
@@ -498,11 +457,11 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
498} 457}
499 458
500/** 459/**
501 * blk_release_queue: - release a &struct request_queue when it is no longer needed 460 * blk_cleanup_queue: - release a &struct request_queue when it is no longer needed
502 * @kobj: the kobj belonging to the request queue to be released 461 * @kobj: the kobj belonging of the request queue to be released
503 * 462 *
504 * Description: 463 * Description:
505 * blk_release_queue is the pair to blk_init_queue() or 464 * blk_cleanup_queue is the pair to blk_init_queue() or
506 * blk_queue_make_request(). It should be called when a request queue is 465 * blk_queue_make_request(). It should be called when a request queue is
507 * being released; typically when a block device is being de-registered. 466 * being released; typically when a block device is being de-registered.
508 * Currently, its primary task it to free all the &struct request 467 * Currently, its primary task it to free all the &struct request
@@ -516,19 +475,17 @@ static void blk_release_queue(struct kobject *kobj)
516{ 475{
517 struct request_queue *q = 476 struct request_queue *q =
518 container_of(kobj, struct request_queue, kobj); 477 container_of(kobj, struct request_queue, kobj);
478 struct request_list *rl = &q->rq;
519 479
520 blk_sync_queue(q); 480 blk_sync_queue(q);
521 481
522 blkcg_exit_queue(q); 482 if (q->elevator)
523
524 if (q->elevator) {
525 spin_lock_irq(q->queue_lock);
526 ioc_clear_queue(q);
527 spin_unlock_irq(q->queue_lock);
528 elevator_exit(q->elevator); 483 elevator_exit(q->elevator);
529 }
530 484
531 blk_exit_rl(&q->root_rl); 485 blk_throtl_exit(q);
486
487 if (rl->rq_pool)
488 mempool_destroy(rl->rq_pool);
532 489
533 if (q->queue_tags) 490 if (q->queue_tags)
534 __blk_queue_free_tags(q); 491 __blk_queue_free_tags(q);
@@ -536,8 +493,6 @@ static void blk_release_queue(struct kobject *kobj)
536 blk_trace_shutdown(q); 493 blk_trace_shutdown(q);
537 494
538 bdi_destroy(&q->backing_dev_info); 495 bdi_destroy(&q->backing_dev_info);
539
540 ida_simple_remove(&blk_queue_ida, q->id);
541 kmem_cache_free(blk_requestq_cachep, q); 496 kmem_cache_free(blk_requestq_cachep, q);
542} 497}
543 498
@@ -561,12 +516,6 @@ int blk_register_queue(struct gendisk *disk)
561 if (WARN_ON(!q)) 516 if (WARN_ON(!q))
562 return -ENXIO; 517 return -ENXIO;
563 518
564 /*
565 * Initialization must be complete by now. Finish the initial
566 * bypass from queue allocation.
567 */
568 blk_queue_bypass_end(q);
569
570 ret = blk_trace_init_sysfs(dev); 519 ret = blk_trace_init_sysfs(dev);
571 if (ret) 520 if (ret)
572 return ret; 521 return ret;
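
The sysfs hunks above revert the show/store guards to an open-coded test of QUEUE_FLAG_DEAD taken under sysfs_lock, so attribute access fails with -ENOENT once queue teardown has started. A minimal userspace sketch of that guard pattern, assuming hypothetical names and a pthread mutex in place of sysfs_lock:

/* Illustrative sketch, not part of the patch: fail attribute access once
 * the object has been marked dead, checked under the same lock the
 * attribute handlers take.  All names here are hypothetical.
 */
#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_queue {
	pthread_mutex_t sysfs_lock;
	bool dead;		/* set once teardown has begun */
	long nr_requests;	/* the attribute being shown */
};

static int fake_attr_show(struct fake_queue *q, char *buf, size_t len)
{
	int ret;

	pthread_mutex_lock(&q->sysfs_lock);
	if (q->dead) {		/* mirrors test_bit(QUEUE_FLAG_DEAD, ...) */
		pthread_mutex_unlock(&q->sysfs_lock);
		return -ENOENT;
	}
	ret = snprintf(buf, len, "%ld\n", q->nr_requests);
	pthread_mutex_unlock(&q->sysfs_lock);
	return ret;
}
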
diff --git a/block/blk-tag.c b/block/blk-tag.c
index cc345e1d8d4..ece65fc4c79 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -186,8 +186,7 @@ int blk_queue_init_tags(struct request_queue *q, int depth,
186 tags = __blk_queue_init_tags(q, depth); 186 tags = __blk_queue_init_tags(q, depth);
187 187
188 if (!tags) 188 if (!tags)
189 return -ENOMEM; 189 goto fail;
190
191 } else if (q->queue_tags) { 190 } else if (q->queue_tags) {
192 rc = blk_queue_resize_tags(q, depth); 191 rc = blk_queue_resize_tags(q, depth);
193 if (rc) 192 if (rc)
@@ -204,6 +203,9 @@ int blk_queue_init_tags(struct request_queue *q, int depth,
204 queue_flag_set_unlocked(QUEUE_FLAG_QUEUED, q); 203 queue_flag_set_unlocked(QUEUE_FLAG_QUEUED, q);
205 INIT_LIST_HEAD(&q->tag_busy_list); 204 INIT_LIST_HEAD(&q->tag_busy_list);
206 return 0; 205 return 0;
206fail:
207 kfree(tags);
208 return -ENOMEM;
207} 209}
208EXPORT_SYMBOL(blk_queue_init_tags); 210EXPORT_SYMBOL(blk_queue_init_tags);
209 211
@@ -280,9 +282,16 @@ EXPORT_SYMBOL(blk_queue_resize_tags);
280void blk_queue_end_tag(struct request_queue *q, struct request *rq) 282void blk_queue_end_tag(struct request_queue *q, struct request *rq)
281{ 283{
282 struct blk_queue_tag *bqt = q->queue_tags; 284 struct blk_queue_tag *bqt = q->queue_tags;
283 unsigned tag = rq->tag; /* negative tags invalid */ 285 int tag = rq->tag;
286
287 BUG_ON(tag == -1);
284 288
285 BUG_ON(tag >= bqt->real_max_depth); 289 if (unlikely(tag >= bqt->real_max_depth))
290 /*
291 * This can happen after tag depth has been reduced.
292 * FIXME: how about a warning or info message here?
293 */
294 return;
286 295
287 list_del_init(&rq->queuelist); 296 list_del_init(&rq->queuelist);
288 rq->cmd_flags &= ~REQ_QUEUED; 297 rq->cmd_flags &= ~REQ_QUEUED;
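
The blk_queue_end_tag() hunk above swaps the unconditional BUG_ON for a quiet return when the completing request carries a tag at or above real_max_depth, which can legitimately happen once the tag depth has been reduced while older requests were still in flight. A hedged userspace sketch of that tolerance, with hypothetical types:

/* Illustrative sketch, not part of the patch: releasing a tag that is
 * beyond the current depth is tolerated rather than treated as a bug,
 * because the map may have been shrunk underneath an in-flight request.
 */
#include <stdbool.h>
#include <stdlib.h>

struct tag_map {
	unsigned int real_max_depth;
	bool *in_use;			/* in_use[i] is true while tag i is owned */
};

static void end_tag(struct tag_map *map, int tag)
{
	if (tag < 0)
		abort();		/* corresponds to BUG_ON(tag == -1) */
	if ((unsigned int)tag >= map->real_max_depth)
		return;			/* depth was reduced; nothing to clear */
	map->in_use[tag] = false;
}
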
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 31146225f3d..a19f58c6fc3 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -10,7 +10,6 @@
10#include <linux/bio.h> 10#include <linux/bio.h>
11#include <linux/blktrace_api.h> 11#include <linux/blktrace_api.h>
12#include "blk-cgroup.h" 12#include "blk-cgroup.h"
13#include "blk.h"
14 13
15/* Max dispatch from a group in 1 round */ 14/* Max dispatch from a group in 1 round */
16static int throtl_grp_quantum = 8; 15static int throtl_grp_quantum = 8;
@@ -21,8 +20,6 @@ static int throtl_quantum = 32;
21/* Throttling is performed over 100ms slice and after that slice is renewed */ 20/* Throttling is performed over 100ms slice and after that slice is renewed */
22static unsigned long throtl_slice = HZ/10; /* 100 ms */ 21static unsigned long throtl_slice = HZ/10; /* 100 ms */
23 22
24static struct blkcg_policy blkcg_policy_throtl;
25
26/* A workqueue to queue throttle related work */ 23/* A workqueue to queue throttle related work */
27static struct workqueue_struct *kthrotld_workqueue; 24static struct workqueue_struct *kthrotld_workqueue;
28static void throtl_schedule_delayed_work(struct throtl_data *td, 25static void throtl_schedule_delayed_work(struct throtl_data *td,
@@ -40,17 +37,9 @@ struct throtl_rb_root {
40 37
41#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) 38#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
42 39
43/* Per-cpu group stats */
44struct tg_stats_cpu {
45 /* total bytes transferred */
46 struct blkg_rwstat service_bytes;
47 /* total IOs serviced, post merge */
48 struct blkg_rwstat serviced;
49};
50
51struct throtl_grp { 40struct throtl_grp {
52 /* must be the first member */ 41 /* List of throtl groups on the request queue*/
53 struct blkg_policy_data pd; 42 struct hlist_node tg_node;
54 43
55 /* active throtl group service_tree member */ 44 /* active throtl group service_tree member */
56 struct rb_node rb_node; 45 struct rb_node rb_node;
@@ -62,6 +51,8 @@ struct throtl_grp {
62 */ 51 */
63 unsigned long disptime; 52 unsigned long disptime;
64 53
54 struct blkio_group blkg;
55 atomic_t ref;
65 unsigned int flags; 56 unsigned int flags;
66 57
67 /* Two lists for READ and WRITE */ 58 /* Two lists for READ and WRITE */
@@ -88,18 +79,18 @@ struct throtl_grp {
88 /* Some throttle limits got updated for the group */ 79 /* Some throttle limits got updated for the group */
89 int limits_changed; 80 int limits_changed;
90 81
91 /* Per cpu stats pointer */ 82 struct rcu_head rcu_head;
92 struct tg_stats_cpu __percpu *stats_cpu;
93
94 /* List of tgs waiting for per cpu stats memory to be allocated */
95 struct list_head stats_alloc_node;
96}; 83};
97 84
98struct throtl_data 85struct throtl_data
99{ 86{
87 /* List of throtl groups */
88 struct hlist_head tg_list;
89
100 /* service tree for active throtl groups */ 90 /* service tree for active throtl groups */
101 struct throtl_rb_root tg_service_tree; 91 struct throtl_rb_root tg_service_tree;
102 92
93 struct throtl_grp *root_tg;
103 struct request_queue *queue; 94 struct request_queue *queue;
104 95
105 /* Total Number of queued bios on READ and WRITE lists */ 96 /* Total Number of queued bios on READ and WRITE lists */
@@ -116,33 +107,6 @@ struct throtl_data
116 int limits_changed; 107 int limits_changed;
117}; 108};
118 109
119/* list and work item to allocate percpu group stats */
120static DEFINE_SPINLOCK(tg_stats_alloc_lock);
121static LIST_HEAD(tg_stats_alloc_list);
122
123static void tg_stats_alloc_fn(struct work_struct *);
124static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
125
126static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
127{
128 return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
129}
130
131static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
132{
133 return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
134}
135
136static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
137{
138 return pd_to_blkg(&tg->pd);
139}
140
141static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
142{
143 return blkg_to_tg(td->queue->root_blkg);
144}
145
146enum tg_state_flags { 110enum tg_state_flags {
147 THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ 111 THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */
148}; 112};
@@ -163,149 +127,254 @@ static inline int throtl_tg_##name(const struct throtl_grp *tg) \
163 127
164THROTL_TG_FNS(on_rr); 128THROTL_TG_FNS(on_rr);
165 129
166#define throtl_log_tg(td, tg, fmt, args...) do { \ 130#define throtl_log_tg(td, tg, fmt, args...) \
167 char __pbuf[128]; \ 131 blk_add_trace_msg((td)->queue, "throtl %s " fmt, \
168 \ 132 blkg_path(&(tg)->blkg), ##args); \
169 blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf)); \
170 blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \
171} while (0)
172 133
173#define throtl_log(td, fmt, args...) \ 134#define throtl_log(td, fmt, args...) \
174 blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) 135 blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
175 136
137static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
138{
139 if (blkg)
140 return container_of(blkg, struct throtl_grp, blkg);
141
142 return NULL;
143}
144
176static inline unsigned int total_nr_queued(struct throtl_data *td) 145static inline unsigned int total_nr_queued(struct throtl_data *td)
177{ 146{
178 return td->nr_queued[0] + td->nr_queued[1]; 147 return td->nr_queued[0] + td->nr_queued[1];
179} 148}
180 149
181/* 150static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
182 * Worker for allocating per cpu stat for tgs. This is scheduled on the
183 * system_wq once there are some groups on the alloc_list waiting for
184 * allocation.
185 */
186static void tg_stats_alloc_fn(struct work_struct *work)
187{ 151{
188 static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */ 152 atomic_inc(&tg->ref);
189 struct delayed_work *dwork = to_delayed_work(work); 153 return tg;
190 bool empty = false; 154}
191
192alloc_stats:
193 if (!stats_cpu) {
194 stats_cpu = alloc_percpu(struct tg_stats_cpu);
195 if (!stats_cpu) {
196 /* allocation failed, try again after some time */
197 schedule_delayed_work(dwork, msecs_to_jiffies(10));
198 return;
199 }
200 }
201
202 spin_lock_irq(&tg_stats_alloc_lock);
203 155
204 if (!list_empty(&tg_stats_alloc_list)) { 156static void throtl_free_tg(struct rcu_head *head)
205 struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list, 157{
206 struct throtl_grp, 158 struct throtl_grp *tg;
207 stats_alloc_node);
208 swap(tg->stats_cpu, stats_cpu);
209 list_del_init(&tg->stats_alloc_node);
210 }
211 159
212 empty = list_empty(&tg_stats_alloc_list); 160 tg = container_of(head, struct throtl_grp, rcu_head);
213 spin_unlock_irq(&tg_stats_alloc_lock); 161 free_percpu(tg->blkg.stats_cpu);
214 if (!empty) 162 kfree(tg);
215 goto alloc_stats;
216} 163}
217 164
218static void throtl_pd_init(struct blkcg_gq *blkg) 165static void throtl_put_tg(struct throtl_grp *tg)
219{ 166{
220 struct throtl_grp *tg = blkg_to_tg(blkg); 167 BUG_ON(atomic_read(&tg->ref) <= 0);
221 unsigned long flags; 168 if (!atomic_dec_and_test(&tg->ref))
169 return;
222 170
171 /*
172 * A group is freed in rcu manner. But having an rcu lock does not
173 * mean that one can access all the fields of blkg and assume these
174 * are valid. For example, don't try to follow throtl_data and
175 * request queue links.
176 *
177 * Having a reference to blkg under an rcu allows access to only
178 * values local to groups like group stats and group rate limits
179 */
180 call_rcu(&tg->rcu_head, throtl_free_tg);
181}
182
183static void throtl_init_group(struct throtl_grp *tg)
184{
185 INIT_HLIST_NODE(&tg->tg_node);
223 RB_CLEAR_NODE(&tg->rb_node); 186 RB_CLEAR_NODE(&tg->rb_node);
224 bio_list_init(&tg->bio_lists[0]); 187 bio_list_init(&tg->bio_lists[0]);
225 bio_list_init(&tg->bio_lists[1]); 188 bio_list_init(&tg->bio_lists[1]);
226 tg->limits_changed = false; 189 tg->limits_changed = false;
227 190
228 tg->bps[READ] = -1; 191 /* Practically unlimited BW */
229 tg->bps[WRITE] = -1; 192 tg->bps[0] = tg->bps[1] = -1;
230 tg->iops[READ] = -1; 193 tg->iops[0] = tg->iops[1] = -1;
231 tg->iops[WRITE] = -1;
232 194
233 /* 195 /*
234 * Ugh... We need to perform per-cpu allocation for tg->stats_cpu 196 * Take the initial reference that will be released on destroy
235 * but percpu allocator can't be called from IO path. Queue tg on 197 * This can be thought of a joint reference by cgroup and
236 * tg_stats_alloc_list and allocate from work item. 198 * request queue which will be dropped by either request queue
199 * exit or cgroup deletion path depending on who is exiting first.
237 */ 200 */
238 spin_lock_irqsave(&tg_stats_alloc_lock, flags); 201 atomic_set(&tg->ref, 1);
239 list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
240 schedule_delayed_work(&tg_stats_alloc_work, 0);
241 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
242} 202}
243 203
244static void throtl_pd_exit(struct blkcg_gq *blkg) 204/* Should be called with rcu read lock held (needed for blkcg) */
205static void
206throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg)
245{ 207{
246 struct throtl_grp *tg = blkg_to_tg(blkg); 208 hlist_add_head(&tg->tg_node, &td->tg_list);
247 unsigned long flags; 209 td->nr_undestroyed_grps++;
210}
248 211
249 spin_lock_irqsave(&tg_stats_alloc_lock, flags); 212static void
250 list_del_init(&tg->stats_alloc_node); 213__throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
251 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); 214{
215 struct backing_dev_info *bdi = &td->queue->backing_dev_info;
216 unsigned int major, minor;
217
218 if (!tg || tg->blkg.dev)
219 return;
252 220
253 free_percpu(tg->stats_cpu); 221 /*
222 * Fill in device details for a group which might not have been
223 * filled at group creation time as queue was being instantiated
224 * and driver had not attached a device yet
225 */
226 if (bdi->dev && dev_name(bdi->dev)) {
227 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
228 tg->blkg.dev = MKDEV(major, minor);
229 }
254} 230}
255 231
256static void throtl_pd_reset_stats(struct blkcg_gq *blkg) 232/*
233 * Should be called with without queue lock held. Here queue lock will be
234 * taken rarely. It will be taken only once during life time of a group
235 * if need be
236 */
237static void
238throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
257{ 239{
258 struct throtl_grp *tg = blkg_to_tg(blkg); 240 if (!tg || tg->blkg.dev)
259 int cpu;
260
261 if (tg->stats_cpu == NULL)
262 return; 241 return;
263 242
264 for_each_possible_cpu(cpu) { 243 spin_lock_irq(td->queue->queue_lock);
265 struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); 244 __throtl_tg_fill_dev_details(td, tg);
245 spin_unlock_irq(td->queue->queue_lock);
246}
247
248static void throtl_init_add_tg_lists(struct throtl_data *td,
249 struct throtl_grp *tg, struct blkio_cgroup *blkcg)
250{
251 __throtl_tg_fill_dev_details(td, tg);
252
253 /* Add group onto cgroup list */
254 blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
255 tg->blkg.dev, BLKIO_POLICY_THROTL);
266 256
267 blkg_rwstat_reset(&sc->service_bytes); 257 tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
268 blkg_rwstat_reset(&sc->serviced); 258 tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
259 tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
260 tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
261
262 throtl_add_group_to_td_list(td, tg);
263}
264
265/* Should be called without queue lock and outside of rcu period */
266static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td)
267{
268 struct throtl_grp *tg = NULL;
269 int ret;
270
271 tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
272 if (!tg)
273 return NULL;
274
275 ret = blkio_alloc_blkg_stats(&tg->blkg);
276
277 if (ret) {
278 kfree(tg);
279 return NULL;
269 } 280 }
281
282 throtl_init_group(tg);
283 return tg;
270} 284}
271 285
272static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td, 286static struct
273 struct blkcg *blkcg) 287throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
274{ 288{
289 struct throtl_grp *tg = NULL;
290 void *key = td;
291
275 /* 292 /*
276 * This is the common case when there are no blkcgs. Avoid lookup 293 * This is the common case when there are no blkio cgroups.
277 * in this case 294 * Avoid lookup in this case
278 */ 295 */
279 if (blkcg == &blkcg_root) 296 if (blkcg == &blkio_root_cgroup)
280 return td_root_tg(td); 297 tg = td->root_tg;
298 else
299 tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
281 300
282 return blkg_to_tg(blkg_lookup(blkcg, td->queue)); 301 __throtl_tg_fill_dev_details(td, tg);
302 return tg;
283} 303}
284 304
285static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, 305/*
286 struct blkcg *blkcg) 306 * This function returns with queue lock unlocked in case of error, like
307 * request queue is no more
308 */
309static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
287{ 310{
311 struct throtl_grp *tg = NULL, *__tg = NULL;
312 struct blkio_cgroup *blkcg;
288 struct request_queue *q = td->queue; 313 struct request_queue *q = td->queue;
289 struct throtl_grp *tg = NULL;
290 314
315 rcu_read_lock();
316 blkcg = task_blkio_cgroup(current);
317 tg = throtl_find_tg(td, blkcg);
318 if (tg) {
319 rcu_read_unlock();
320 return tg;
321 }
322
323 /*
324 * Need to allocate a group. Allocation of group also needs allocation
325 * of per cpu stats which in-turn takes a mutex() and can block. Hence
326 * we need to drop rcu lock and queue_lock before we call alloc
327 *
328 * Take the request queue reference to make sure queue does not
329 * go away once we return from allocation.
330 */
331 blk_get_queue(q);
332 rcu_read_unlock();
333 spin_unlock_irq(q->queue_lock);
334
335 tg = throtl_alloc_tg(td);
291 /* 336 /*
292 * This is the common case when there are no blkcgs. Avoid lookup 337 * We might have slept in group allocation. Make sure queue is not
293 * in this case 338 * dead
294 */ 339 */
295 if (blkcg == &blkcg_root) { 340 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
296 tg = td_root_tg(td); 341 blk_put_queue(q);
297 } else { 342 if (tg)
298 struct blkcg_gq *blkg; 343 kfree(tg);
299 344
300 blkg = blkg_lookup_create(blkcg, q); 345 return ERR_PTR(-ENODEV);
301
302 /* if %NULL and @q is alive, fall back to root_tg */
303 if (!IS_ERR(blkg))
304 tg = blkg_to_tg(blkg);
305 else if (!blk_queue_dying(q))
306 tg = td_root_tg(td);
307 } 346 }
347 blk_put_queue(q);
348
349 /* Group allocated and queue is still alive. take the lock */
350 spin_lock_irq(q->queue_lock);
308 351
352 /*
353 * Initialize the new group. After sleeping, read the blkcg again.
354 */
355 rcu_read_lock();
356 blkcg = task_blkio_cgroup(current);
357
358 /*
359 * If some other thread already allocated the group while we were
360 * not holding queue lock, free up the group
361 */
362 __tg = throtl_find_tg(td, blkcg);
363
364 if (__tg) {
365 kfree(tg);
366 rcu_read_unlock();
367 return __tg;
368 }
369
370 /* Group allocation failed. Account the IO to root group */
371 if (!tg) {
372 tg = td->root_tg;
373 return tg;
374 }
375
376 throtl_init_add_tg_lists(td, tg, blkcg);
377 rcu_read_unlock();
309 return tg; 378 return tg;
310} 379}
311 380
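
The restored throtl_get_tg() above follows a familiar shape: look up the group under the queue lock, drop the lock (and RCU) because the allocation can sleep, then re-take the lock, redo the lookup, and throw the new group away if another thread won the race; if allocation failed outright, the IO is charged to the root group. A simplified sketch of that shape, in which the lookup and insert helpers are assumed and the queue-death check is omitted:

/* Illustrative sketch, not part of the patch: allocate outside the lock,
 * re-check under the lock, discard the duplicate on a race.  The table,
 * lookup_locked(), insert_locked() and alloc_group() are hypothetical.
 */
#include <pthread.h>
#include <stdlib.h>

struct group;
struct table {
	pthread_mutex_t lock;
	struct group *root;		/* fallback when allocation fails */
};

struct group *lookup_locked(struct table *t, int key);	/* assumed helpers */
void insert_locked(struct table *t, int key, struct group *g);
struct group *alloc_group(void);

static struct group *get_group(struct table *t, int key)
{
	struct group *g, *dup;

	pthread_mutex_lock(&t->lock);
	g = lookup_locked(t, key);
	pthread_mutex_unlock(&t->lock);
	if (g)
		return g;

	g = alloc_group();		/* may sleep; no locks held here */

	pthread_mutex_lock(&t->lock);
	dup = lookup_locked(t, key);	/* did someone beat us to it? */
	if (dup) {
		free(g);		/* lose the race gracefully */
		g = dup;
	} else if (!g) {
		g = t->root;		/* allocation failed: charge the root group */
	} else {
		insert_locked(t, key, g);
	}
	pthread_mutex_unlock(&t->lock);
	return g;
}
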
@@ -674,41 +743,16 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
674 return 0; 743 return 0;
675} 744}
676 745
677static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
678 int rw)
679{
680 struct throtl_grp *tg = blkg_to_tg(blkg);
681 struct tg_stats_cpu *stats_cpu;
682 unsigned long flags;
683
684 /* If per cpu stats are not allocated yet, don't do any accounting. */
685 if (tg->stats_cpu == NULL)
686 return;
687
688 /*
689 * Disabling interrupts to provide mutual exclusion between two
690 * writes on same cpu. It probably is not needed for 64bit. Not
691 * optimizing that case yet.
692 */
693 local_irq_save(flags);
694
695 stats_cpu = this_cpu_ptr(tg->stats_cpu);
696
697 blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
698 blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
699
700 local_irq_restore(flags);
701}
702
703static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) 746static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
704{ 747{
705 bool rw = bio_data_dir(bio); 748 bool rw = bio_data_dir(bio);
749 bool sync = rw_is_sync(bio->bi_rw);
706 750
707 /* Charge the bio to the group */ 751 /* Charge the bio to the group */
708 tg->bytes_disp[rw] += bio->bi_size; 752 tg->bytes_disp[rw] += bio->bi_size;
709 tg->io_disp[rw]++; 753 tg->io_disp[rw]++;
710 754
711 throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw); 755 blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
712} 756}
713 757
714static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, 758static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
@@ -718,7 +762,7 @@ static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
718 762
719 bio_list_add(&tg->bio_lists[rw], bio); 763 bio_list_add(&tg->bio_lists[rw], bio);
720 /* Take a bio reference on tg */ 764 /* Take a bio reference on tg */
721 blkg_get(tg_to_blkg(tg)); 765 throtl_ref_get_tg(tg);
722 tg->nr_queued[rw]++; 766 tg->nr_queued[rw]++;
723 td->nr_queued[rw]++; 767 td->nr_queued[rw]++;
724 throtl_enqueue_tg(td, tg); 768 throtl_enqueue_tg(td, tg);
@@ -751,8 +795,8 @@ static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
751 795
752 bio = bio_list_pop(&tg->bio_lists[rw]); 796 bio = bio_list_pop(&tg->bio_lists[rw]);
753 tg->nr_queued[rw]--; 797 tg->nr_queued[rw]--;
754 /* Drop bio reference on blkg */ 798 /* Drop bio reference on tg */
755 blkg_put(tg_to_blkg(tg)); 799 throtl_put_tg(tg);
756 800
757 BUG_ON(td->nr_queued[rw] <= 0); 801 BUG_ON(td->nr_queued[rw] <= 0);
758 td->nr_queued[rw]--; 802 td->nr_queued[rw]--;
@@ -830,8 +874,8 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
830 874
831static void throtl_process_limit_change(struct throtl_data *td) 875static void throtl_process_limit_change(struct throtl_data *td)
832{ 876{
833 struct request_queue *q = td->queue; 877 struct throtl_grp *tg;
834 struct blkcg_gq *blkg, *n; 878 struct hlist_node *pos, *n;
835 879
836 if (!td->limits_changed) 880 if (!td->limits_changed)
837 return; 881 return;
@@ -840,9 +884,7 @@ static void throtl_process_limit_change(struct throtl_data *td)
840 884
841 throtl_log(td, "limits changed"); 885 throtl_log(td, "limits changed");
842 886
843 list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { 887 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
844 struct throtl_grp *tg = blkg_to_tg(blkg);
845
846 if (!tg->limits_changed) 888 if (!tg->limits_changed)
847 continue; 889 continue;
848 890
@@ -929,164 +971,135 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
929 971
930 /* schedule work if limits changed even if no bio is queued */ 972 /* schedule work if limits changed even if no bio is queued */
931 if (total_nr_queued(td) || td->limits_changed) { 973 if (total_nr_queued(td) || td->limits_changed) {
932 mod_delayed_work(kthrotld_workqueue, dwork, delay); 974 /*
975 * We might have a work scheduled to be executed in future.
976 * Cancel that and schedule a new one.
977 */
978 __cancel_delayed_work(dwork);
979 queue_delayed_work(kthrotld_workqueue, dwork, delay);
933 throtl_log(td, "schedule work. delay=%lu jiffies=%lu", 980 throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
934 delay, jiffies); 981 delay, jiffies);
935 } 982 }
936} 983}
937 984
938static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, 985static void
939 struct blkg_policy_data *pd, int off) 986throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
940{ 987{
941 struct throtl_grp *tg = pd_to_tg(pd); 988 /* Something wrong if we are trying to remove same group twice */
942 struct blkg_rwstat rwstat = { }, tmp; 989 BUG_ON(hlist_unhashed(&tg->tg_node));
943 int i, cpu;
944 990
945 for_each_possible_cpu(cpu) { 991 hlist_del_init(&tg->tg_node);
946 struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
947
948 tmp = blkg_rwstat_read((void *)sc + off);
949 for (i = 0; i < BLKG_RWSTAT_NR; i++)
950 rwstat.cnt[i] += tmp.cnt[i];
951 }
952 992
953 return __blkg_prfill_rwstat(sf, pd, &rwstat); 993 /*
994 * Put the reference taken at the time of creation so that when all
995 * queues are gone, group can be destroyed.
996 */
997 throtl_put_tg(tg);
998 td->nr_undestroyed_grps--;
954} 999}
955 1000
956static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft, 1001static void throtl_release_tgs(struct throtl_data *td)
957 struct seq_file *sf)
958{ 1002{
959 struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 1003 struct hlist_node *pos, *n;
1004 struct throtl_grp *tg;
960 1005
961 blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl, 1006 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
962 cft->private, true); 1007 /*
963 return 0; 1008 * If cgroup removal path got to blk_group first and removed
1009 * it from cgroup list, then it will take care of destroying
1010 * cfqg also.
1011 */
1012 if (!blkiocg_del_blkio_group(&tg->blkg))
1013 throtl_destroy_tg(td, tg);
1014 }
964} 1015}
965 1016
966static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, 1017static void throtl_td_free(struct throtl_data *td)
967 int off)
968{ 1018{
969 struct throtl_grp *tg = pd_to_tg(pd); 1019 kfree(td);
970 u64 v = *(u64 *)((void *)tg + off);
971
972 if (v == -1)
973 return 0;
974 return __blkg_prfill_u64(sf, pd, v);
975} 1020}
976 1021
977static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, 1022/*
978 int off) 1023 * Blk cgroup controller notification saying that blkio_group object is being
1024 * delinked as associated cgroup object is going away. That also means that
1025 * no new IO will come in this group. So get rid of this group as soon as
1026 * any pending IO in the group is finished.
1027 *
1028 * This function is called under rcu_read_lock(). key is the rcu protected
1029 * pointer. That means "key" is a valid throtl_data pointer as long as we are
1030 * rcu read lock.
1031 *
1032 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
1033 * it should not be NULL as even if queue was going away, cgroup deletion
1034 * path got to it first.
1035 */
1036void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
979{ 1037{
980 struct throtl_grp *tg = pd_to_tg(pd); 1038 unsigned long flags;
981 unsigned int v = *(unsigned int *)((void *)tg + off); 1039 struct throtl_data *td = key;
982 1040
983 if (v == -1) 1041 spin_lock_irqsave(td->queue->queue_lock, flags);
984 return 0; 1042 throtl_destroy_tg(td, tg_of_blkg(blkg));
985 return __blkg_prfill_u64(sf, pd, v); 1043 spin_unlock_irqrestore(td->queue->queue_lock, flags);
986} 1044}
987 1045
988static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft, 1046static void throtl_update_blkio_group_common(struct throtl_data *td,
989 struct seq_file *sf) 1047 struct throtl_grp *tg)
990{ 1048{
991 blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64, 1049 xchg(&tg->limits_changed, true);
992 &blkcg_policy_throtl, cft->private, false); 1050 xchg(&td->limits_changed, true);
993 return 0; 1051 /* Schedule a work now to process the limit change */
1052 throtl_schedule_delayed_work(td, 0);
994} 1053}
995 1054
996static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft, 1055/*
997 struct seq_file *sf) 1056 * For all update functions, key should be a valid pointer because these
1057 * update functions are called under blkcg_lock, that means, blkg is
1058 * valid and in turn key is valid. queue exit path can not race because
1059 * of blkcg_lock
1060 *
1061 * Can not take queue lock in update functions as queue lock under blkcg_lock
1062 * is not allowed. Under other paths we take blkcg_lock under queue_lock.
1063 */
1064static void throtl_update_blkio_group_read_bps(void *key,
1065 struct blkio_group *blkg, u64 read_bps)
998{ 1066{
999 blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint, 1067 struct throtl_data *td = key;
1000 &blkcg_policy_throtl, cft->private, false); 1068 struct throtl_grp *tg = tg_of_blkg(blkg);
1001 return 0; 1069
1070 tg->bps[READ] = read_bps;
1071 throtl_update_blkio_group_common(td, tg);
1002} 1072}
1003 1073
1004static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, 1074static void throtl_update_blkio_group_write_bps(void *key,
1005 bool is_u64) 1075 struct blkio_group *blkg, u64 write_bps)
1006{ 1076{
1007 struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 1077 struct throtl_data *td = key;
1008 struct blkg_conf_ctx ctx; 1078 struct throtl_grp *tg = tg_of_blkg(blkg);
1009 struct throtl_grp *tg;
1010 struct throtl_data *td;
1011 int ret;
1012
1013 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
1014 if (ret)
1015 return ret;
1016
1017 tg = blkg_to_tg(ctx.blkg);
1018 td = ctx.blkg->q->td;
1019
1020 if (!ctx.v)
1021 ctx.v = -1;
1022
1023 if (is_u64)
1024 *(u64 *)((void *)tg + cft->private) = ctx.v;
1025 else
1026 *(unsigned int *)((void *)tg + cft->private) = ctx.v;
1027
1028 /* XXX: we don't need the following deferred processing */
1029 xchg(&tg->limits_changed, true);
1030 xchg(&td->limits_changed, true);
1031 throtl_schedule_delayed_work(td, 0);
1032 1079
1033 blkg_conf_finish(&ctx); 1080 tg->bps[WRITE] = write_bps;
1034 return 0; 1081 throtl_update_blkio_group_common(td, tg);
1035} 1082}
1036 1083
1037static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft, 1084static void throtl_update_blkio_group_read_iops(void *key,
1038 const char *buf) 1085 struct blkio_group *blkg, unsigned int read_iops)
1039{ 1086{
1040 return tg_set_conf(cgrp, cft, buf, true); 1087 struct throtl_data *td = key;
1088 struct throtl_grp *tg = tg_of_blkg(blkg);
1089
1090 tg->iops[READ] = read_iops;
1091 throtl_update_blkio_group_common(td, tg);
1041} 1092}
1042 1093
1043static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft, 1094static void throtl_update_blkio_group_write_iops(void *key,
1044 const char *buf) 1095 struct blkio_group *blkg, unsigned int write_iops)
1045{ 1096{
1046 return tg_set_conf(cgrp, cft, buf, false); 1097 struct throtl_data *td = key;
1047} 1098 struct throtl_grp *tg = tg_of_blkg(blkg);
1048 1099
1049static struct cftype throtl_files[] = { 1100 tg->iops[WRITE] = write_iops;
1050 { 1101 throtl_update_blkio_group_common(td, tg);
1051 .name = "throttle.read_bps_device", 1102}
1052 .private = offsetof(struct throtl_grp, bps[READ]),
1053 .read_seq_string = tg_print_conf_u64,
1054 .write_string = tg_set_conf_u64,
1055 .max_write_len = 256,
1056 },
1057 {
1058 .name = "throttle.write_bps_device",
1059 .private = offsetof(struct throtl_grp, bps[WRITE]),
1060 .read_seq_string = tg_print_conf_u64,
1061 .write_string = tg_set_conf_u64,
1062 .max_write_len = 256,
1063 },
1064 {
1065 .name = "throttle.read_iops_device",
1066 .private = offsetof(struct throtl_grp, iops[READ]),
1067 .read_seq_string = tg_print_conf_uint,
1068 .write_string = tg_set_conf_uint,
1069 .max_write_len = 256,
1070 },
1071 {
1072 .name = "throttle.write_iops_device",
1073 .private = offsetof(struct throtl_grp, iops[WRITE]),
1074 .read_seq_string = tg_print_conf_uint,
1075 .write_string = tg_set_conf_uint,
1076 .max_write_len = 256,
1077 },
1078 {
1079 .name = "throttle.io_service_bytes",
1080 .private = offsetof(struct tg_stats_cpu, service_bytes),
1081 .read_seq_string = tg_print_cpu_rwstat,
1082 },
1083 {
1084 .name = "throttle.io_serviced",
1085 .private = offsetof(struct tg_stats_cpu, serviced),
1086 .read_seq_string = tg_print_cpu_rwstat,
1087 },
1088 { } /* terminate */
1089};
1090 1103
1091static void throtl_shutdown_wq(struct request_queue *q) 1104static void throtl_shutdown_wq(struct request_queue *q)
1092{ 1105{
@@ -1095,26 +1108,32 @@ static void throtl_shutdown_wq(struct request_queue *q)
1095 cancel_delayed_work_sync(&td->throtl_work); 1108 cancel_delayed_work_sync(&td->throtl_work);
1096} 1109}
1097 1110
1098static struct blkcg_policy blkcg_policy_throtl = { 1111static struct blkio_policy_type blkio_policy_throtl = {
1099 .pd_size = sizeof(struct throtl_grp), 1112 .ops = {
1100 .cftypes = throtl_files, 1113 .blkio_unlink_group_fn = throtl_unlink_blkio_group,
1101 1114 .blkio_update_group_read_bps_fn =
1102 .pd_init_fn = throtl_pd_init, 1115 throtl_update_blkio_group_read_bps,
1103 .pd_exit_fn = throtl_pd_exit, 1116 .blkio_update_group_write_bps_fn =
1104 .pd_reset_stats_fn = throtl_pd_reset_stats, 1117 throtl_update_blkio_group_write_bps,
1118 .blkio_update_group_read_iops_fn =
1119 throtl_update_blkio_group_read_iops,
1120 .blkio_update_group_write_iops_fn =
1121 throtl_update_blkio_group_write_iops,
1122 },
1123 .plid = BLKIO_POLICY_THROTL,
1105}; 1124};
1106 1125
1107bool blk_throtl_bio(struct request_queue *q, struct bio *bio) 1126int blk_throtl_bio(struct request_queue *q, struct bio **biop)
1108{ 1127{
1109 struct throtl_data *td = q->td; 1128 struct throtl_data *td = q->td;
1110 struct throtl_grp *tg; 1129 struct throtl_grp *tg;
1130 struct bio *bio = *biop;
1111 bool rw = bio_data_dir(bio), update_disptime = true; 1131 bool rw = bio_data_dir(bio), update_disptime = true;
1112 struct blkcg *blkcg; 1132 struct blkio_cgroup *blkcg;
1113 bool throttled = false;
1114 1133
1115 if (bio->bi_rw & REQ_THROTTLED) { 1134 if (bio->bi_rw & REQ_THROTTLED) {
1116 bio->bi_rw &= ~REQ_THROTTLED; 1135 bio->bi_rw &= ~REQ_THROTTLED;
1117 goto out; 1136 return 0;
1118 } 1137 }
1119 1138
1120 /* 1139 /*
@@ -1122,25 +1141,38 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1122 * basic fields like stats and io rates. If a group has no rules, 1141 * basic fields like stats and io rates. If a group has no rules,
1123 * just update the dispatch stats in lockless manner and return. 1142 * just update the dispatch stats in lockless manner and return.
1124 */ 1143 */
1144
1125 rcu_read_lock(); 1145 rcu_read_lock();
1126 blkcg = bio_blkcg(bio); 1146 blkcg = task_blkio_cgroup(current);
1127 tg = throtl_lookup_tg(td, blkcg); 1147 tg = throtl_find_tg(td, blkcg);
1128 if (tg) { 1148 if (tg) {
1149 throtl_tg_fill_dev_details(td, tg);
1150
1129 if (tg_no_rule_group(tg, rw)) { 1151 if (tg_no_rule_group(tg, rw)) {
1130 throtl_update_dispatch_stats(tg_to_blkg(tg), 1152 blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size,
1131 bio->bi_size, bio->bi_rw); 1153 rw, rw_is_sync(bio->bi_rw));
1132 goto out_unlock_rcu; 1154 rcu_read_unlock();
1155 return 0;
1133 } 1156 }
1134 } 1157 }
1158 rcu_read_unlock();
1135 1159
1136 /* 1160 /*
1137 * Either group has not been allocated yet or it is not an unlimited 1161 * Either group has not been allocated yet or it is not an unlimited
1138 * IO group 1162 * IO group
1139 */ 1163 */
1164
1140 spin_lock_irq(q->queue_lock); 1165 spin_lock_irq(q->queue_lock);
1141 tg = throtl_lookup_create_tg(td, blkcg); 1166 tg = throtl_get_tg(td);
1142 if (unlikely(!tg)) 1167
1143 goto out_unlock; 1168 if (IS_ERR(tg)) {
1169 if (PTR_ERR(tg) == -ENODEV) {
1170 /*
1171 * Queue is gone. No queue lock held here.
1172 */
1173 return -ENODEV;
1174 }
1175 }
1144 1176
1145 if (tg->nr_queued[rw]) { 1177 if (tg->nr_queued[rw]) {
1146 /* 1178 /*
@@ -1168,7 +1200,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1168 * So keep on trimming slice even if bio is not queued. 1200 * So keep on trimming slice even if bio is not queued.
1169 */ 1201 */
1170 throtl_trim_slice(td, tg, rw); 1202 throtl_trim_slice(td, tg, rw);
1171 goto out_unlock; 1203 goto out;
1172 } 1204 }
1173 1205
1174queue_bio: 1206queue_bio:
@@ -1179,87 +1211,92 @@ queue_bio:
1179 tg->io_disp[rw], tg->iops[rw], 1211 tg->io_disp[rw], tg->iops[rw],
1180 tg->nr_queued[READ], tg->nr_queued[WRITE]); 1212 tg->nr_queued[READ], tg->nr_queued[WRITE]);
1181 1213
1182 bio_associate_current(bio);
1183 throtl_add_bio_tg(q->td, tg, bio); 1214 throtl_add_bio_tg(q->td, tg, bio);
1184 throttled = true; 1215 *biop = NULL;
1185 1216
1186 if (update_disptime) { 1217 if (update_disptime) {
1187 tg_update_disptime(td, tg); 1218 tg_update_disptime(td, tg);
1188 throtl_schedule_next_dispatch(td); 1219 throtl_schedule_next_dispatch(td);
1189 } 1220 }
1190 1221
1191out_unlock:
1192 spin_unlock_irq(q->queue_lock);
1193out_unlock_rcu:
1194 rcu_read_unlock();
1195out: 1222out:
1196 return throttled;
1197}
1198
1199/**
1200 * blk_throtl_drain - drain throttled bios
1201 * @q: request_queue to drain throttled bios for
1202 *
1203 * Dispatch all currently throttled bios on @q through ->make_request_fn().
1204 */
1205void blk_throtl_drain(struct request_queue *q)
1206 __releases(q->queue_lock) __acquires(q->queue_lock)
1207{
1208 struct throtl_data *td = q->td;
1209 struct throtl_rb_root *st = &td->tg_service_tree;
1210 struct throtl_grp *tg;
1211 struct bio_list bl;
1212 struct bio *bio;
1213
1214 queue_lockdep_assert_held(q);
1215
1216 bio_list_init(&bl);
1217
1218 while ((tg = throtl_rb_first(st))) {
1219 throtl_dequeue_tg(td, tg);
1220
1221 while ((bio = bio_list_peek(&tg->bio_lists[READ])))
1222 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
1223 while ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
1224 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
1225 }
1226 spin_unlock_irq(q->queue_lock); 1223 spin_unlock_irq(q->queue_lock);
1227 1224 return 0;
1228 while ((bio = bio_list_pop(&bl)))
1229 generic_make_request(bio);
1230
1231 spin_lock_irq(q->queue_lock);
1232} 1225}
1233 1226
1234int blk_throtl_init(struct request_queue *q) 1227int blk_throtl_init(struct request_queue *q)
1235{ 1228{
1236 struct throtl_data *td; 1229 struct throtl_data *td;
1237 int ret; 1230 struct throtl_grp *tg;
1238 1231
1239 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); 1232 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
1240 if (!td) 1233 if (!td)
1241 return -ENOMEM; 1234 return -ENOMEM;
1242 1235
1236 INIT_HLIST_HEAD(&td->tg_list);
1243 td->tg_service_tree = THROTL_RB_ROOT; 1237 td->tg_service_tree = THROTL_RB_ROOT;
1244 td->limits_changed = false; 1238 td->limits_changed = false;
1245 INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); 1239 INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
1246 1240
1247 q->td = td; 1241 /* alloc and Init root group. */
1248 td->queue = q; 1242 td->queue = q;
1243 tg = throtl_alloc_tg(td);
1249 1244
1250 /* activate policy */ 1245 if (!tg) {
1251 ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
1252 if (ret)
1253 kfree(td); 1246 kfree(td);
1254 return ret; 1247 return -ENOMEM;
1248 }
1249
1250 td->root_tg = tg;
1251
1252 rcu_read_lock();
1253 throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup);
1254 rcu_read_unlock();
1255
1256 /* Attach throtl data to request queue */
1257 q->td = td;
1258 return 0;
1255} 1259}
1256 1260
1257void blk_throtl_exit(struct request_queue *q) 1261void blk_throtl_exit(struct request_queue *q)
1258{ 1262{
1259 BUG_ON(!q->td); 1263 struct throtl_data *td = q->td;
1264 bool wait = false;
1265
1266 BUG_ON(!td);
1267
1260 throtl_shutdown_wq(q); 1268 throtl_shutdown_wq(q);
1261 blkcg_deactivate_policy(q, &blkcg_policy_throtl); 1269
1262 kfree(q->td); 1270 spin_lock_irq(q->queue_lock);
1271 throtl_release_tgs(td);
1272
1273 /* If there are other groups */
1274 if (td->nr_undestroyed_grps > 0)
1275 wait = true;
1276
1277 spin_unlock_irq(q->queue_lock);
1278
1279 /*
1280 * Wait for tg->blkg->key accessors to exit their grace periods.
1281 * Do this wait only if there are other undestroyed groups out
1282 * there (other than root group). This can happen if cgroup deletion
1283 * path claimed the responsibility of cleaning up a group before
1284 * queue cleanup code get to the group.
1285 *
1286 * Do not call synchronize_rcu() unconditionally as there are drivers
1287 * which create/delete request queue hundreds of times during scan/boot
1288 * and synchronize_rcu() can take significant time and slow down boot.
1289 */
1290 if (wait)
1291 synchronize_rcu();
1292
1293 /*
1294 * Just being safe to make sure after previous flush if some body did
1295 * update limits through cgroup and another work got queued, cancel
1296 * it.
1297 */
1298 throtl_shutdown_wq(q);
1299 throtl_td_free(td);
1263} 1300}
1264 1301
1265static int __init throtl_init(void) 1302static int __init throtl_init(void)
@@ -1268,7 +1305,8 @@ static int __init throtl_init(void)
1268 if (!kthrotld_workqueue) 1305 if (!kthrotld_workqueue)
1269 panic("Failed to create kthrotld\n"); 1306 panic("Failed to create kthrotld\n");
1270 1307
1271 return blkcg_policy_register(&blkcg_policy_throtl); 1308 blkio_policy_register(&blkio_policy_throtl);
1309 return 0;
1272} 1310}
1273 1311
1274module_init(throtl_init); 1312module_init(throtl_init);
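
Taken together, the blk-throttle.c hunks reinstate group lifetime management built on an atomic reference count plus an RCU-deferred free (throtl_ref_get_tg(), throtl_put_tg(), throtl_free_tg()): the group starts with one joint queue/cgroup reference, each queued bio takes another, and the last put schedules the free through call_rcu() so lockless readers of per-group fields can finish. A rough userspace model of that counting, assuming C11 atomics; the one simplification is that the final free happens immediately rather than after a grace period:

/* Illustrative sketch, not part of the patch: reference counting as in
 * throtl_grp.  The kernel defers the final kfree() through call_rcu();
 * here the free is immediate, which is the only simplification.
 */
#include <stdatomic.h>
#include <stdlib.h>

struct group {
	atomic_int ref;
	/* rate limits, queued bios, ... */
};

static struct group *group_alloc(void)
{
	struct group *g = calloc(1, sizeof(*g));

	if (g)
		atomic_store(&g->ref, 1);	/* joint queue/cgroup reference */
	return g;
}

static struct group *group_get(struct group *g)
{
	atomic_fetch_add(&g->ref, 1);		/* e.g. one per queued bio */
	return g;
}

static void group_put(struct group *g)
{
	if (atomic_fetch_sub(&g->ref, 1) == 1)	/* dropped to zero */
		free(g);			/* kernel: call_rcu(&tg->rcu_head, throtl_free_tg) */
}
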
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 6e4744cbfb5..78035488895 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -197,3 +197,44 @@ void blk_add_timer(struct request *req)
197 mod_timer(&q->timeout, expiry); 197 mod_timer(&q->timeout, expiry);
198} 198}
199 199
200/**
201 * blk_abort_queue -- Abort all requests on given queue
202 * @queue: pointer to queue
203 *
204 */
205void blk_abort_queue(struct request_queue *q)
206{
207 unsigned long flags;
208 struct request *rq, *tmp;
209 LIST_HEAD(list);
210
211 /*
212 * Not a request based block device, nothing to abort
213 */
214 if (!q->request_fn)
215 return;
216
217 spin_lock_irqsave(q->queue_lock, flags);
218
219 elv_abort_queue(q);
220
221 /*
222 * Splice entries to local list, to avoid deadlocking if entries
223 * get readded to the timeout list by error handling
224 */
225 list_splice_init(&q->timeout_list, &list);
226
227 list_for_each_entry_safe(rq, tmp, &list, timeout_list)
228 blk_abort_request(rq);
229
230 /*
231 * Occasionally, blk_abort_request() will return without
232 * deleting the element from the list. Make sure we add those back
233 * instead of leaving them on the local stack list.
234 */
235 list_splice(&list, &q->timeout_list);
236
237 spin_unlock_irqrestore(q->queue_lock, flags);
238
239}
240EXPORT_SYMBOL_GPL(blk_abort_queue);
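
blk_abort_queue() above splices the timeout list onto a stack-local list before walking it, so error handlers that re-add requests to q->timeout_list cannot turn the walk into an endless loop, and whatever survives the walk is spliced back afterwards. A simplified sketch of the detach-then-walk idiom; here the walk runs without the lock and nothing is spliced back, and all types are hypothetical:

/* Illustrative sketch, not part of the patch: detach the whole list under
 * the lock, then walk the private copy so re-queued entries land on the
 * shared list instead of the one being walked.
 */
#include <pthread.h>
#include <stddef.h>

struct node {
	struct node *next;
};

struct timeout_list {
	pthread_mutex_t lock;
	struct node *head;
};

static void abort_all(struct timeout_list *tl, void (*abort_one)(struct node *))
{
	struct node *local, *n;

	pthread_mutex_lock(&tl->lock);
	local = tl->head;		/* detach everything currently queued */
	tl->head = NULL;
	pthread_mutex_unlock(&tl->lock);

	while ((n = local) != NULL) {
		local = n->next;
		abort_one(n);		/* may re-queue n onto tl->head */
	}
}
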
diff --git a/block/blk.h b/block/blk.h
index 47fdfdd4152..20b900a377c 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -1,8 +1,6 @@
1#ifndef BLK_INTERNAL_H 1#ifndef BLK_INTERNAL_H
2#define BLK_INTERNAL_H 2#define BLK_INTERNAL_H
3 3
4#include <linux/idr.h>
5
6/* Amount of time in which a process may batch requests */ 4/* Amount of time in which a process may batch requests */
7#define BLK_BATCH_TIME (HZ/50UL) 5#define BLK_BATCH_TIME (HZ/50UL)
8 6
@@ -11,23 +9,12 @@
11 9
12extern struct kmem_cache *blk_requestq_cachep; 10extern struct kmem_cache *blk_requestq_cachep;
13extern struct kobj_type blk_queue_ktype; 11extern struct kobj_type blk_queue_ktype;
14extern struct ida blk_queue_ida;
15
16static inline void __blk_get_queue(struct request_queue *q)
17{
18 kobject_get(&q->kobj);
19}
20 12
21int blk_init_rl(struct request_list *rl, struct request_queue *q,
22 gfp_t gfp_mask);
23void blk_exit_rl(struct request_list *rl);
24void init_request_from_bio(struct request *req, struct bio *bio); 13void init_request_from_bio(struct request *req, struct bio *bio);
25void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 14void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
26 struct bio *bio); 15 struct bio *bio);
27int blk_rq_append_bio(struct request_queue *q, struct request *rq, 16int blk_rq_append_bio(struct request_queue *q, struct request *rq,
28 struct bio *bio); 17 struct bio *bio);
29void blk_queue_bypass_start(struct request_queue *q);
30void blk_queue_bypass_end(struct request_queue *q);
31void blk_dequeue_request(struct request *rq); 18void blk_dequeue_request(struct request *rq);
32void __blk_queue_free_tags(struct request_queue *q); 19void __blk_queue_free_tags(struct request_queue *q);
33bool __blk_end_bidi_request(struct request *rq, int error, 20bool __blk_end_bidi_request(struct request *rq, int error,
@@ -36,6 +23,7 @@ bool __blk_end_bidi_request(struct request *rq, int error,
36void blk_rq_timed_out_timer(unsigned long data); 23void blk_rq_timed_out_timer(unsigned long data);
37void blk_delete_timer(struct request *); 24void blk_delete_timer(struct request *);
38void blk_add_timer(struct request *); 25void blk_add_timer(struct request *);
26void __generic_unplug_device(struct request_queue *);
39 27
40/* 28/*
41 * Internal atomic flags for request handling 29 * Internal atomic flags for request handling
@@ -96,8 +84,8 @@ static inline struct request *__elv_next_request(struct request_queue *q)
96 q->flush_queue_delayed = 1; 84 q->flush_queue_delayed = 1;
97 return NULL; 85 return NULL;
98 } 86 }
99 if (unlikely(blk_queue_dying(q)) || 87 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags) ||
100 !q->elevator->type->ops.elevator_dispatch_fn(q, 0)) 88 !q->elevator->ops->elevator_dispatch_fn(q, 0))
101 return NULL; 89 return NULL;
102 } 90 }
103} 91}
@@ -106,16 +94,16 @@ static inline void elv_activate_rq(struct request_queue *q, struct request *rq)
106{ 94{
107 struct elevator_queue *e = q->elevator; 95 struct elevator_queue *e = q->elevator;
108 96
109 if (e->type->ops.elevator_activate_req_fn) 97 if (e->ops->elevator_activate_req_fn)
110 e->type->ops.elevator_activate_req_fn(q, rq); 98 e->ops->elevator_activate_req_fn(q, rq);
111} 99}
112 100
113static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq) 101static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq)
114{ 102{
115 struct elevator_queue *e = q->elevator; 103 struct elevator_queue *e = q->elevator;
116 104
117 if (e->type->ops.elevator_deactivate_req_fn) 105 if (e->ops->elevator_deactivate_req_fn)
118 e->type->ops.elevator_deactivate_req_fn(q, rq); 106 e->ops->elevator_deactivate_req_fn(q, rq);
119} 107}
120 108
121#ifdef CONFIG_FAIL_IO_TIMEOUT 109#ifdef CONFIG_FAIL_IO_TIMEOUT
@@ -130,6 +118,8 @@ static inline int blk_should_fake_timeout(struct request_queue *q)
130} 118}
131#endif 119#endif
132 120
121struct io_context *current_io_context(gfp_t gfp_flags, int node);
122
133int ll_back_merge_fn(struct request_queue *q, struct request *req, 123int ll_back_merge_fn(struct request_queue *q, struct request *req,
134 struct bio *bio); 124 struct bio *bio);
135int ll_front_merge_fn(struct request_queue *q, struct request *req, 125int ll_front_merge_fn(struct request_queue *q, struct request *req,
@@ -140,15 +130,14 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
140 struct request *next); 130 struct request *next);
141void blk_recalc_rq_segments(struct request *rq); 131void blk_recalc_rq_segments(struct request *rq);
142void blk_rq_set_mixed_merge(struct request *rq); 132void blk_rq_set_mixed_merge(struct request *rq);
143bool blk_rq_merge_ok(struct request *rq, struct bio *bio);
144int blk_try_merge(struct request *rq, struct bio *bio);
145 133
146void blk_queue_congestion_threshold(struct request_queue *q); 134void blk_queue_congestion_threshold(struct request_queue *q);
147 135
148void __blk_run_queue_uncond(struct request_queue *q);
149
150int blk_dev_init(void); 136int blk_dev_init(void);
151 137
138void elv_quiesce_start(struct request_queue *q);
139void elv_quiesce_end(struct request_queue *q);
140
152 141
153/* 142/*
154 * Return the threshold (number of used requests) at which the queue is 143 * Return the threshold (number of used requests) at which the queue is
@@ -168,67 +157,35 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
168 return q->nr_congestion_off; 157 return q->nr_congestion_off;
169} 158}
170 159
160static inline int blk_cpu_to_group(int cpu)
161{
162 int group = NR_CPUS;
163#ifdef CONFIG_SCHED_MC
164 const struct cpumask *mask = cpu_coregroup_mask(cpu);
165 group = cpumask_first(mask);
166#elif defined(CONFIG_SCHED_SMT)
167 group = cpumask_first(topology_thread_cpumask(cpu));
168#else
169 return cpu;
170#endif
171 if (likely(group < NR_CPUS))
172 return group;
173 return cpu;
174}
175
171/* 176/*
172 * Contribute to IO statistics IFF: 177 * Contribute to IO statistics IFF:
173 * 178 *
174 * a) it's attached to a gendisk, and 179 * a) it's attached to a gendisk, and
175 * b) the queue had IO stats enabled when this request was started, and 180 * b) the queue had IO stats enabled when this request was started, and
176 * c) it's a file system request 181 * c) it's a file system request or a discard request
177 */ 182 */
178static inline int blk_do_io_stat(struct request *rq) 183static inline int blk_do_io_stat(struct request *rq)
179{ 184{
180 return rq->rq_disk && 185 return rq->rq_disk &&
181 (rq->cmd_flags & REQ_IO_STAT) && 186 (rq->cmd_flags & REQ_IO_STAT) &&
182 (rq->cmd_type == REQ_TYPE_FS); 187 (rq->cmd_type == REQ_TYPE_FS ||
183} 188 (rq->cmd_flags & REQ_DISCARD));
184
185/*
186 * Internal io_context interface
187 */
188void get_io_context(struct io_context *ioc);
189struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q);
190struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
191 gfp_t gfp_mask);
192void ioc_clear_queue(struct request_queue *q);
193
194int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
195
196/**
197 * create_io_context - try to create task->io_context
198 * @gfp_mask: allocation mask
199 * @node: allocation node
200 *
201 * If %current->io_context is %NULL, allocate a new io_context and install
202 * it. Returns the current %current->io_context which may be %NULL if
203 * allocation failed.
204 *
205 * Note that this function can't be called with IRQ disabled because
206 * task_lock which protects %current->io_context is IRQ-unsafe.
207 */
208static inline struct io_context *create_io_context(gfp_t gfp_mask, int node)
209{
210 WARN_ON_ONCE(irqs_disabled());
211 if (unlikely(!current->io_context))
212 create_task_io_context(current, gfp_mask, node);
213 return current->io_context;
214} 189}
215 190
216/* 191#endif
217 * Internal throttling interface
218 */
219#ifdef CONFIG_BLK_DEV_THROTTLING
220extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio);
221extern void blk_throtl_drain(struct request_queue *q);
222extern int blk_throtl_init(struct request_queue *q);
223extern void blk_throtl_exit(struct request_queue *q);
224#else /* CONFIG_BLK_DEV_THROTTLING */
225static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
226{
227 return false;
228}
229static inline void blk_throtl_drain(struct request_queue *q) { }
230static inline int blk_throtl_init(struct request_queue *q) { return 0; }
231static inline void blk_throtl_exit(struct request_queue *q) { }
232#endif /* CONFIG_BLK_DEV_THROTTLING */
233
234#endif /* BLK_INTERNAL_H */
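
Among the blk.h changes, blk_cpu_to_group() returns: it reports the first CPU of the core or SMT sibling mask as the completion group and falls back to the CPU itself when no usable topology information exists. A toy illustration of that selection, with a plain 64-bit bitmap standing in for a cpumask and all names hypothetical:

/* Illustrative sketch, not part of the patch: pick the first CPU in a
 * sibling mask as the group id, or the CPU itself when the mask is empty.
 * Uses a GCC/Clang builtin for "find first set bit".
 */
#include <stdint.h>

#define MAX_CPUS 64

static int cpu_to_group(int cpu, uint64_t sibling_mask)
{
	int group;

	if (!sibling_mask)
		return cpu;			/* no topology info: group == cpu */
	group = __builtin_ctzll(sibling_mask);	/* first CPU in the mask */
	return group < MAX_CPUS ? group : cpu;	/* mirrors the NR_CPUS check */
}
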
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 650f427d915..6690e6e4103 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -25,7 +25,7 @@
25#include <linux/delay.h> 25#include <linux/delay.h>
26#include <linux/scatterlist.h> 26#include <linux/scatterlist.h>
27#include <linux/bsg-lib.h> 27#include <linux/bsg-lib.h>
28#include <linux/export.h> 28#include <linux/module.h>
29#include <scsi/scsi_cmnd.h> 29#include <scsi/scsi_cmnd.h>
30 30
31/** 31/**
@@ -151,6 +151,19 @@ failjob_rls_job:
151 return -ENOMEM; 151 return -ENOMEM;
152} 152}
153 153
154/*
155 * bsg_goose_queue - restart queue in case it was stopped
156 * @q: request q to be restarted
157 */
158void bsg_goose_queue(struct request_queue *q)
159{
160 if (!q)
161 return;
162
163 blk_run_queue_async(q);
164}
165EXPORT_SYMBOL_GPL(bsg_goose_queue);
166
154/** 167/**
155 * bsg_request_fn - generic handler for bsg requests 168 * bsg_request_fn - generic handler for bsg requests
156 * @q: request queue to manage 169 * @q: request queue to manage
@@ -230,3 +243,56 @@ int bsg_setup_queue(struct device *dev, struct request_queue *q,
230 return 0; 243 return 0;
231} 244}
232EXPORT_SYMBOL_GPL(bsg_setup_queue); 245EXPORT_SYMBOL_GPL(bsg_setup_queue);
246
247/**
248 * bsg_remove_queue - Deletes the bsg dev from the q
249 * @q: the request_queue that is to be torn down.
250 *
251 * Notes:
252 * Before unregistering the queue empty any requests that are blocked
253 */
254void bsg_remove_queue(struct request_queue *q)
255{
256 struct request *req; /* block request */
257 int counts; /* totals for request_list count and starved */
258
259 if (!q)
260 return;
261
262 /* Stop taking in new requests */
263 spin_lock_irq(q->queue_lock);
264 blk_stop_queue(q);
265
266 /* drain all requests in the queue */
267 while (1) {
268 /* need the lock to fetch a request
269 * this may fetch the same request as the previous pass
270 */
271 req = blk_fetch_request(q);
272 /* save requests in use and starved */
273 counts = q->rq.count[0] + q->rq.count[1] +
274 q->rq.starved[0] + q->rq.starved[1];
275 spin_unlock_irq(q->queue_lock);
276 /* any requests still outstanding? */
277 if (counts == 0)
278 break;
279
280 /* This may be the same req as the previous iteration,
281 * always send the blk_end_request_all after a prefetch.
282 * It is not okay to not end the request because the
283 * prefetch started the request.
284 */
285 if (req) {
286 /* return -ENXIO to indicate that this queue is
287 * going away
288 */
289 req->errors = -ENXIO;
290 blk_end_request_all(req, -ENXIO);
291 }
292
293 msleep(200); /* allow bsg to possibly finish */
294 spin_lock_irq(q->queue_lock);
295 }
296 bsg_unregister_queue(q);
297}
298EXPORT_SYMBOL_GPL(bsg_remove_queue);
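
bsg_remove_queue() above drains a stopped queue by repeatedly fetching a request, failing it with -ENXIO, and sleeping until nothing remains either in use or starved. A compressed sketch of that drain loop, with a hypothetical queue type whose single outstanding counter stands in for the request_list counts:

/* Illustrative sketch, not part of the patch: drain loop in the shape of
 * bsg_remove_queue().  The queue type and helpers are hypothetical.
 */
#include <errno.h>
#include <pthread.h>
#include <unistd.h>

struct req;
struct toy_queue {
	pthread_mutex_t lock;
	int outstanding;		/* queued + starved requests */
};

struct req *fetch_request_locked(struct toy_queue *q);	/* assumed helpers */
void fail_request(struct req *r, int error);

static void drain_queue(struct toy_queue *q)
{
	struct req *r;
	int left;

	for (;;) {
		pthread_mutex_lock(&q->lock);
		r = fetch_request_locked(q);
		left = q->outstanding;
		pthread_mutex_unlock(&q->lock);

		if (!left)
			break;			/* nothing queued or in flight */
		if (r)
			fail_request(r, -ENXIO); /* the queue is going away */
		usleep(200 * 1000);		/* let in-flight work finish */
	}
}
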
diff --git a/block/bsg.c b/block/bsg.c
index ff64ae3bace..702f1316bb8 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -769,10 +769,12 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
769 struct file *file) 769 struct file *file)
770{ 770{
771 struct bsg_device *bd; 771 struct bsg_device *bd;
772 int ret;
772#ifdef BSG_DEBUG 773#ifdef BSG_DEBUG
773 unsigned char buf[32]; 774 unsigned char buf[32];
774#endif 775#endif
775 if (!blk_get_queue(rq)) 776 ret = blk_get_queue(rq);
777 if (ret)
776 return ERR_PTR(-ENXIO); 778 return ERR_PTR(-ENXIO);
777 779
778 bd = bsg_alloc_device(); 780 bd = bsg_alloc_device();
@@ -983,8 +985,7 @@ void bsg_unregister_queue(struct request_queue *q)
983 985
984 mutex_lock(&bsg_mutex); 986 mutex_lock(&bsg_mutex);
985 idr_remove(&bsg_minor_idr, bcd->minor); 987 idr_remove(&bsg_minor_idr, bcd->minor);
986 if (q->kobj.sd) 988 sysfs_remove_link(&q->kobj, "bsg");
987 sysfs_remove_link(&q->kobj, "bsg");
988 device_unregister(bcd->class_dev); 989 device_unregister(bcd->class_dev);
989 bcd->class_dev = NULL; 990 bcd->class_dev = NULL;
990 kref_put(&bcd->ref, bsg_kref_release_function); 991 kref_put(&bcd->ref, bsg_kref_release_function);
@@ -1069,7 +1070,7 @@ EXPORT_SYMBOL_GPL(bsg_register_queue);
1069 1070
1070static struct cdev bsg_cdev; 1071static struct cdev bsg_cdev;
1071 1072
1072static char *bsg_devnode(struct device *dev, umode_t *mode) 1073static char *bsg_devnode(struct device *dev, mode_t *mode)
1073{ 1074{
1074 return kasprintf(GFP_KERNEL, "bsg/%s", dev_name(dev)); 1075 return kasprintf(GFP_KERNEL, "bsg/%s", dev_name(dev));
1075} 1076}
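
The bsg_add_device() hunk adapts to the older blk_get_queue() convention visible in this tree, where the function returns 0 on success and non-zero once the queue is already going away, instead of a boolean. A tiny sketch of a try-get with those semantics, using C11 atomics and hypothetical names:

/* Illustrative sketch, not part of the patch: a try-get that returns 0 on
 * success and non-zero when the object is already being torn down.
 */
#include <stdatomic.h>
#include <stdbool.h>

struct object {
	atomic_int refs;
	atomic_bool dying;
};

static int object_try_get(struct object *obj)
{
	if (atomic_load(&obj->dying))
		return -1;		/* caller maps this to -ENXIO */
	atomic_fetch_add(&obj->refs, 1);
	return 0;
}
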
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index e62e9205b80..4c12869fcf7 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -14,8 +14,7 @@
14#include <linux/rbtree.h> 14#include <linux/rbtree.h>
15#include <linux/ioprio.h> 15#include <linux/ioprio.h>
16#include <linux/blktrace_api.h> 16#include <linux/blktrace_api.h>
17#include "blk.h" 17#include "cfq.h"
18#include "blk-cgroup.h"
19 18
20/* 19/*
21 * tunables 20 * tunables
@@ -54,11 +53,20 @@ static const int cfq_hist_divisor = 4;
54#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32) 53#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
55#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) 54#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
56 55
57#define RQ_CIC(rq) icq_to_cic((rq)->elv.icq) 56#define RQ_CIC(rq) \
58#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elv.priv[0]) 57 ((struct cfq_io_context *) (rq)->elevator_private[0])
59#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elv.priv[1]) 58#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private[1])
59#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private[2])
60 60
61static struct kmem_cache *cfq_pool; 61static struct kmem_cache *cfq_pool;
62static struct kmem_cache *cfq_ioc_pool;
63
64static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
65static struct completion *ioc_gone;
66static DEFINE_SPINLOCK(ioc_gone_lock);
67
68static DEFINE_SPINLOCK(cic_index_lock);
69static DEFINE_IDA(cic_index_ida);
62 70
63#define CFQ_PRIO_LISTS IOPRIO_BE_NR 71#define CFQ_PRIO_LISTS IOPRIO_BE_NR
64#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) 72#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
@@ -67,14 +75,6 @@ static struct kmem_cache *cfq_pool;
67#define sample_valid(samples) ((samples) > 80) 75#define sample_valid(samples) ((samples) > 80)
68#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) 76#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
69 77
70struct cfq_ttime {
71 unsigned long last_end_request;
72
73 unsigned long ttime_total;
74 unsigned long ttime_samples;
75 unsigned long ttime_mean;
76};
77
78/* 78/*
79 * Most of our rbtree usage is for sorting with min extraction, so 79 * Most of our rbtree usage is for sorting with min extraction, so
80 * if we cache the leftmost node we don't have to walk down the tree 80 * if we cache the leftmost node we don't have to walk down the tree
@@ -171,53 +171,8 @@ enum wl_type_t {
171 SYNC_WORKLOAD = 2 171 SYNC_WORKLOAD = 2
172}; 172};
173 173
174struct cfqg_stats {
175#ifdef CONFIG_CFQ_GROUP_IOSCHED
176 /* total bytes transferred */
177 struct blkg_rwstat service_bytes;
178 /* total IOs serviced, post merge */
179 struct blkg_rwstat serviced;
180 /* number of ios merged */
181 struct blkg_rwstat merged;
182 /* total time spent on device in ns, may not be accurate w/ queueing */
183 struct blkg_rwstat service_time;
184 /* total time spent waiting in scheduler queue in ns */
185 struct blkg_rwstat wait_time;
186 /* number of IOs queued up */
187 struct blkg_rwstat queued;
188 /* total sectors transferred */
189 struct blkg_stat sectors;
190 /* total disk time and nr sectors dispatched by this group */
191 struct blkg_stat time;
192#ifdef CONFIG_DEBUG_BLK_CGROUP
193 /* time not charged to this cgroup */
194 struct blkg_stat unaccounted_time;
195 /* sum of number of ios queued across all samples */
196 struct blkg_stat avg_queue_size_sum;
197 /* count of samples taken for average */
198 struct blkg_stat avg_queue_size_samples;
199 /* how many times this group has been removed from service tree */
200 struct blkg_stat dequeue;
201 /* total time spent waiting for it to be assigned a timeslice. */
202 struct blkg_stat group_wait_time;
203 /* time spent idling for this blkcg_gq */
204 struct blkg_stat idle_time;
205 /* total time with empty current active q with other requests queued */
206 struct blkg_stat empty_time;
207 /* fields after this shouldn't be cleared on stat reset */
208 uint64_t start_group_wait_time;
209 uint64_t start_idle_time;
210 uint64_t start_empty_time;
211 uint16_t flags;
212#endif /* CONFIG_DEBUG_BLK_CGROUP */
213#endif /* CONFIG_CFQ_GROUP_IOSCHED */
214};
215
216/* This is per cgroup per device grouping structure */ 174/* This is per cgroup per device grouping structure */
217struct cfq_group { 175struct cfq_group {
218 /* must be the first member */
219 struct blkg_policy_data pd;
220
221 /* group service_tree member */ 176 /* group service_tree member */
222 struct rb_node rb_node; 177 struct rb_node rb_node;
223 178
@@ -225,7 +180,7 @@ struct cfq_group {
225 u64 vdisktime; 180 u64 vdisktime;
226 unsigned int weight; 181 unsigned int weight;
227 unsigned int new_weight; 182 unsigned int new_weight;
228 unsigned int dev_weight; 183 bool needs_update;
229 184
230 /* number of cfqq currently on this group */ 185 /* number of cfqq currently on this group */
231 int nr_cfqq; 186 int nr_cfqq;
@@ -251,21 +206,14 @@ struct cfq_group {
251 unsigned long saved_workload_slice; 206 unsigned long saved_workload_slice;
252 enum wl_type_t saved_workload; 207 enum wl_type_t saved_workload;
253 enum wl_prio_t saved_serving_prio; 208 enum wl_prio_t saved_serving_prio;
254 209 struct blkio_group blkg;
210#ifdef CONFIG_CFQ_GROUP_IOSCHED
211 struct hlist_node cfqd_node;
212 int ref;
213#endif
255 /* number of requests that are on the dispatch list or inside driver */ 214 /* number of requests that are on the dispatch list or inside driver */
256 int dispatched; 215 int dispatched;
257 struct cfq_ttime ttime; 216 struct cfq_ttime ttime;
258 struct cfqg_stats stats;
259};
260
261struct cfq_io_cq {
262 struct io_cq icq; /* must be the first member */
263 struct cfq_queue *cfqq[2];
264 struct cfq_ttime ttime;
265 int ioprio; /* the current ioprio */
266#ifdef CONFIG_CFQ_GROUP_IOSCHED
267 uint64_t blkcg_id; /* the current blkcg ID */
268#endif
269}; 217};
270 218
271/* 219/*
@@ -275,7 +223,7 @@ struct cfq_data {
275 struct request_queue *queue; 223 struct request_queue *queue;
276 /* Root service tree for cfq_groups */ 224 /* Root service tree for cfq_groups */
277 struct cfq_rb_root grp_service_tree; 225 struct cfq_rb_root grp_service_tree;
278 struct cfq_group *root_group; 226 struct cfq_group root_group;
279 227
280 /* 228 /*
281 * The priority currently being served 229 * The priority currently being served
@@ -319,7 +267,7 @@ struct cfq_data {
319 struct work_struct unplug_work; 267 struct work_struct unplug_work;
320 268
321 struct cfq_queue *active_queue; 269 struct cfq_queue *active_queue;
322 struct cfq_io_cq *active_cic; 270 struct cfq_io_context *active_cic;
323 271
324 /* 272 /*
325 * async queue for each priority case 273 * async queue for each priority case
@@ -341,7 +289,9 @@ struct cfq_data {
341 unsigned int cfq_slice_idle; 289 unsigned int cfq_slice_idle;
342 unsigned int cfq_group_idle; 290 unsigned int cfq_group_idle;
343 unsigned int cfq_latency; 291 unsigned int cfq_latency;
344 unsigned int cfq_target_latency; 292
293 unsigned int cic_index;
294 struct list_head cic_list;
345 295
346 /* 296 /*
347 * Fallback dummy cfqq for extreme OOM conditions 297 * Fallback dummy cfqq for extreme OOM conditions
@@ -349,6 +299,12 @@ struct cfq_data {
349 struct cfq_queue oom_cfqq; 299 struct cfq_queue oom_cfqq;
350 300
351 unsigned long last_delayed_sync; 301 unsigned long last_delayed_sync;
302
303 /* List of cfq groups being managed on this device*/
304 struct hlist_head cfqg_list;
305
306 /* Number of groups which are on blkcg->blkg_list */
307 unsigned int nr_blkcg_linked_grps;
352}; 308};
353 309
354static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); 310static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
@@ -411,286 +367,21 @@ CFQ_CFQQ_FNS(deep);
411CFQ_CFQQ_FNS(wait_busy); 367CFQ_CFQQ_FNS(wait_busy);
412#undef CFQ_CFQQ_FNS 368#undef CFQ_CFQQ_FNS
413 369
414static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
415{
416 return pd ? container_of(pd, struct cfq_group, pd) : NULL;
417}
418
419static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
420{
421 return pd_to_blkg(&cfqg->pd);
422}
423
424#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
425
426/* cfqg stats flags */
427enum cfqg_stats_flags {
428 CFQG_stats_waiting = 0,
429 CFQG_stats_idling,
430 CFQG_stats_empty,
431};
432
433#define CFQG_FLAG_FNS(name) \
434static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats) \
435{ \
436 stats->flags |= (1 << CFQG_stats_##name); \
437} \
438static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats) \
439{ \
440 stats->flags &= ~(1 << CFQG_stats_##name); \
441} \
442static inline int cfqg_stats_##name(struct cfqg_stats *stats) \
443{ \
444 return (stats->flags & (1 << CFQG_stats_##name)) != 0; \
445} \
446
447CFQG_FLAG_FNS(waiting)
448CFQG_FLAG_FNS(idling)
449CFQG_FLAG_FNS(empty)
450#undef CFQG_FLAG_FNS
451
452/* This should be called with the queue_lock held. */
453static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
454{
455 unsigned long long now;
456
457 if (!cfqg_stats_waiting(stats))
458 return;
459
460 now = sched_clock();
461 if (time_after64(now, stats->start_group_wait_time))
462 blkg_stat_add(&stats->group_wait_time,
463 now - stats->start_group_wait_time);
464 cfqg_stats_clear_waiting(stats);
465}
466
467/* This should be called with the queue_lock held. */
468static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
469 struct cfq_group *curr_cfqg)
470{
471 struct cfqg_stats *stats = &cfqg->stats;
472
473 if (cfqg_stats_waiting(stats))
474 return;
475 if (cfqg == curr_cfqg)
476 return;
477 stats->start_group_wait_time = sched_clock();
478 cfqg_stats_mark_waiting(stats);
479}
480
481/* This should be called with the queue_lock held. */
482static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
483{
484 unsigned long long now;
485
486 if (!cfqg_stats_empty(stats))
487 return;
488
489 now = sched_clock();
490 if (time_after64(now, stats->start_empty_time))
491 blkg_stat_add(&stats->empty_time,
492 now - stats->start_empty_time);
493 cfqg_stats_clear_empty(stats);
494}
495
496static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
497{
498 blkg_stat_add(&cfqg->stats.dequeue, 1);
499}
500
501static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
502{
503 struct cfqg_stats *stats = &cfqg->stats;
504
505 if (blkg_rwstat_sum(&stats->queued))
506 return;
507
508 /*
509 * group is already marked empty. This can happen if cfqq got new
510 * request in parent group and moved to this group while being added
511 * to service tree. Just ignore the event and move on.
512 */
513 if (cfqg_stats_empty(stats))
514 return;
515
516 stats->start_empty_time = sched_clock();
517 cfqg_stats_mark_empty(stats);
518}
519
520static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
521{
522 struct cfqg_stats *stats = &cfqg->stats;
523
524 if (cfqg_stats_idling(stats)) {
525 unsigned long long now = sched_clock();
526
527 if (time_after64(now, stats->start_idle_time))
528 blkg_stat_add(&stats->idle_time,
529 now - stats->start_idle_time);
530 cfqg_stats_clear_idling(stats);
531 }
532}
533
534static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
535{
536 struct cfqg_stats *stats = &cfqg->stats;
537
538 BUG_ON(cfqg_stats_idling(stats));
539
540 stats->start_idle_time = sched_clock();
541 cfqg_stats_mark_idling(stats);
542}
543
544static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
545{
546 struct cfqg_stats *stats = &cfqg->stats;
547
548 blkg_stat_add(&stats->avg_queue_size_sum,
549 blkg_rwstat_sum(&stats->queued));
550 blkg_stat_add(&stats->avg_queue_size_samples, 1);
551 cfqg_stats_update_group_wait_time(stats);
552}
553
554#else /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
555
556static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { }
557static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
558static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
559static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
560static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
561static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
562static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
563
564#endif /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
565
566#ifdef CONFIG_CFQ_GROUP_IOSCHED 370#ifdef CONFIG_CFQ_GROUP_IOSCHED
567 371#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
568static struct blkcg_policy blkcg_policy_cfq;
569
570static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
571{
572 return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
573}
574
575static inline void cfqg_get(struct cfq_group *cfqg)
576{
577 return blkg_get(cfqg_to_blkg(cfqg));
578}
579
580static inline void cfqg_put(struct cfq_group *cfqg)
581{
582 return blkg_put(cfqg_to_blkg(cfqg));
583}
584
585#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) do { \
586 char __pbuf[128]; \
587 \
588 blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf)); \
589 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ 372 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
590 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ 373 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
591 __pbuf, ##args); \ 374 blkg_path(&(cfqq)->cfqg->blkg), ##args)
592} while (0)
593
594#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do { \
595 char __pbuf[128]; \
596 \
597 blkg_path(cfqg_to_blkg(cfqg), __pbuf, sizeof(__pbuf)); \
598 blk_add_trace_msg((cfqd)->queue, "%s " fmt, __pbuf, ##args); \
599} while (0)
600
601static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
602 struct cfq_group *curr_cfqg, int rw)
603{
604 blkg_rwstat_add(&cfqg->stats.queued, rw, 1);
605 cfqg_stats_end_empty_time(&cfqg->stats);
606 cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
607}
608 375
609static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, 376#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \
610 unsigned long time, unsigned long unaccounted_time) 377 blk_add_trace_msg((cfqd)->queue, "%s " fmt, \
611{ 378 blkg_path(&(cfqg)->blkg), ##args) \
612 blkg_stat_add(&cfqg->stats.time, time);
613#ifdef CONFIG_DEBUG_BLK_CGROUP
614 blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
615#endif
616}
617
618static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw)
619{
620 blkg_rwstat_add(&cfqg->stats.queued, rw, -1);
621}
622
623static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw)
624{
625 blkg_rwstat_add(&cfqg->stats.merged, rw, 1);
626}
627
628static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
629 uint64_t bytes, int rw)
630{
631 blkg_stat_add(&cfqg->stats.sectors, bytes >> 9);
632 blkg_rwstat_add(&cfqg->stats.serviced, rw, 1);
633 blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes);
634}
635
636static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
637 uint64_t start_time, uint64_t io_start_time, int rw)
638{
639 struct cfqg_stats *stats = &cfqg->stats;
640 unsigned long long now = sched_clock();
641
642 if (time_after64(now, io_start_time))
643 blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
644 if (time_after64(io_start_time, start_time))
645 blkg_rwstat_add(&stats->wait_time, rw,
646 io_start_time - start_time);
647}
648
649static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
650{
651 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
652 struct cfqg_stats *stats = &cfqg->stats;
653
654 /* queued stats shouldn't be cleared */
655 blkg_rwstat_reset(&stats->service_bytes);
656 blkg_rwstat_reset(&stats->serviced);
657 blkg_rwstat_reset(&stats->merged);
658 blkg_rwstat_reset(&stats->service_time);
659 blkg_rwstat_reset(&stats->wait_time);
660 blkg_stat_reset(&stats->time);
661#ifdef CONFIG_DEBUG_BLK_CGROUP
662 blkg_stat_reset(&stats->unaccounted_time);
663 blkg_stat_reset(&stats->avg_queue_size_sum);
664 blkg_stat_reset(&stats->avg_queue_size_samples);
665 blkg_stat_reset(&stats->dequeue);
666 blkg_stat_reset(&stats->group_wait_time);
667 blkg_stat_reset(&stats->idle_time);
668 blkg_stat_reset(&stats->empty_time);
669#endif
670}
671
672#else /* CONFIG_CFQ_GROUP_IOSCHED */
673
674static inline void cfqg_get(struct cfq_group *cfqg) { }
675static inline void cfqg_put(struct cfq_group *cfqg) { }
676 379
380#else
677#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 381#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
678 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) 382 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
679#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) 383#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0)
680 384#endif
681static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
682 struct cfq_group *curr_cfqg, int rw) { }
683static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
684 unsigned long time, unsigned long unaccounted_time) { }
685static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { }
686static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { }
687static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
688 uint64_t bytes, int rw) { }
689static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
690 uint64_t start_time, uint64_t io_start_time, int rw) { }
691
692#endif /* CONFIG_CFQ_GROUP_IOSCHED */
693
694#define cfq_log(cfqd, fmt, args...) \ 385#define cfq_log(cfqd, fmt, args...) \
695 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) 386 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
696 387
@@ -771,38 +462,39 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
771} 462}
772 463
773static void cfq_dispatch_insert(struct request_queue *, struct request *); 464static void cfq_dispatch_insert(struct request_queue *, struct request *);
774static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync, 465static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
775 struct cfq_io_cq *cic, struct bio *bio, 466 struct io_context *, gfp_t);
776 gfp_t gfp_mask); 467static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
468 struct io_context *);
777 469
778static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) 470static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
471 bool is_sync)
779{ 472{
780 /* cic->icq is the first member, %NULL will convert to %NULL */ 473 return cic->cfqq[is_sync];
781 return container_of(icq, struct cfq_io_cq, icq);
782} 474}
783 475
784static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd, 476static inline void cic_set_cfqq(struct cfq_io_context *cic,
785 struct io_context *ioc) 477 struct cfq_queue *cfqq, bool is_sync)
786{ 478{
787 if (ioc) 479 cic->cfqq[is_sync] = cfqq;
788 return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
789 return NULL;
790} 480}
791 481
792static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync) 482#define CIC_DEAD_KEY 1ul
793{ 483#define CIC_DEAD_INDEX_SHIFT 1
794 return cic->cfqq[is_sync];
795}
796 484
797static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq, 485static inline void *cfqd_dead_key(struct cfq_data *cfqd)
798 bool is_sync)
799{ 486{
800 cic->cfqq[is_sync] = cfqq; 487 return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY);
801} 488}
802 489
803static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic) 490static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic)
804{ 491{
805 return cic->icq.q->elevator->elevator_data; 492 struct cfq_data *cfqd = cic->key;
493
494 if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY))
495 return NULL;
496
497 return cfqd;
806} 498}
807 499
808/* 500/*
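The removed side of this hunk relies on a small pointer-tagging trick: cfqd_dead_key() packs cic_index into the upper bits of an unsigned long and sets the low bit (CIC_DEAD_KEY), so cic_to_cfqd() can tell a dead key from a live cfq_data pointer, which is always at least 2-byte aligned and therefore never has its low bit set. A minimal userspace sketch of that encoding, with invented names and values rather than kernel code:

#include <assert.h>
#include <stdio.h>

/* Low bit marks a key as dead; the real index lives above it. */
#define DEAD_KEY         1ul
#define DEAD_INDEX_SHIFT 1

/* Analogue of cfqd_dead_key(): encode an index as a tagged "dead" key. */
static void *make_dead_key(unsigned long index)
{
	return (void *)(index << DEAD_INDEX_SHIFT | DEAD_KEY);
}

/* Analogue of the check in cic_to_cfqd(): NULL for dead keys, else the pointer. */
static void *live_pointer(void *key)
{
	if ((unsigned long)key & DEAD_KEY)
		return NULL;
	return key;
}

int main(void)
{
	static int object;                /* aligned, so its low bit is clear */
	void *dead = make_dead_key(7);

	assert(live_pointer(&object) == &object);
	assert(live_pointer(dead) == NULL);

	/* The index stays recoverable, which is what lets cic_free_func()
	 * use it for radix_tree_delete(). */
	printf("index = %lu\n", (unsigned long)dead >> DEAD_INDEX_SHIFT);
	return 0;
}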
@@ -851,7 +543,7 @@ static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
851{ 543{
852 u64 d = delta << CFQ_SERVICE_SHIFT; 544 u64 d = delta << CFQ_SERVICE_SHIFT;
853 545
854 d = d * CFQ_WEIGHT_DEFAULT; 546 d = d * BLKIO_WEIGHT_DEFAULT;
855 do_div(d, cfqg->weight); 547 do_div(d, cfqg->weight);
856 return d; 548 return d;
857} 549}
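Both columns of the hunk above compute the same vdisktime scaling: the slice a group actually consumed is shifted up for fixed-point precision, multiplied by the default weight, and divided by the group's own weight, so a group with twice the default weight accumulates vdisktime at half the rate and is scheduled again sooner. A standalone arithmetic sketch (the shift and weight constants below are illustrative stand-ins, not taken from the kernel headers):

#include <stdio.h>

#define SERVICE_SHIFT  12     /* fixed-point shift, stand-in for CFQ_SERVICE_SHIFT */
#define WEIGHT_DEFAULT 500    /* stand-in for BLKIO_WEIGHT_DEFAULT */

/* Analogue of cfq_scale_slice(): scale used time into vdisktime units. */
static unsigned long long scale_slice(unsigned long delta, unsigned int weight)
{
	unsigned long long d = (unsigned long long)delta << SERVICE_SHIFT;

	return d * WEIGHT_DEFAULT / weight;
}

int main(void)
{
	/* Two groups each burn an 8 ms slice; the heavier one advances half as far. */
	printf("weight  500 -> vdisktime += %llu\n", scale_slice(8, 500));
	printf("weight 1000 -> vdisktime += %llu\n", scale_slice(8, 1000));
	return 0;
}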
@@ -911,7 +603,7 @@ cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
911{ 603{
912 struct cfq_rb_root *st = &cfqd->grp_service_tree; 604 struct cfq_rb_root *st = &cfqd->grp_service_tree;
913 605
914 return cfqd->cfq_target_latency * cfqg->weight / st->total_weight; 606 return cfq_target_latency * cfqg->weight / st->total_weight;
915} 607}
916 608
917static inline unsigned 609static inline unsigned
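The one-line change above only switches where the latency target comes from (a per-device tunable versus a global constant); the formula itself is a straight proportional share, slice = target_latency * weight / total_weight. A quick worked example, assuming a 300 ms target and made-up group weights:

#include <stdio.h>

/* Analogue of cfq_group_slice(): proportional share of the latency target. */
static unsigned group_slice(unsigned target_latency, unsigned weight,
			    unsigned total_weight)
{
	return target_latency * weight / total_weight;
}

int main(void)
{
	unsigned target = 300;                  /* ms, illustrative latency target */
	unsigned weights[] = { 100, 200, 700 }; /* three busy groups, total 1000 */

	for (int i = 0; i < 3; i++)
		printf("group %d gets %u ms of the %u ms round\n",
		       i, group_slice(target, weights[i], 1000), target);
	return 0;
}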
@@ -1178,9 +870,9 @@ static void
1178cfq_update_group_weight(struct cfq_group *cfqg) 870cfq_update_group_weight(struct cfq_group *cfqg)
1179{ 871{
1180 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); 872 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1181 if (cfqg->new_weight) { 873 if (cfqg->needs_update) {
1182 cfqg->weight = cfqg->new_weight; 874 cfqg->weight = cfqg->new_weight;
1183 cfqg->new_weight = 0; 875 cfqg->needs_update = false;
1184 } 876 }
1185} 877}
1186 878
@@ -1242,7 +934,7 @@ cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
1242 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); 934 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
1243 cfq_group_service_tree_del(st, cfqg); 935 cfq_group_service_tree_del(st, cfqg);
1244 cfqg->saved_workload_slice = 0; 936 cfqg->saved_workload_slice = 0;
1245 cfqg_stats_update_dequeue(cfqg); 937 cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
1246} 938}
1247 939
1248static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq, 940static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
@@ -1314,59 +1006,178 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
1314 "sl_used=%u disp=%u charge=%u iops=%u sect=%lu", 1006 "sl_used=%u disp=%u charge=%u iops=%u sect=%lu",
1315 used_sl, cfqq->slice_dispatch, charge, 1007 used_sl, cfqq->slice_dispatch, charge,
1316 iops_mode(cfqd), cfqq->nr_sectors); 1008 iops_mode(cfqd), cfqq->nr_sectors);
1317 cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl); 1009 cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl,
1318 cfqg_stats_set_start_empty_time(cfqg); 1010 unaccounted_sl);
1011 cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
1319} 1012}
1320 1013
1321/** 1014#ifdef CONFIG_CFQ_GROUP_IOSCHED
1322 * cfq_init_cfqg_base - initialize base part of a cfq_group 1015static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
1323 * @cfqg: cfq_group to initialize 1016{
1324 * 1017 if (blkg)
1325 * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED 1018 return container_of(blkg, struct cfq_group, blkg);
1326 * is enabled or not. 1019 return NULL;
1020}
1021
1022static void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
1023 unsigned int weight)
1024{
1025 struct cfq_group *cfqg = cfqg_of_blkg(blkg);
1026 cfqg->new_weight = weight;
1027 cfqg->needs_update = true;
1028}
1029
1030static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd,
1031 struct cfq_group *cfqg, struct blkio_cgroup *blkcg)
1032{
1033 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
1034 unsigned int major, minor;
1035
1036 /*
1037 * Add group onto cgroup list. It might happen that bdi->dev is
1038 * not initialized yet. Initialize this new group without major
1039 * and minor info and this info will be filled in once a new thread
1040 * comes for IO.
1041 */
1042 if (bdi->dev) {
1043 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
1044 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
1045 (void *)cfqd, MKDEV(major, minor));
1046 } else
1047 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
1048 (void *)cfqd, 0);
1049
1050 cfqd->nr_blkcg_linked_grps++;
1051 cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
1052
1053 /* Add group on cfqd list */
1054 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
1055}
1056
1057/*
1058 * Should be called from sleepable context. No request queue lock as per
1059 * cpu stats are allocated dynamically and alloc_percpu needs to be called
1060 * from sleepable context.
1327 */ 1061 */
1328static void cfq_init_cfqg_base(struct cfq_group *cfqg) 1062static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd)
1329{ 1063{
1064 struct cfq_group *cfqg = NULL;
1065 int i, j, ret;
1330 struct cfq_rb_root *st; 1066 struct cfq_rb_root *st;
1331 int i, j; 1067
1068 cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
1069 if (!cfqg)
1070 return NULL;
1332 1071
1333 for_each_cfqg_st(cfqg, i, j, st) 1072 for_each_cfqg_st(cfqg, i, j, st)
1334 *st = CFQ_RB_ROOT; 1073 *st = CFQ_RB_ROOT;
1335 RB_CLEAR_NODE(&cfqg->rb_node); 1074 RB_CLEAR_NODE(&cfqg->rb_node);
1336 1075
1337 cfqg->ttime.last_end_request = jiffies; 1076 cfqg->ttime.last_end_request = jiffies;
1077
1078 /*
1079 * Take the initial reference that will be released on destroy
1080 * This can be thought of a joint reference by cgroup and
1081 * elevator which will be dropped by either elevator exit
1082 * or cgroup deletion path depending on who is exiting first.
1083 */
1084 cfqg->ref = 1;
1085
1086 ret = blkio_alloc_blkg_stats(&cfqg->blkg);
1087 if (ret) {
1088 kfree(cfqg);
1089 return NULL;
1090 }
1091
1092 return cfqg;
1338} 1093}
1339 1094
1340#ifdef CONFIG_CFQ_GROUP_IOSCHED 1095static struct cfq_group *
1341static void cfq_pd_init(struct blkcg_gq *blkg) 1096cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg)
1342{ 1097{
1343 struct cfq_group *cfqg = blkg_to_cfqg(blkg); 1098 struct cfq_group *cfqg = NULL;
1099 void *key = cfqd;
1100 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
1101 unsigned int major, minor;
1344 1102
1345 cfq_init_cfqg_base(cfqg); 1103 /*
1346 cfqg->weight = blkg->blkcg->cfq_weight; 1104 * This is the common case when there are no blkio cgroups.
1105 * Avoid lookup in this case
1106 */
1107 if (blkcg == &blkio_root_cgroup)
1108 cfqg = &cfqd->root_group;
1109 else
1110 cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
1111
1112 if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
1113 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
1114 cfqg->blkg.dev = MKDEV(major, minor);
1115 }
1116
1117 return cfqg;
1347} 1118}
1348 1119
1349/* 1120/*
1350 * Search for the cfq group current task belongs to. request_queue lock must 1121 * Search for the cfq group current task belongs to. request_queue lock must
1351 * be held. 1122 * be held.
1352 */ 1123 */
1353static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, 1124static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
1354 struct blkcg *blkcg)
1355{ 1125{
1126 struct blkio_cgroup *blkcg;
1127 struct cfq_group *cfqg = NULL, *__cfqg = NULL;
1356 struct request_queue *q = cfqd->queue; 1128 struct request_queue *q = cfqd->queue;
1357 struct cfq_group *cfqg = NULL;
1358 1129
1359 /* avoid lookup for the common case where there's no blkcg */ 1130 rcu_read_lock();
1360 if (blkcg == &blkcg_root) { 1131 blkcg = task_blkio_cgroup(current);
1361 cfqg = cfqd->root_group; 1132 cfqg = cfq_find_cfqg(cfqd, blkcg);
1362 } else { 1133 if (cfqg) {
1363 struct blkcg_gq *blkg; 1134 rcu_read_unlock();
1135 return cfqg;
1136 }
1137
1138 /*
1139 * Need to allocate a group. Allocation of group also needs allocation
1140 * of per cpu stats which in-turn takes a mutex() and can block. Hence
1141 * we need to drop rcu lock and queue_lock before we call alloc.
1142 *
1143 * Not taking any queue reference here and assuming that queue is
1144 * around by the time we return. CFQ queue allocation code does
1145 * the same. It might be racy though.
1146 */
1147
1148 rcu_read_unlock();
1149 spin_unlock_irq(q->queue_lock);
1150
1151 cfqg = cfq_alloc_cfqg(cfqd);
1364 1152
1365 blkg = blkg_lookup_create(blkcg, q); 1153 spin_lock_irq(q->queue_lock);
1366 if (!IS_ERR(blkg)) 1154
1367 cfqg = blkg_to_cfqg(blkg); 1155 rcu_read_lock();
1156 blkcg = task_blkio_cgroup(current);
1157
1158 /*
1159 * If some other thread already allocated the group while we were
1160 * not holding queue lock, free up the group
1161 */
1162 __cfqg = cfq_find_cfqg(cfqd, blkcg);
1163
1164 if (__cfqg) {
1165 kfree(cfqg);
1166 rcu_read_unlock();
1167 return __cfqg;
1368 } 1168 }
1369 1169
1170 if (!cfqg)
1171 cfqg = &cfqd->root_group;
1172
1173 cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg);
1174 rcu_read_unlock();
1175 return cfqg;
1176}
1177
1178static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1179{
1180 cfqg->ref++;
1370 return cfqg; 1181 return cfqg;
1371} 1182}
1372 1183
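The restored cfq_get_cfqg() path spells out a common pattern in its comments: the per-cpu stats allocation can block, so the function drops the rcu and queue locks, allocates, retakes the locks, and then re-checks whether another thread created the group in the meantime, freeing the duplicate if so. A minimal pthread-based sketch of that drop-lock/allocate/re-check idiom, with invented names and no RCU:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static struct group { int weight; } *the_group;   /* protected by queue_lock */

static struct group *get_group(void)
{
	struct group *g, *winner;

	pthread_mutex_lock(&queue_lock);
	if (the_group) {                       /* fast path: already created */
		g = the_group;
		pthread_mutex_unlock(&queue_lock);
		return g;
	}
	pthread_mutex_unlock(&queue_lock);     /* cannot allocate while holding it */

	g = malloc(sizeof(*g));                /* the potentially blocking step */
	if (!g)
		return NULL;
	g->weight = 500;

	pthread_mutex_lock(&queue_lock);
	winner = the_group;
	if (winner) {                          /* someone beat us to it: discard ours */
		free(g);
		g = winner;
	} else {
		the_group = g;
	}
	pthread_mutex_unlock(&queue_lock);
	return g;
}

int main(void)
{
	printf("weight = %d\n", get_group()->weight);
	return 0;
}

The kernel version additionally falls back to the root group when the allocation fails, which the sketch leaves out.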
@@ -1374,224 +1185,94 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1374{ 1185{
1375 /* Currently, all async queues are mapped to root group */ 1186 /* Currently, all async queues are mapped to root group */
1376 if (!cfq_cfqq_sync(cfqq)) 1187 if (!cfq_cfqq_sync(cfqq))
1377 cfqg = cfqq->cfqd->root_group; 1188 cfqg = &cfqq->cfqd->root_group;
1378 1189
1379 cfqq->cfqg = cfqg; 1190 cfqq->cfqg = cfqg;
1380 /* cfqq reference on cfqg */ 1191 /* cfqq reference on cfqg */
1381 cfqg_get(cfqg); 1192 cfqq->cfqg->ref++;
1382} 1193}
1383 1194
1384static u64 cfqg_prfill_weight_device(struct seq_file *sf, 1195static void cfq_put_cfqg(struct cfq_group *cfqg)
1385 struct blkg_policy_data *pd, int off)
1386{ 1196{
1387 struct cfq_group *cfqg = pd_to_cfqg(pd); 1197 struct cfq_rb_root *st;
1388 1198 int i, j;
1389 if (!cfqg->dev_weight)
1390 return 0;
1391 return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
1392}
1393
1394static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
1395 struct seq_file *sf)
1396{
1397 blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
1398 cfqg_prfill_weight_device, &blkcg_policy_cfq, 0,
1399 false);
1400 return 0;
1401}
1402 1199
1403static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft, 1200 BUG_ON(cfqg->ref <= 0);
1404 struct seq_file *sf) 1201 cfqg->ref--;
1405{ 1202 if (cfqg->ref)
1406 seq_printf(sf, "%u\n", cgroup_to_blkcg(cgrp)->cfq_weight); 1203 return;
1407 return 0; 1204 for_each_cfqg_st(cfqg, i, j, st)
1205 BUG_ON(!RB_EMPTY_ROOT(&st->rb));
1206 free_percpu(cfqg->blkg.stats_cpu);
1207 kfree(cfqg);
1408} 1208}
1409 1209
1410static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, 1210static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
1411 const char *buf)
1412{ 1211{
1413 struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 1212 /* Something wrong if we are trying to remove same group twice */
1414 struct blkg_conf_ctx ctx; 1213 BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
1415 struct cfq_group *cfqg;
1416 int ret;
1417 1214
1418 ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); 1215 hlist_del_init(&cfqg->cfqd_node);
1419 if (ret)
1420 return ret;
1421 1216
1422 ret = -EINVAL; 1217 BUG_ON(cfqd->nr_blkcg_linked_grps <= 0);
1423 cfqg = blkg_to_cfqg(ctx.blkg); 1218 cfqd->nr_blkcg_linked_grps--;
1424 if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
1425 cfqg->dev_weight = ctx.v;
1426 cfqg->new_weight = cfqg->dev_weight ?: blkcg->cfq_weight;
1427 ret = 0;
1428 }
1429 1219
1430 blkg_conf_finish(&ctx); 1220 /*
1431 return ret; 1221 * Put the reference taken at the time of creation so that when all
1222 * queues are gone, group can be destroyed.
1223 */
1224 cfq_put_cfqg(cfqg);
1432} 1225}
1433 1226
1434static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) 1227static void cfq_release_cfq_groups(struct cfq_data *cfqd)
1435{ 1228{
1436 struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 1229 struct hlist_node *pos, *n;
1437 struct blkcg_gq *blkg; 1230 struct cfq_group *cfqg;
1438 struct hlist_node *n;
1439
1440 if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
1441 return -EINVAL;
1442
1443 spin_lock_irq(&blkcg->lock);
1444 blkcg->cfq_weight = (unsigned int)val;
1445
1446 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1447 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1448 1231
1449 if (cfqg && !cfqg->dev_weight) 1232 hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) {
1450 cfqg->new_weight = blkcg->cfq_weight; 1233 /*
1234 * If cgroup removal path got to blk_group first and removed
1235 * it from cgroup list, then it will take care of destroying
1236 * cfqg also.
1237 */
1238 if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
1239 cfq_destroy_cfqg(cfqd, cfqg);
1451 } 1240 }
1452
1453 spin_unlock_irq(&blkcg->lock);
1454 return 0;
1455} 1241}
1456 1242
1457static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft, 1243/*
1458 struct seq_file *sf) 1244 * Blk cgroup controller notification saying that blkio_group object is being
1459{ 1245 * delinked as associated cgroup object is going away. That also means that
1460 struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 1246 * no new IO will come in this group. So get rid of this group as soon as
1461 1247 * any pending IO in the group is finished.
1462 blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq, 1248 *
1463 cft->private, false); 1249 * This function is called under rcu_read_lock(). key is the rcu protected
1464 return 0; 1250 * pointer. That means "key" is a valid cfq_data pointer as long as we are rcu
1465} 1251 * read lock.
1466 1252 *
1467static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft, 1253 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
 1468 struct seq_file *sf) 1254 * it should not be NULL as even if elevator was exiting, cgroup deletion

1469{ 1255 * path got to it first.
1470 struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 1256 */
1471 1257static void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
1472 blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq,
1473 cft->private, true);
1474 return 0;
1475}
1476
1477#ifdef CONFIG_DEBUG_BLK_CGROUP
1478static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
1479 struct blkg_policy_data *pd, int off)
1480{ 1258{
1481 struct cfq_group *cfqg = pd_to_cfqg(pd); 1259 unsigned long flags;
1482 u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples); 1260 struct cfq_data *cfqd = key;
1483 u64 v = 0;
1484 1261
1485 if (samples) { 1262 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1486 v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum); 1263 cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg));
1487 do_div(v, samples); 1264 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1488 }
1489 __blkg_prfill_u64(sf, pd, v);
1490 return 0;
1491} 1265}
1492 1266
1493/* print avg_queue_size */ 1267#else /* GROUP_IOSCHED */
1494static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft, 1268static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
1495 struct seq_file *sf)
1496{ 1269{
1497 struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 1270 return &cfqd->root_group;
1498
1499 blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size,
1500 &blkcg_policy_cfq, 0, false);
1501 return 0;
1502} 1271}
1503#endif /* CONFIG_DEBUG_BLK_CGROUP */
1504 1272
1505static struct cftype cfq_blkcg_files[] = { 1273static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1506 {
1507 .name = "weight_device",
1508 .read_seq_string = cfqg_print_weight_device,
1509 .write_string = cfqg_set_weight_device,
1510 .max_write_len = 256,
1511 },
1512 {
1513 .name = "weight",
1514 .read_seq_string = cfq_print_weight,
1515 .write_u64 = cfq_set_weight,
1516 },
1517 {
1518 .name = "time",
1519 .private = offsetof(struct cfq_group, stats.time),
1520 .read_seq_string = cfqg_print_stat,
1521 },
1522 {
1523 .name = "sectors",
1524 .private = offsetof(struct cfq_group, stats.sectors),
1525 .read_seq_string = cfqg_print_stat,
1526 },
1527 {
1528 .name = "io_service_bytes",
1529 .private = offsetof(struct cfq_group, stats.service_bytes),
1530 .read_seq_string = cfqg_print_rwstat,
1531 },
1532 {
1533 .name = "io_serviced",
1534 .private = offsetof(struct cfq_group, stats.serviced),
1535 .read_seq_string = cfqg_print_rwstat,
1536 },
1537 {
1538 .name = "io_service_time",
1539 .private = offsetof(struct cfq_group, stats.service_time),
1540 .read_seq_string = cfqg_print_rwstat,
1541 },
1542 {
1543 .name = "io_wait_time",
1544 .private = offsetof(struct cfq_group, stats.wait_time),
1545 .read_seq_string = cfqg_print_rwstat,
1546 },
1547 {
1548 .name = "io_merged",
1549 .private = offsetof(struct cfq_group, stats.merged),
1550 .read_seq_string = cfqg_print_rwstat,
1551 },
1552 {
1553 .name = "io_queued",
1554 .private = offsetof(struct cfq_group, stats.queued),
1555 .read_seq_string = cfqg_print_rwstat,
1556 },
1557#ifdef CONFIG_DEBUG_BLK_CGROUP
1558 {
1559 .name = "avg_queue_size",
1560 .read_seq_string = cfqg_print_avg_queue_size,
1561 },
1562 {
1563 .name = "group_wait_time",
1564 .private = offsetof(struct cfq_group, stats.group_wait_time),
1565 .read_seq_string = cfqg_print_stat,
1566 },
1567 {
1568 .name = "idle_time",
1569 .private = offsetof(struct cfq_group, stats.idle_time),
1570 .read_seq_string = cfqg_print_stat,
1571 },
1572 {
1573 .name = "empty_time",
1574 .private = offsetof(struct cfq_group, stats.empty_time),
1575 .read_seq_string = cfqg_print_stat,
1576 },
1577 {
1578 .name = "dequeue",
1579 .private = offsetof(struct cfq_group, stats.dequeue),
1580 .read_seq_string = cfqg_print_stat,
1581 },
1582 {
1583 .name = "unaccounted_time",
1584 .private = offsetof(struct cfq_group, stats.unaccounted_time),
1585 .read_seq_string = cfqg_print_stat,
1586 },
1587#endif /* CONFIG_DEBUG_BLK_CGROUP */
1588 { } /* terminate */
1589};
1590#else /* GROUP_IOSCHED */
1591static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
1592 struct blkcg *blkcg)
1593{ 1274{
1594 return cfqd->root_group; 1275 return cfqg;
1595} 1276}
1596 1277
1597static inline void 1278static inline void
@@ -1599,6 +1280,9 @@ cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
1599 cfqq->cfqg = cfqg; 1280 cfqq->cfqg = cfqg;
1600} 1281}
1601 1282
1283static void cfq_release_cfq_groups(struct cfq_data *cfqd) {}
1284static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
1285
1602#endif /* GROUP_IOSCHED */ 1286#endif /* GROUP_IOSCHED */
1603 1287
1604/* 1288/*
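The restored cfq_put_cfqg()/cfq_destroy_cfqg() pair above uses a plain integer reference count: a group starts at ref = 1 (the joint reference shared by the cgroup and the elevator, as the allocation comment explains), every cfqq linking to it takes another reference, and whoever drops the count to zero frees it. A small non-kernel sketch of that lifetime rule, single-threaded for simplicity (the real code relies on the queue lock for serialization):

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct group {
	int ref;
};

static struct group *group_alloc(void)
{
	struct group *g = calloc(1, sizeof(*g));

	if (g)
		g->ref = 1;     /* initial "joint" reference, dropped at teardown */
	return g;
}

static void group_get(struct group *g)
{
	g->ref++;
}

static void group_put(struct group *g)
{
	assert(g->ref > 0);
	if (--g->ref == 0) {
		printf("last reference dropped, freeing group\n");
		free(g);
	}
}

int main(void)
{
	struct group *g = group_alloc();

	group_get(g);           /* a queue links itself to the group */
	group_put(g);           /* the queue goes away */
	group_put(g);           /* teardown drops the initial reference */
	return 0;
}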
@@ -1865,17 +1549,19 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
1865{ 1549{
1866 elv_rb_del(&cfqq->sort_list, rq); 1550 elv_rb_del(&cfqq->sort_list, rq);
1867 cfqq->queued[rq_is_sync(rq)]--; 1551 cfqq->queued[rq_is_sync(rq)]--;
1868 cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); 1552 cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
1553 rq_data_dir(rq), rq_is_sync(rq));
1869 cfq_add_rq_rb(rq); 1554 cfq_add_rq_rb(rq);
1870 cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group, 1555 cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
1871 rq->cmd_flags); 1556 &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq),
1557 rq_is_sync(rq));
1872} 1558}
1873 1559
1874static struct request * 1560static struct request *
1875cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) 1561cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
1876{ 1562{
1877 struct task_struct *tsk = current; 1563 struct task_struct *tsk = current;
1878 struct cfq_io_cq *cic; 1564 struct cfq_io_context *cic;
1879 struct cfq_queue *cfqq; 1565 struct cfq_queue *cfqq;
1880 1566
1881 cic = cfq_cic_lookup(cfqd, tsk->io_context); 1567 cic = cfq_cic_lookup(cfqd, tsk->io_context);
@@ -1924,7 +1610,8 @@ static void cfq_remove_request(struct request *rq)
1924 cfq_del_rq_rb(rq); 1610 cfq_del_rq_rb(rq);
1925 1611
1926 cfqq->cfqd->rq_queued--; 1612 cfqq->cfqd->rq_queued--;
1927 cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); 1613 cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
1614 rq_data_dir(rq), rq_is_sync(rq));
1928 if (rq->cmd_flags & REQ_PRIO) { 1615 if (rq->cmd_flags & REQ_PRIO) {
1929 WARN_ON(!cfqq->prio_pending); 1616 WARN_ON(!cfqq->prio_pending);
1930 cfqq->prio_pending--; 1617 cfqq->prio_pending--;
@@ -1959,7 +1646,8 @@ static void cfq_merged_request(struct request_queue *q, struct request *req,
1959static void cfq_bio_merged(struct request_queue *q, struct request *req, 1646static void cfq_bio_merged(struct request_queue *q, struct request *req,
1960 struct bio *bio) 1647 struct bio *bio)
1961{ 1648{
1962 cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_rw); 1649 cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg,
1650 bio_data_dir(bio), cfq_bio_sync(bio));
1963} 1651}
1964 1652
1965static void 1653static void
@@ -1967,14 +1655,11 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
1967 struct request *next) 1655 struct request *next)
1968{ 1656{
1969 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1657 struct cfq_queue *cfqq = RQ_CFQQ(rq);
1970 struct cfq_data *cfqd = q->elevator->elevator_data;
1971
1972 /* 1658 /*
1973 * reposition in fifo if next is older than rq 1659 * reposition in fifo if next is older than rq
1974 */ 1660 */
1975 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && 1661 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
1976 time_before(rq_fifo_time(next), rq_fifo_time(rq)) && 1662 time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
1977 cfqq == RQ_CFQQ(next)) {
1978 list_move(&rq->queuelist, &next->queuelist); 1663 list_move(&rq->queuelist, &next->queuelist);
1979 rq_set_fifo_time(rq, rq_fifo_time(next)); 1664 rq_set_fifo_time(rq, rq_fifo_time(next));
1980 } 1665 }
@@ -1982,24 +1667,15 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
1982 if (cfqq->next_rq == next) 1667 if (cfqq->next_rq == next)
1983 cfqq->next_rq = rq; 1668 cfqq->next_rq = rq;
1984 cfq_remove_request(next); 1669 cfq_remove_request(next);
1985 cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags); 1670 cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg,
1986 1671 rq_data_dir(next), rq_is_sync(next));
1987 cfqq = RQ_CFQQ(next);
1988 /*
1989 * all requests of this queue are merged to other queues, delete it
1990 * from the service tree. If it's the active_queue,
1991 * cfq_dispatch_requests() will choose to expire it or do idle
1992 */
1993 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
1994 cfqq != cfqd->active_queue)
1995 cfq_del_cfqq_rr(cfqd, cfqq);
1996} 1672}
1997 1673
1998static int cfq_allow_merge(struct request_queue *q, struct request *rq, 1674static int cfq_allow_merge(struct request_queue *q, struct request *rq,
1999 struct bio *bio) 1675 struct bio *bio)
2000{ 1676{
2001 struct cfq_data *cfqd = q->elevator->elevator_data; 1677 struct cfq_data *cfqd = q->elevator->elevator_data;
2002 struct cfq_io_cq *cic; 1678 struct cfq_io_context *cic;
2003 struct cfq_queue *cfqq; 1679 struct cfq_queue *cfqq;
2004 1680
2005 /* 1681 /*
@@ -2009,7 +1685,7 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
2009 return false; 1685 return false;
2010 1686
2011 /* 1687 /*
2012 * Lookup the cfqq that this bio will be queued with and allow 1688 * Lookup the cfqq that this bio will be queued with. Allow
2013 * merge only if rq is queued there. 1689 * merge only if rq is queued there.
2014 */ 1690 */
2015 cic = cfq_cic_lookup(cfqd, current->io_context); 1691 cic = cfq_cic_lookup(cfqd, current->io_context);
@@ -2023,7 +1699,7 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
2023static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1699static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2024{ 1700{
2025 del_timer(&cfqd->idle_slice_timer); 1701 del_timer(&cfqd->idle_slice_timer);
2026 cfqg_stats_update_idle_time(cfqq->cfqg); 1702 cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
2027} 1703}
2028 1704
2029static void __cfq_set_active_queue(struct cfq_data *cfqd, 1705static void __cfq_set_active_queue(struct cfq_data *cfqd,
@@ -2032,7 +1708,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
2032 if (cfqq) { 1708 if (cfqq) {
2033 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", 1709 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
2034 cfqd->serving_prio, cfqd->serving_type); 1710 cfqd->serving_prio, cfqd->serving_type);
2035 cfqg_stats_update_avg_queue_size(cfqq->cfqg); 1711 cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
2036 cfqq->slice_start = 0; 1712 cfqq->slice_start = 0;
2037 cfqq->dispatch_start = jiffies; 1713 cfqq->dispatch_start = jiffies;
2038 cfqq->allocated_slice = 0; 1714 cfqq->allocated_slice = 0;
@@ -2098,7 +1774,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2098 cfqd->active_queue = NULL; 1774 cfqd->active_queue = NULL;
2099 1775
2100 if (cfqd->active_cic) { 1776 if (cfqd->active_cic) {
2101 put_io_context(cfqd->active_cic->icq.ioc); 1777 put_io_context(cfqd->active_cic->ioc);
2102 cfqd->active_cic = NULL; 1778 cfqd->active_cic = NULL;
2103 } 1779 }
2104} 1780}
@@ -2318,7 +1994,7 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2318static void cfq_arm_slice_timer(struct cfq_data *cfqd) 1994static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2319{ 1995{
2320 struct cfq_queue *cfqq = cfqd->active_queue; 1996 struct cfq_queue *cfqq = cfqd->active_queue;
2321 struct cfq_io_cq *cic; 1997 struct cfq_io_context *cic;
2322 unsigned long sl, group_idle = 0; 1998 unsigned long sl, group_idle = 0;
2323 1999
2324 /* 2000 /*
@@ -2353,7 +2029,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2353 * task has exited, don't wait 2029 * task has exited, don't wait
2354 */ 2030 */
2355 cic = cfqd->active_cic; 2031 cic = cfqd->active_cic;
2356 if (!cic || !atomic_read(&cic->icq.ioc->active_ref)) 2032 if (!cic || !atomic_read(&cic->ioc->nr_tasks))
2357 return; 2033 return;
2358 2034
2359 /* 2035 /*
@@ -2380,7 +2056,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2380 sl = cfqd->cfq_slice_idle; 2056 sl = cfqd->cfq_slice_idle;
2381 2057
2382 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 2058 mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
2383 cfqg_stats_set_start_idle_time(cfqq->cfqg); 2059 cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
2384 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl, 2060 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
2385 group_idle ? 1 : 0); 2061 group_idle ? 1 : 0);
2386} 2062}
@@ -2403,7 +2079,8 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
2403 2079
2404 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; 2080 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
2405 cfqq->nr_sectors += blk_rq_sectors(rq); 2081 cfqq->nr_sectors += blk_rq_sectors(rq);
2406 cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags); 2082 cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
2083 rq_data_dir(rq), rq_is_sync(rq));
2407} 2084}
2408 2085
2409/* 2086/*
@@ -2581,8 +2258,7 @@ new_workload:
 2581 * to have higher weight. A more accurate thing would be to 2258 * calculate system wide async/sync ratio.
 2582 * calculate system wide async/sync ratio. 2259 */
2583 */ 2260 */
2584 tmp = cfqd->cfq_target_latency * 2261 tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg);
2585 cfqg_busy_async_queues(cfqd, cfqg);
2586 tmp = tmp/cfqd->busy_queues; 2262 tmp = tmp/cfqd->busy_queues;
2587 slice = min_t(unsigned, slice, tmp); 2263 slice = min_t(unsigned, slice, tmp);
2588 2264
@@ -2904,9 +2580,9 @@ static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2904 cfq_dispatch_insert(cfqd->queue, rq); 2580 cfq_dispatch_insert(cfqd->queue, rq);
2905 2581
2906 if (!cfqd->active_cic) { 2582 if (!cfqd->active_cic) {
2907 struct cfq_io_cq *cic = RQ_CIC(rq); 2583 struct cfq_io_context *cic = RQ_CIC(rq);
2908 2584
2909 atomic_long_inc(&cic->icq.ioc->refcount); 2585 atomic_long_inc(&cic->ioc->refcount);
2910 cfqd->active_cic = cic; 2586 cfqd->active_cic = cic;
2911 } 2587 }
2912 2588
@@ -2986,7 +2662,85 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
2986 2662
2987 BUG_ON(cfq_cfqq_on_rr(cfqq)); 2663 BUG_ON(cfq_cfqq_on_rr(cfqq));
2988 kmem_cache_free(cfq_pool, cfqq); 2664 kmem_cache_free(cfq_pool, cfqq);
2989 cfqg_put(cfqg); 2665 cfq_put_cfqg(cfqg);
2666}
2667
2668/*
2669 * Call func for each cic attached to this ioc.
2670 */
2671static void
2672call_for_each_cic(struct io_context *ioc,
2673 void (*func)(struct io_context *, struct cfq_io_context *))
2674{
2675 struct cfq_io_context *cic;
2676 struct hlist_node *n;
2677
2678 rcu_read_lock();
2679
2680 hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
2681 func(ioc, cic);
2682
2683 rcu_read_unlock();
2684}
2685
2686static void cfq_cic_free_rcu(struct rcu_head *head)
2687{
2688 struct cfq_io_context *cic;
2689
2690 cic = container_of(head, struct cfq_io_context, rcu_head);
2691
2692 kmem_cache_free(cfq_ioc_pool, cic);
2693 elv_ioc_count_dec(cfq_ioc_count);
2694
2695 if (ioc_gone) {
2696 /*
2697 * CFQ scheduler is exiting, grab exit lock and check
2698 * the pending io context count. If it hits zero,
2699 * complete ioc_gone and set it back to NULL
2700 */
2701 spin_lock(&ioc_gone_lock);
2702 if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) {
2703 complete(ioc_gone);
2704 ioc_gone = NULL;
2705 }
2706 spin_unlock(&ioc_gone_lock);
2707 }
2708}
2709
2710static void cfq_cic_free(struct cfq_io_context *cic)
2711{
2712 call_rcu(&cic->rcu_head, cfq_cic_free_rcu);
2713}
2714
2715static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
2716{
2717 unsigned long flags;
2718 unsigned long dead_key = (unsigned long) cic->key;
2719
2720 BUG_ON(!(dead_key & CIC_DEAD_KEY));
2721
2722 spin_lock_irqsave(&ioc->lock, flags);
2723 radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT);
2724 hlist_del_rcu(&cic->cic_list);
2725 spin_unlock_irqrestore(&ioc->lock, flags);
2726
2727 cfq_cic_free(cic);
2728}
2729
2730/*
2731 * Must be called with rcu_read_lock() held or preemption otherwise disabled.
2732 * Only two callers of this - ->dtor() which is called with the rcu_read_lock(),
2733 * and ->trim() which is called with the task lock held
2734 */
2735static void cfq_free_io_context(struct io_context *ioc)
2736{
2737 /*
2738 * ioc->refcount is zero here, or we are called from elv_unregister(),
2739 * so no more cic's are allowed to be linked into this ioc. So it
2740 * should be ok to iterate over the known list, we will see all cic's
2741 * since no new ones are added.
2742 */
2743 call_for_each_cic(ioc, cic_free_func);
2990} 2744}
2991 2745
2992static void cfq_put_cooperator(struct cfq_queue *cfqq) 2746static void cfq_put_cooperator(struct cfq_queue *cfqq)
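cfq_cic_free_rcu() above implements a module-exit drain: a global count of outstanding io contexts plus an ioc_gone completion that the exiting scheduler waits on until the last deferred free has run. A simplified userspace analogue, with a mutex and condition variable standing in for the per-cpu counter, spinlock, and completion (all names invented for the sketch):

#include <pthread.h>
#include <stdio.h>

static int outstanding;                 /* plays the role of the io context count */
static int exiting;                     /* plays the role of a non-NULL ioc_gone */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;

static void ctx_alloc(void)
{
	pthread_mutex_lock(&lock);
	outstanding++;
	pthread_mutex_unlock(&lock);
}

/* Deferred free: drop the count and wake the exit path if it is waiting. */
static void ctx_free(void)
{
	pthread_mutex_lock(&lock);
	if (--outstanding == 0 && exiting)
		pthread_cond_signal(&drained);
	pthread_mutex_unlock(&lock);
}

/* Exit path: announce we are leaving, then wait for the count to drain. */
static void scheduler_exit(void)
{
	pthread_mutex_lock(&lock);
	exiting = 1;
	while (outstanding)
		pthread_cond_wait(&drained, &lock);
	pthread_mutex_unlock(&lock);
}

static void *late_free(void *arg)
{
	ctx_free();
	return arg;
}

int main(void)
{
	pthread_t t;

	ctx_alloc();
	pthread_create(&t, NULL, late_free, NULL);
	scheduler_exit();                  /* returns only once the context is gone */
	pthread_join(t, NULL);
	printf("all contexts drained\n");
	return 0;
}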
@@ -3022,17 +2776,27 @@ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3022 cfq_put_queue(cfqq); 2776 cfq_put_queue(cfqq);
3023} 2777}
3024 2778
3025static void cfq_init_icq(struct io_cq *icq) 2779static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
2780 struct cfq_io_context *cic)
3026{ 2781{
3027 struct cfq_io_cq *cic = icq_to_cic(icq); 2782 struct io_context *ioc = cic->ioc;
3028 2783
3029 cic->ttime.last_end_request = jiffies; 2784 list_del_init(&cic->queue_list);
3030}
3031 2785
3032static void cfq_exit_icq(struct io_cq *icq) 2786 /*
3033{ 2787 * Make sure dead mark is seen for dead queues
3034 struct cfq_io_cq *cic = icq_to_cic(icq); 2788 */
3035 struct cfq_data *cfqd = cic_to_cfqd(cic); 2789 smp_wmb();
2790 cic->key = cfqd_dead_key(cfqd);
2791
2792 rcu_read_lock();
2793 if (rcu_dereference(ioc->ioc_data) == cic) {
2794 rcu_read_unlock();
2795 spin_lock(&ioc->lock);
2796 rcu_assign_pointer(ioc->ioc_data, NULL);
2797 spin_unlock(&ioc->lock);
2798 } else
2799 rcu_read_unlock();
3036 2800
3037 if (cic->cfqq[BLK_RW_ASYNC]) { 2801 if (cic->cfqq[BLK_RW_ASYNC]) {
3038 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); 2802 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
@@ -3045,7 +2809,58 @@ static void cfq_exit_icq(struct io_cq *icq)
3045 } 2809 }
3046} 2810}
3047 2811
3048static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic) 2812static void cfq_exit_single_io_context(struct io_context *ioc,
2813 struct cfq_io_context *cic)
2814{
2815 struct cfq_data *cfqd = cic_to_cfqd(cic);
2816
2817 if (cfqd) {
2818 struct request_queue *q = cfqd->queue;
2819 unsigned long flags;
2820
2821 spin_lock_irqsave(q->queue_lock, flags);
2822
2823 /*
2824 * Ensure we get a fresh copy of the ->key to prevent
2825 * race between exiting task and queue
2826 */
2827 smp_read_barrier_depends();
2828 if (cic->key == cfqd)
2829 __cfq_exit_single_io_context(cfqd, cic);
2830
2831 spin_unlock_irqrestore(q->queue_lock, flags);
2832 }
2833}
2834
2835/*
 2836 * The process that ioc belongs to has exited; we need to clean up
 2837 * and put the internal structures we have that belong to that process.
2838 */
2839static void cfq_exit_io_context(struct io_context *ioc)
2840{
2841 call_for_each_cic(ioc, cfq_exit_single_io_context);
2842}
2843
2844static struct cfq_io_context *
2845cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
2846{
2847 struct cfq_io_context *cic;
2848
2849 cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO,
2850 cfqd->queue->node);
2851 if (cic) {
2852 cic->ttime.last_end_request = jiffies;
2853 INIT_LIST_HEAD(&cic->queue_list);
2854 INIT_HLIST_NODE(&cic->cic_list);
2855 cic->dtor = cfq_free_io_context;
2856 cic->exit = cfq_exit_io_context;
2857 elv_ioc_count_inc(cfq_ioc_count);
2858 }
2859
2860 return cic;
2861}
2862
2863static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
3049{ 2864{
3050 struct task_struct *tsk = current; 2865 struct task_struct *tsk = current;
3051 int ioprio_class; 2866 int ioprio_class;
@@ -3053,7 +2868,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
3053 if (!cfq_cfqq_prio_changed(cfqq)) 2868 if (!cfq_cfqq_prio_changed(cfqq))
3054 return; 2869 return;
3055 2870
3056 ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); 2871 ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
3057 switch (ioprio_class) { 2872 switch (ioprio_class) {
3058 default: 2873 default:
3059 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); 2874 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
@@ -3065,11 +2880,11 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
3065 cfqq->ioprio_class = task_nice_ioclass(tsk); 2880 cfqq->ioprio_class = task_nice_ioclass(tsk);
3066 break; 2881 break;
3067 case IOPRIO_CLASS_RT: 2882 case IOPRIO_CLASS_RT:
3068 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); 2883 cfqq->ioprio = task_ioprio(ioc);
3069 cfqq->ioprio_class = IOPRIO_CLASS_RT; 2884 cfqq->ioprio_class = IOPRIO_CLASS_RT;
3070 break; 2885 break;
3071 case IOPRIO_CLASS_BE: 2886 case IOPRIO_CLASS_BE:
3072 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); 2887 cfqq->ioprio = task_ioprio(ioc);
3073 cfqq->ioprio_class = IOPRIO_CLASS_BE; 2888 cfqq->ioprio_class = IOPRIO_CLASS_BE;
3074 break; 2889 break;
3075 case IOPRIO_CLASS_IDLE: 2890 case IOPRIO_CLASS_IDLE:
@@ -3087,24 +2902,22 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
3087 cfq_clear_cfqq_prio_changed(cfqq); 2902 cfq_clear_cfqq_prio_changed(cfqq);
3088} 2903}
3089 2904
3090static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) 2905static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
3091{ 2906{
3092 int ioprio = cic->icq.ioc->ioprio;
3093 struct cfq_data *cfqd = cic_to_cfqd(cic); 2907 struct cfq_data *cfqd = cic_to_cfqd(cic);
3094 struct cfq_queue *cfqq; 2908 struct cfq_queue *cfqq;
2909 unsigned long flags;
3095 2910
3096 /* 2911 if (unlikely(!cfqd))
3097 * Check whether ioprio has changed. The condition may trigger
3098 * spuriously on a newly created cic but there's no harm.
3099 */
3100 if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
3101 return; 2912 return;
3102 2913
2914 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2915
3103 cfqq = cic->cfqq[BLK_RW_ASYNC]; 2916 cfqq = cic->cfqq[BLK_RW_ASYNC];
3104 if (cfqq) { 2917 if (cfqq) {
3105 struct cfq_queue *new_cfqq; 2918 struct cfq_queue *new_cfqq;
3106 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, 2919 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc,
3107 GFP_ATOMIC); 2920 GFP_ATOMIC);
3108 if (new_cfqq) { 2921 if (new_cfqq) {
3109 cic->cfqq[BLK_RW_ASYNC] = new_cfqq; 2922 cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
3110 cfq_put_queue(cfqq); 2923 cfq_put_queue(cfqq);
@@ -3115,7 +2928,13 @@ static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
3115 if (cfqq) 2928 if (cfqq)
3116 cfq_mark_cfqq_prio_changed(cfqq); 2929 cfq_mark_cfqq_prio_changed(cfqq);
3117 2930
3118 cic->ioprio = ioprio; 2931 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2932}
2933
2934static void cfq_ioc_set_ioprio(struct io_context *ioc)
2935{
2936 call_for_each_cic(ioc, changed_ioprio);
2937 ioc->ioprio_changed = 0;
3119} 2938}
3120 2939
3121static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, 2940static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
@@ -3139,24 +2958,20 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3139} 2958}
3140 2959
3141#ifdef CONFIG_CFQ_GROUP_IOSCHED 2960#ifdef CONFIG_CFQ_GROUP_IOSCHED
3142static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) 2961static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
3143{ 2962{
2963 struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
3144 struct cfq_data *cfqd = cic_to_cfqd(cic); 2964 struct cfq_data *cfqd = cic_to_cfqd(cic);
3145 struct cfq_queue *sync_cfqq; 2965 unsigned long flags;
3146 uint64_t id; 2966 struct request_queue *q;
3147
3148 rcu_read_lock();
3149 id = bio_blkcg(bio)->id;
3150 rcu_read_unlock();
3151 2967
3152 /* 2968 if (unlikely(!cfqd))
3153 * Check whether blkcg has changed. The condition may trigger
3154 * spuriously on a newly created cic but there's no harm.
3155 */
3156 if (unlikely(!cfqd) || likely(cic->blkcg_id == id))
3157 return; 2969 return;
3158 2970
3159 sync_cfqq = cic_to_cfqq(cic, 1); 2971 q = cfqd->queue;
2972
2973 spin_lock_irqsave(q->queue_lock, flags);
2974
3160 if (sync_cfqq) { 2975 if (sync_cfqq) {
3161 /* 2976 /*
3162 * Drop reference to sync queue. A new sync queue will be 2977 * Drop reference to sync queue. A new sync queue will be
@@ -3167,25 +2982,28 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3167 cfq_put_queue(sync_cfqq); 2982 cfq_put_queue(sync_cfqq);
3168 } 2983 }
3169 2984
3170 cic->blkcg_id = id; 2985 spin_unlock_irqrestore(q->queue_lock, flags);
2986}
2987
2988static void cfq_ioc_set_cgroup(struct io_context *ioc)
2989{
2990 call_for_each_cic(ioc, changed_cgroup);
2991 ioc->cgroup_changed = 0;
3171} 2992}
3172#else
3173static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { }
3174#endif /* CONFIG_CFQ_GROUP_IOSCHED */ 2993#endif /* CONFIG_CFQ_GROUP_IOSCHED */
3175 2994
3176static struct cfq_queue * 2995static struct cfq_queue *
3177cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, 2996cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
3178 struct bio *bio, gfp_t gfp_mask) 2997 struct io_context *ioc, gfp_t gfp_mask)
3179{ 2998{
3180 struct blkcg *blkcg;
3181 struct cfq_queue *cfqq, *new_cfqq = NULL; 2999 struct cfq_queue *cfqq, *new_cfqq = NULL;
3000 struct cfq_io_context *cic;
3182 struct cfq_group *cfqg; 3001 struct cfq_group *cfqg;
3183 3002
3184retry: 3003retry:
3185 rcu_read_lock(); 3004 cfqg = cfq_get_cfqg(cfqd);
3186 3005 cic = cfq_cic_lookup(cfqd, ioc);
3187 blkcg = bio_blkcg(bio); 3006 /* cic always exists here */
3188 cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);
3189 cfqq = cic_to_cfqq(cic, is_sync); 3007 cfqq = cic_to_cfqq(cic, is_sync);
3190 3008
3191 /* 3009 /*
@@ -3198,7 +3016,6 @@ retry:
3198 cfqq = new_cfqq; 3016 cfqq = new_cfqq;
3199 new_cfqq = NULL; 3017 new_cfqq = NULL;
3200 } else if (gfp_mask & __GFP_WAIT) { 3018 } else if (gfp_mask & __GFP_WAIT) {
3201 rcu_read_unlock();
3202 spin_unlock_irq(cfqd->queue->queue_lock); 3019 spin_unlock_irq(cfqd->queue->queue_lock);
3203 new_cfqq = kmem_cache_alloc_node(cfq_pool, 3020 new_cfqq = kmem_cache_alloc_node(cfq_pool,
3204 gfp_mask | __GFP_ZERO, 3021 gfp_mask | __GFP_ZERO,
@@ -3214,7 +3031,7 @@ retry:
3214 3031
3215 if (cfqq) { 3032 if (cfqq) {
3216 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); 3033 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
3217 cfq_init_prio_data(cfqq, cic); 3034 cfq_init_prio_data(cfqq, ioc);
3218 cfq_link_cfqq_cfqg(cfqq, cfqg); 3035 cfq_link_cfqq_cfqg(cfqq, cfqg);
3219 cfq_log_cfqq(cfqd, cfqq, "alloced"); 3036 cfq_log_cfqq(cfqd, cfqq, "alloced");
3220 } else 3037 } else
@@ -3224,7 +3041,6 @@ retry:
3224 if (new_cfqq) 3041 if (new_cfqq)
3225 kmem_cache_free(cfq_pool, new_cfqq); 3042 kmem_cache_free(cfq_pool, new_cfqq);
3226 3043
3227 rcu_read_unlock();
3228 return cfqq; 3044 return cfqq;
3229} 3045}
3230 3046
@@ -3234,9 +3050,6 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
3234 switch (ioprio_class) { 3050 switch (ioprio_class) {
3235 case IOPRIO_CLASS_RT: 3051 case IOPRIO_CLASS_RT:
3236 return &cfqd->async_cfqq[0][ioprio]; 3052 return &cfqd->async_cfqq[0][ioprio];
3237 case IOPRIO_CLASS_NONE:
3238 ioprio = IOPRIO_NORM;
3239 /* fall through */
3240 case IOPRIO_CLASS_BE: 3053 case IOPRIO_CLASS_BE:
3241 return &cfqd->async_cfqq[1][ioprio]; 3054 return &cfqd->async_cfqq[1][ioprio];
3242 case IOPRIO_CLASS_IDLE: 3055 case IOPRIO_CLASS_IDLE:
@@ -3247,11 +3060,11 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
3247} 3060}
3248 3061
3249static struct cfq_queue * 3062static struct cfq_queue *
3250cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, 3063cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
3251 struct bio *bio, gfp_t gfp_mask) 3064 gfp_t gfp_mask)
3252{ 3065{
3253 const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); 3066 const int ioprio = task_ioprio(ioc);
3254 const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); 3067 const int ioprio_class = task_ioprio_class(ioc);
3255 struct cfq_queue **async_cfqq = NULL; 3068 struct cfq_queue **async_cfqq = NULL;
3256 struct cfq_queue *cfqq = NULL; 3069 struct cfq_queue *cfqq = NULL;
3257 3070
@@ -3261,7 +3074,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
3261 } 3074 }
3262 3075
3263 if (!cfqq) 3076 if (!cfqq)
3264 cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask); 3077 cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask);
3265 3078
3266 /* 3079 /*
3267 * pin the queue now that it's allocated, scheduler exit will prune it 3080 * pin the queue now that it's allocated, scheduler exit will prune it
@@ -3275,6 +3088,160 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
3275 return cfqq; 3088 return cfqq;
3276} 3089}
3277 3090
3091/*
3092 * We drop cfq io contexts lazily, so we may find a dead one.
3093 */
3094static void
3095cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
3096 struct cfq_io_context *cic)
3097{
3098 unsigned long flags;
3099
3100 WARN_ON(!list_empty(&cic->queue_list));
3101 BUG_ON(cic->key != cfqd_dead_key(cfqd));
3102
3103 spin_lock_irqsave(&ioc->lock, flags);
3104
3105 BUG_ON(rcu_dereference_check(ioc->ioc_data,
3106 lockdep_is_held(&ioc->lock)) == cic);
3107
3108 radix_tree_delete(&ioc->radix_root, cfqd->cic_index);
3109 hlist_del_rcu(&cic->cic_list);
3110 spin_unlock_irqrestore(&ioc->lock, flags);
3111
3112 cfq_cic_free(cic);
3113}
3114
3115static struct cfq_io_context *
3116cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
3117{
3118 struct cfq_io_context *cic;
3119 unsigned long flags;
3120
3121 if (unlikely(!ioc))
3122 return NULL;
3123
3124 rcu_read_lock();
3125
3126 /*
3127 * we maintain a last-hit cache, to avoid browsing over the tree
3128 */
3129 cic = rcu_dereference(ioc->ioc_data);
3130 if (cic && cic->key == cfqd) {
3131 rcu_read_unlock();
3132 return cic;
3133 }
3134
3135 do {
3136 cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index);
3137 rcu_read_unlock();
3138 if (!cic)
3139 break;
3140 if (unlikely(cic->key != cfqd)) {
3141 cfq_drop_dead_cic(cfqd, ioc, cic);
3142 rcu_read_lock();
3143 continue;
3144 }
3145
3146 spin_lock_irqsave(&ioc->lock, flags);
3147 rcu_assign_pointer(ioc->ioc_data, cic);
3148 spin_unlock_irqrestore(&ioc->lock, flags);
3149 break;
3150 } while (1);
3151
3152 return cic;
3153}
3154
3155/*
3156 * Add cic into ioc, using cfqd as the search key. This enables us to lookup
3157 * the process specific cfq io context when entered from the block layer.
3158 * Also adds the cic to a per-cfqd list, used when this queue is removed.
3159 */
3160static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
3161 struct cfq_io_context *cic, gfp_t gfp_mask)
3162{
3163 unsigned long flags;
3164 int ret;
3165
3166 ret = radix_tree_preload(gfp_mask);
3167 if (!ret) {
3168 cic->ioc = ioc;
3169 cic->key = cfqd;
3170
3171 spin_lock_irqsave(&ioc->lock, flags);
3172 ret = radix_tree_insert(&ioc->radix_root,
3173 cfqd->cic_index, cic);
3174 if (!ret)
3175 hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
3176 spin_unlock_irqrestore(&ioc->lock, flags);
3177
3178 radix_tree_preload_end();
3179
3180 if (!ret) {
3181 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
3182 list_add(&cic->queue_list, &cfqd->cic_list);
3183 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
3184 }
3185 }
3186
3187 if (ret && ret != -EEXIST)
3188 printk(KERN_ERR "cfq: cic link failed!\n");
3189
3190 return ret;
3191}
3192
3193/*
3194 * Setup general io context and cfq io context. There can be several cfq
3195 * io contexts per general io context, if this process is doing io to more
3196 * than one device managed by cfq.
3197 */
3198static struct cfq_io_context *
3199cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
3200{
3201 struct io_context *ioc = NULL;
3202 struct cfq_io_context *cic;
3203 int ret;
3204
3205 might_sleep_if(gfp_mask & __GFP_WAIT);
3206
3207 ioc = get_io_context(gfp_mask, cfqd->queue->node);
3208 if (!ioc)
3209 return NULL;
3210
3211retry:
3212 cic = cfq_cic_lookup(cfqd, ioc);
3213 if (cic)
3214 goto out;
3215
3216 cic = cfq_alloc_io_context(cfqd, gfp_mask);
3217 if (cic == NULL)
3218 goto err;
3219
3220 ret = cfq_cic_link(cfqd, ioc, cic, gfp_mask);
3221 if (ret == -EEXIST) {
3222 /* someone has linked cic to ioc already */
3223 cfq_cic_free(cic);
3224 goto retry;
3225 } else if (ret)
3226 goto err_free;
3227
3228out:
3229 smp_read_barrier_depends();
3230 if (unlikely(ioc->ioprio_changed))
3231 cfq_ioc_set_ioprio(ioc);
3232
3233#ifdef CONFIG_CFQ_GROUP_IOSCHED
3234 if (unlikely(ioc->cgroup_changed))
3235 cfq_ioc_set_cgroup(ioc);
3236#endif
3237 return cic;
3238err_free:
3239 cfq_cic_free(cic);
3240err:
3241 put_io_context(ioc);
3242 return NULL;
3243}
3244
3278static void 3245static void
3279__cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle) 3246__cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
3280{ 3247{
@@ -3288,7 +3255,7 @@ __cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
3288 3255
3289static void 3256static void
3290cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3257cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3291 struct cfq_io_cq *cic) 3258 struct cfq_io_context *cic)
3292{ 3259{
3293 if (cfq_cfqq_sync(cfqq)) { 3260 if (cfq_cfqq_sync(cfqq)) {
3294 __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle); 3261 __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
@@ -3326,7 +3293,7 @@ cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3326 */ 3293 */
3327static void 3294static void
3328cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3295cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3329 struct cfq_io_cq *cic) 3296 struct cfq_io_context *cic)
3330{ 3297{
3331 int old_idle, enable_idle; 3298 int old_idle, enable_idle;
3332 3299
@@ -3343,9 +3310,8 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3343 3310
3344 if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) 3311 if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
3345 enable_idle = 0; 3312 enable_idle = 0;
3346 else if (!atomic_read(&cic->icq.ioc->active_ref) || 3313 else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
3347 !cfqd->cfq_slice_idle || 3314 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3348 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3349 enable_idle = 0; 3315 enable_idle = 0;
3350 else if (sample_valid(cic->ttime.ttime_samples)) { 3316 else if (sample_valid(cic->ttime.ttime_samples)) {
3351 if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle) 3317 if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
@@ -3445,7 +3411,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3445 */ 3411 */
3446static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3412static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3447{ 3413{
3448 enum wl_type_t old_type = cfqq_type(cfqd->active_queue); 3414 struct cfq_queue *old_cfqq = cfqd->active_queue;
3449 3415
3450 cfq_log_cfqq(cfqd, cfqq, "preempt"); 3416 cfq_log_cfqq(cfqd, cfqq, "preempt");
3451 cfq_slice_expired(cfqd, 1); 3417 cfq_slice_expired(cfqd, 1);
@@ -3454,7 +3420,7 @@ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3454 * workload type is changed, don't save slice, otherwise preempt 3420 * workload type is changed, don't save slice, otherwise preempt
3455 * doesn't happen 3421 * doesn't happen
3456 */ 3422 */
3457 if (old_type != cfqq_type(cfqq)) 3423 if (cfqq_type(old_cfqq) != cfqq_type(cfqq))
3458 cfqq->cfqg->saved_workload_slice = 0; 3424 cfqq->cfqg->saved_workload_slice = 0;
3459 3425
3460 /* 3426 /*
@@ -3477,7 +3443,7 @@ static void
3477cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3443cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3478 struct request *rq) 3444 struct request *rq)
3479{ 3445{
3480 struct cfq_io_cq *cic = RQ_CIC(rq); 3446 struct cfq_io_context *cic = RQ_CIC(rq);
3481 3447
3482 cfqd->rq_queued++; 3448 cfqd->rq_queued++;
3483 if (rq->cmd_flags & REQ_PRIO) 3449 if (rq->cmd_flags & REQ_PRIO)
@@ -3507,7 +3473,8 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3507 cfq_clear_cfqq_wait_request(cfqq); 3473 cfq_clear_cfqq_wait_request(cfqq);
3508 __blk_run_queue(cfqd->queue); 3474 __blk_run_queue(cfqd->queue);
3509 } else { 3475 } else {
3510 cfqg_stats_update_idle_time(cfqq->cfqg); 3476 cfq_blkiocg_update_idle_time_stats(
3477 &cfqq->cfqg->blkg);
3511 cfq_mark_cfqq_must_dispatch(cfqq); 3478 cfq_mark_cfqq_must_dispatch(cfqq);
3512 } 3479 }
3513 } 3480 }
@@ -3529,13 +3496,14 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
3529 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3496 struct cfq_queue *cfqq = RQ_CFQQ(rq);
3530 3497
3531 cfq_log_cfqq(cfqd, cfqq, "insert_request"); 3498 cfq_log_cfqq(cfqd, cfqq, "insert_request");
3532 cfq_init_prio_data(cfqq, RQ_CIC(rq)); 3499 cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
3533 3500
3534 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); 3501 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
3535 list_add_tail(&rq->queuelist, &cfqq->fifo); 3502 list_add_tail(&rq->queuelist, &cfqq->fifo);
3536 cfq_add_rq_rb(rq); 3503 cfq_add_rq_rb(rq);
3537 cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, 3504 cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
3538 rq->cmd_flags); 3505 &cfqd->serving_group->blkg, rq_data_dir(rq),
3506 rq_is_sync(rq));
3539 cfq_rq_enqueued(cfqd, cfqq, rq); 3507 cfq_rq_enqueued(cfqd, cfqq, rq);
3540} 3508}
3541 3509
@@ -3578,7 +3546,7 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd)
3578 3546
3579static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3547static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3580{ 3548{
3581 struct cfq_io_cq *cic = cfqd->active_cic; 3549 struct cfq_io_context *cic = cfqd->active_cic;
3582 3550
3583 /* If the queue already has requests, don't wait */ 3551 /* If the queue already has requests, don't wait */
3584 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 3552 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
@@ -3631,8 +3599,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
3631 cfqd->rq_in_driver--; 3599 cfqd->rq_in_driver--;
3632 cfqq->dispatched--; 3600 cfqq->dispatched--;
3633 (RQ_CFQG(rq))->dispatched--; 3601 (RQ_CFQG(rq))->dispatched--;
3634 cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq), 3602 cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg,
3635 rq_io_start_time_ns(rq), rq->cmd_flags); 3603 rq_start_time_ns(rq), rq_io_start_time_ns(rq),
3604 rq_data_dir(rq), rq_is_sync(rq));
3636 3605
3637 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; 3606 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3638 3607
@@ -3714,7 +3683,7 @@ static int cfq_may_queue(struct request_queue *q, int rw)
3714{ 3683{
3715 struct cfq_data *cfqd = q->elevator->elevator_data; 3684 struct cfq_data *cfqd = q->elevator->elevator_data;
3716 struct task_struct *tsk = current; 3685 struct task_struct *tsk = current;
3717 struct cfq_io_cq *cic; 3686 struct cfq_io_context *cic;
3718 struct cfq_queue *cfqq; 3687 struct cfq_queue *cfqq;
3719 3688
3720 /* 3689 /*
@@ -3729,7 +3698,7 @@ static int cfq_may_queue(struct request_queue *q, int rw)
3729 3698
3730 cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); 3699 cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
3731 if (cfqq) { 3700 if (cfqq) {
3732 cfq_init_prio_data(cfqq, cic); 3701 cfq_init_prio_data(cfqq, cic->ioc);
3733 3702
3734 return __cfq_may_queue(cfqq); 3703 return __cfq_may_queue(cfqq);
3735 } 3704 }
@@ -3750,17 +3719,21 @@ static void cfq_put_request(struct request *rq)
3750 BUG_ON(!cfqq->allocated[rw]); 3719 BUG_ON(!cfqq->allocated[rw]);
3751 cfqq->allocated[rw]--; 3720 cfqq->allocated[rw]--;
3752 3721
3722 put_io_context(RQ_CIC(rq)->ioc);
3723
3724 rq->elevator_private[0] = NULL;
3725 rq->elevator_private[1] = NULL;
3726
3753 /* Put down rq reference on cfqg */ 3727 /* Put down rq reference on cfqg */
3754 cfqg_put(RQ_CFQG(rq)); 3728 cfq_put_cfqg(RQ_CFQG(rq));
3755 rq->elv.priv[0] = NULL; 3729 rq->elevator_private[2] = NULL;
3756 rq->elv.priv[1] = NULL;
3757 3730
3758 cfq_put_queue(cfqq); 3731 cfq_put_queue(cfqq);
3759 } 3732 }
3760} 3733}
3761 3734
3762static struct cfq_queue * 3735static struct cfq_queue *
3763cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic, 3736cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
3764 struct cfq_queue *cfqq) 3737 struct cfq_queue *cfqq)
3765{ 3738{
3766 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); 3739 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
@@ -3775,7 +3748,7 @@ cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic,
3775 * was the last process referring to said cfqq. 3748 * was the last process referring to said cfqq.
3776 */ 3749 */
3777static struct cfq_queue * 3750static struct cfq_queue *
3778split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq) 3751split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
3779{ 3752{
3780 if (cfqq_process_refs(cfqq) == 1) { 3753 if (cfqq_process_refs(cfqq) == 1) {
3781 cfqq->pid = current->pid; 3754 cfqq->pid = current->pid;
@@ -3795,25 +3768,28 @@ split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
3795 * Allocate cfq data structures associated with this request. 3768 * Allocate cfq data structures associated with this request.
3796 */ 3769 */
3797static int 3770static int
3798cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, 3771cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
3799 gfp_t gfp_mask)
3800{ 3772{
3801 struct cfq_data *cfqd = q->elevator->elevator_data; 3773 struct cfq_data *cfqd = q->elevator->elevator_data;
3802 struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq); 3774 struct cfq_io_context *cic;
3803 const int rw = rq_data_dir(rq); 3775 const int rw = rq_data_dir(rq);
3804 const bool is_sync = rq_is_sync(rq); 3776 const bool is_sync = rq_is_sync(rq);
3805 struct cfq_queue *cfqq; 3777 struct cfq_queue *cfqq;
3778 unsigned long flags;
3806 3779
3807 might_sleep_if(gfp_mask & __GFP_WAIT); 3780 might_sleep_if(gfp_mask & __GFP_WAIT);
3808 3781
3809 spin_lock_irq(q->queue_lock); 3782 cic = cfq_get_io_context(cfqd, gfp_mask);
3783
3784 spin_lock_irqsave(q->queue_lock, flags);
3785
3786 if (!cic)
3787 goto queue_fail;
3810 3788
3811 check_ioprio_changed(cic, bio);
3812 check_blkcg_changed(cic, bio);
3813new_queue: 3789new_queue:
3814 cfqq = cic_to_cfqq(cic, is_sync); 3790 cfqq = cic_to_cfqq(cic, is_sync);
3815 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 3791 if (!cfqq || cfqq == &cfqd->oom_cfqq) {
3816 cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask); 3792 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
3817 cic_set_cfqq(cic, cfqq, is_sync); 3793 cic_set_cfqq(cic, cfqq, is_sync);
3818 } else { 3794 } else {
3819 /* 3795 /*
@@ -3839,11 +3815,17 @@ new_queue:
3839 cfqq->allocated[rw]++; 3815 cfqq->allocated[rw]++;
3840 3816
3841 cfqq->ref++; 3817 cfqq->ref++;
3842 cfqg_get(cfqq->cfqg); 3818 rq->elevator_private[0] = cic;
3843 rq->elv.priv[0] = cfqq; 3819 rq->elevator_private[1] = cfqq;
3844 rq->elv.priv[1] = cfqq->cfqg; 3820 rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg);
3845 spin_unlock_irq(q->queue_lock); 3821 spin_unlock_irqrestore(q->queue_lock, flags);
3846 return 0; 3822 return 0;
3823
3824queue_fail:
3825 cfq_schedule_dispatch(cfqd);
3826 spin_unlock_irqrestore(q->queue_lock, flags);
3827 cfq_log(cfqd, "set_request fail");
3828 return 1;
3847} 3829}
3848 3830
3849static void cfq_kick_queue(struct work_struct *work) 3831static void cfq_kick_queue(struct work_struct *work)
@@ -3938,6 +3920,7 @@ static void cfq_exit_queue(struct elevator_queue *e)
3938{ 3920{
3939 struct cfq_data *cfqd = e->elevator_data; 3921 struct cfq_data *cfqd = e->elevator_data;
3940 struct request_queue *q = cfqd->queue; 3922 struct request_queue *q = cfqd->queue;
3923 bool wait = false;
3941 3924
3942 cfq_shutdown_timer_wq(cfqd); 3925 cfq_shutdown_timer_wq(cfqd);
3943 3926
@@ -3946,54 +3929,139 @@ static void cfq_exit_queue(struct elevator_queue *e)
3946 if (cfqd->active_queue) 3929 if (cfqd->active_queue)
3947 __cfq_slice_expired(cfqd, cfqd->active_queue, 0); 3930 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
3948 3931
3932 while (!list_empty(&cfqd->cic_list)) {
3933 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
3934 struct cfq_io_context,
3935 queue_list);
3936
3937 __cfq_exit_single_io_context(cfqd, cic);
3938 }
3939
3949 cfq_put_async_queues(cfqd); 3940 cfq_put_async_queues(cfqd);
3941 cfq_release_cfq_groups(cfqd);
3942
3943 /*
3944 * If there are groups which we could not unlink from blkcg list,
3945 * wait for a rcu period for them to be freed.
3946 */
3947 if (cfqd->nr_blkcg_linked_grps)
3948 wait = true;
3950 3949
3951 spin_unlock_irq(q->queue_lock); 3950 spin_unlock_irq(q->queue_lock);
3952 3951
3953 cfq_shutdown_timer_wq(cfqd); 3952 cfq_shutdown_timer_wq(cfqd);
3954 3953
3954 spin_lock(&cic_index_lock);
3955 ida_remove(&cic_index_ida, cfqd->cic_index);
3956 spin_unlock(&cic_index_lock);
3957
3958 /*
3959 * Wait for cfqg->blkg->key accessors to exit their grace periods.
3960 * Do this wait only if there are other unlinked groups out
3961 * there. This can happen if cgroup deletion path claimed the
3962 * responsibility of cleaning up a group before queue cleanup code
3963 * get to the group.
3964 *
3965 * Do not call synchronize_rcu() unconditionally as there are drivers
3966 * which create/delete request queue hundreds of times during scan/boot
3967 * and synchronize_rcu() can take significant time and slow down boot.
3968 */
3969 if (wait)
3970 synchronize_rcu();
3971
3955#ifdef CONFIG_CFQ_GROUP_IOSCHED 3972#ifdef CONFIG_CFQ_GROUP_IOSCHED
3956 blkcg_deactivate_policy(q, &blkcg_policy_cfq); 3973 /* Free up per cpu stats for root group */
3957#else 3974 free_percpu(cfqd->root_group.blkg.stats_cpu);
3958 kfree(cfqd->root_group);
3959#endif 3975#endif
3960 kfree(cfqd); 3976 kfree(cfqd);
3961} 3977}
3962 3978
3963static int cfq_init_queue(struct request_queue *q) 3979static int cfq_alloc_cic_index(void)
3980{
3981 int index, error;
3982
3983 do {
3984 if (!ida_pre_get(&cic_index_ida, GFP_KERNEL))
3985 return -ENOMEM;
3986
3987 spin_lock(&cic_index_lock);
3988 error = ida_get_new(&cic_index_ida, &index);
3989 spin_unlock(&cic_index_lock);
3990 if (error && error != -EAGAIN)
3991 return error;
3992 } while (error);
3993
3994 return index;
3995}
3996
3997static void *cfq_init_queue(struct request_queue *q)
3964{ 3998{
3965 struct cfq_data *cfqd; 3999 struct cfq_data *cfqd;
3966 struct blkcg_gq *blkg __maybe_unused; 4000 int i, j;
3967 int i, ret; 4001 struct cfq_group *cfqg;
4002 struct cfq_rb_root *st;
4003
4004 i = cfq_alloc_cic_index();
4005 if (i < 0)
4006 return NULL;
3968 4007
3969 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); 4008 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
3970 if (!cfqd) 4009 if (!cfqd) {
3971 return -ENOMEM; 4010 spin_lock(&cic_index_lock);
4011 ida_remove(&cic_index_ida, i);
4012 spin_unlock(&cic_index_lock);
4013 return NULL;
4014 }
3972 4015
3973 cfqd->queue = q; 4016 /*
3974 q->elevator->elevator_data = cfqd; 4017 * Don't need take queue_lock in the routine, since we are
4018 * initializing the ioscheduler, and nobody is using cfqd
4019 */
4020 cfqd->cic_index = i;
3975 4021
3976 /* Init root service tree */ 4022 /* Init root service tree */
3977 cfqd->grp_service_tree = CFQ_RB_ROOT; 4023 cfqd->grp_service_tree = CFQ_RB_ROOT;
3978 4024
3979 /* Init root group and prefer root group over other groups by default */ 4025 /* Init root group */
4026 cfqg = &cfqd->root_group;
4027 for_each_cfqg_st(cfqg, i, j, st)
4028 *st = CFQ_RB_ROOT;
4029 RB_CLEAR_NODE(&cfqg->rb_node);
4030
4031 /* Give preference to root group over other groups */
4032 cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
4033
3980#ifdef CONFIG_CFQ_GROUP_IOSCHED 4034#ifdef CONFIG_CFQ_GROUP_IOSCHED
3981 ret = blkcg_activate_policy(q, &blkcg_policy_cfq); 4035 /*
3982 if (ret) 4036 * Set root group reference to 2. One reference will be dropped when
3983 goto out_free; 4037 * all groups on cfqd->cfqg_list are being deleted during queue exit.
4038 * Other reference will remain there as we don't want to delete this
4039 * group as it is statically allocated and gets destroyed when
4040 * throtl_data goes away.
4041 */
4042 cfqg->ref = 2;
3984 4043
3985 cfqd->root_group = blkg_to_cfqg(q->root_blkg); 4044 if (blkio_alloc_blkg_stats(&cfqg->blkg)) {
3986#else 4045 kfree(cfqg);
3987 ret = -ENOMEM;
3988 cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),
3989 GFP_KERNEL, cfqd->queue->node);
3990 if (!cfqd->root_group)
3991 goto out_free;
3992 4046
3993 cfq_init_cfqg_base(cfqd->root_group); 4047 spin_lock(&cic_index_lock);
3994#endif 4048 ida_remove(&cic_index_ida, cfqd->cic_index);
3995 cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT; 4049 spin_unlock(&cic_index_lock);
4050
4051 kfree(cfqd);
4052 return NULL;
4053 }
4054
4055 rcu_read_lock();
4056
4057 cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
4058 (void *)cfqd, 0);
4059 rcu_read_unlock();
4060 cfqd->nr_blkcg_linked_grps++;
3996 4061
4062 /* Add group on cfqd->cfqg_list */
4063 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
4064#endif
3997 /* 4065 /*
3998 * Not strictly needed (since RB_ROOT just clears the node and we 4066 * Not strictly needed (since RB_ROOT just clears the node and we
3999 * zeroed cfqd on alloc), but better be safe in case someone decides 4067 * zeroed cfqd on alloc), but better be safe in case someone decides
@@ -4005,17 +4073,15 @@ static int cfq_init_queue(struct request_queue *q)
4005 /* 4073 /*
4006 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. 4074 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
4007 * Grab a permanent reference to it, so that the normal code flow 4075 * Grab a permanent reference to it, so that the normal code flow
4008 * will not attempt to free it. oom_cfqq is linked to root_group 4076 * will not attempt to free it.
4009 * but shouldn't hold a reference as it'll never be unlinked. Lose
4010 * the reference from linking right away.
4011 */ 4077 */
4012 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); 4078 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
4013 cfqd->oom_cfqq.ref++; 4079 cfqd->oom_cfqq.ref++;
4080 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
4014 4081
4015 spin_lock_irq(q->queue_lock); 4082 INIT_LIST_HEAD(&cfqd->cic_list);
4016 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group); 4083
4017 cfqg_put(cfqd->root_group); 4084 cfqd->queue = q;
4018 spin_unlock_irq(q->queue_lock);
4019 4085
4020 init_timer(&cfqd->idle_slice_timer); 4086 init_timer(&cfqd->idle_slice_timer);
4021 cfqd->idle_slice_timer.function = cfq_idle_slice_timer; 4087 cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
@@ -4030,7 +4096,6 @@ static int cfq_init_queue(struct request_queue *q)
4030 cfqd->cfq_back_penalty = cfq_back_penalty; 4096 cfqd->cfq_back_penalty = cfq_back_penalty;
4031 cfqd->cfq_slice[0] = cfq_slice_async; 4097 cfqd->cfq_slice[0] = cfq_slice_async;
4032 cfqd->cfq_slice[1] = cfq_slice_sync; 4098 cfqd->cfq_slice[1] = cfq_slice_sync;
4033 cfqd->cfq_target_latency = cfq_target_latency;
4034 cfqd->cfq_slice_async_rq = cfq_slice_async_rq; 4099 cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
4035 cfqd->cfq_slice_idle = cfq_slice_idle; 4100 cfqd->cfq_slice_idle = cfq_slice_idle;
4036 cfqd->cfq_group_idle = cfq_group_idle; 4101 cfqd->cfq_group_idle = cfq_group_idle;
@@ -4041,11 +4106,35 @@ static int cfq_init_queue(struct request_queue *q)
4041 * second, in order to have larger depth for async operations. 4106 * second, in order to have larger depth for async operations.
4042 */ 4107 */
4043 cfqd->last_delayed_sync = jiffies - HZ; 4108 cfqd->last_delayed_sync = jiffies - HZ;
4044 return 0; 4109 return cfqd;
4110}
4045 4111
4046out_free: 4112static void cfq_slab_kill(void)
4047 kfree(cfqd); 4113{
4048 return ret; 4114 /*
4115 * Caller already ensured that pending RCU callbacks are completed,
4116 * so we should have no busy allocations at this point.
4117 */
4118 if (cfq_pool)
4119 kmem_cache_destroy(cfq_pool);
4120 if (cfq_ioc_pool)
4121 kmem_cache_destroy(cfq_ioc_pool);
4122}
4123
4124static int __init cfq_slab_setup(void)
4125{
4126 cfq_pool = KMEM_CACHE(cfq_queue, 0);
4127 if (!cfq_pool)
4128 goto fail;
4129
4130 cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0);
4131 if (!cfq_ioc_pool)
4132 goto fail;
4133
4134 return 0;
4135fail:
4136 cfq_slab_kill();
4137 return -ENOMEM;
4049} 4138}
4050 4139
4051/* 4140/*
@@ -4086,7 +4175,6 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
4086SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); 4175SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
4087SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); 4176SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
4088SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); 4177SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
4089SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1);
4090#undef SHOW_FUNCTION 4178#undef SHOW_FUNCTION
4091 4179
4092#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ 4180#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
@@ -4120,7 +4208,6 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
4120STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, 4208STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
4121 UINT_MAX, 0); 4209 UINT_MAX, 0);
4122STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); 4210STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
4123STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1);
4124#undef STORE_FUNCTION 4211#undef STORE_FUNCTION
4125 4212
4126#define CFQ_ATTR(name) \ 4213#define CFQ_ATTR(name) \
@@ -4138,7 +4225,6 @@ static struct elv_fs_entry cfq_attrs[] = {
4138 CFQ_ATTR(slice_idle), 4225 CFQ_ATTR(slice_idle),
4139 CFQ_ATTR(group_idle), 4226 CFQ_ATTR(group_idle),
4140 CFQ_ATTR(low_latency), 4227 CFQ_ATTR(low_latency),
4141 CFQ_ATTR(target_latency),
4142 __ATTR_NULL 4228 __ATTR_NULL
4143}; 4229};
4144 4230
@@ -4156,35 +4242,32 @@ static struct elevator_type iosched_cfq = {
4156 .elevator_completed_req_fn = cfq_completed_request, 4242 .elevator_completed_req_fn = cfq_completed_request,
4157 .elevator_former_req_fn = elv_rb_former_request, 4243 .elevator_former_req_fn = elv_rb_former_request,
4158 .elevator_latter_req_fn = elv_rb_latter_request, 4244 .elevator_latter_req_fn = elv_rb_latter_request,
4159 .elevator_init_icq_fn = cfq_init_icq,
4160 .elevator_exit_icq_fn = cfq_exit_icq,
4161 .elevator_set_req_fn = cfq_set_request, 4245 .elevator_set_req_fn = cfq_set_request,
4162 .elevator_put_req_fn = cfq_put_request, 4246 .elevator_put_req_fn = cfq_put_request,
4163 .elevator_may_queue_fn = cfq_may_queue, 4247 .elevator_may_queue_fn = cfq_may_queue,
4164 .elevator_init_fn = cfq_init_queue, 4248 .elevator_init_fn = cfq_init_queue,
4165 .elevator_exit_fn = cfq_exit_queue, 4249 .elevator_exit_fn = cfq_exit_queue,
4250 .trim = cfq_free_io_context,
4166 }, 4251 },
4167 .icq_size = sizeof(struct cfq_io_cq),
4168 .icq_align = __alignof__(struct cfq_io_cq),
4169 .elevator_attrs = cfq_attrs, 4252 .elevator_attrs = cfq_attrs,
4170 .elevator_name = "cfq", 4253 .elevator_name = "cfq",
4171 .elevator_owner = THIS_MODULE, 4254 .elevator_owner = THIS_MODULE,
4172}; 4255};
4173 4256
4174#ifdef CONFIG_CFQ_GROUP_IOSCHED 4257#ifdef CONFIG_CFQ_GROUP_IOSCHED
4175static struct blkcg_policy blkcg_policy_cfq = { 4258static struct blkio_policy_type blkio_policy_cfq = {
4176 .pd_size = sizeof(struct cfq_group), 4259 .ops = {
4177 .cftypes = cfq_blkcg_files, 4260 .blkio_unlink_group_fn = cfq_unlink_blkio_group,
4178 4261 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight,
4179 .pd_init_fn = cfq_pd_init, 4262 },
4180 .pd_reset_stats_fn = cfq_pd_reset_stats, 4263 .plid = BLKIO_POLICY_PROP,
4181}; 4264};
4265#else
4266static struct blkio_policy_type blkio_policy_cfq;
4182#endif 4267#endif
4183 4268
4184static int __init cfq_init(void) 4269static int __init cfq_init(void)
4185{ 4270{
4186 int ret;
4187
4188 /* 4271 /*
4189 * could be 0 on HZ < 1000 setups 4272 * could be 0 on HZ < 1000 setups
4190 */ 4273 */
@@ -4196,41 +4279,35 @@ static int __init cfq_init(void)
4196#ifdef CONFIG_CFQ_GROUP_IOSCHED 4279#ifdef CONFIG_CFQ_GROUP_IOSCHED
4197 if (!cfq_group_idle) 4280 if (!cfq_group_idle)
4198 cfq_group_idle = 1; 4281 cfq_group_idle = 1;
4199
4200 ret = blkcg_policy_register(&blkcg_policy_cfq);
4201 if (ret)
4202 return ret;
4203#else 4282#else
4204 cfq_group_idle = 0; 4283 cfq_group_idle = 0;
4205#endif 4284#endif
4285 if (cfq_slab_setup())
4286 return -ENOMEM;
4206 4287
4207 ret = -ENOMEM; 4288 elv_register(&iosched_cfq);
4208 cfq_pool = KMEM_CACHE(cfq_queue, 0); 4289 blkio_policy_register(&blkio_policy_cfq);
4209 if (!cfq_pool)
4210 goto err_pol_unreg;
4211
4212 ret = elv_register(&iosched_cfq);
4213 if (ret)
4214 goto err_free_pool;
4215 4290
4216 return 0; 4291 return 0;
4217
4218err_free_pool:
4219 kmem_cache_destroy(cfq_pool);
4220err_pol_unreg:
4221#ifdef CONFIG_CFQ_GROUP_IOSCHED
4222 blkcg_policy_unregister(&blkcg_policy_cfq);
4223#endif
4224 return ret;
4225} 4292}
4226 4293
4227static void __exit cfq_exit(void) 4294static void __exit cfq_exit(void)
4228{ 4295{
4229#ifdef CONFIG_CFQ_GROUP_IOSCHED 4296 DECLARE_COMPLETION_ONSTACK(all_gone);
4230 blkcg_policy_unregister(&blkcg_policy_cfq); 4297 blkio_policy_unregister(&blkio_policy_cfq);
4231#endif
4232 elv_unregister(&iosched_cfq); 4298 elv_unregister(&iosched_cfq);
4233 kmem_cache_destroy(cfq_pool); 4299 ioc_gone = &all_gone;
4300 /* ioc_gone's update must be visible before reading ioc_count */
4301 smp_wmb();
4302
4303 /*
4304 * this also protects us from entering cfq_slab_kill() with
4305 * pending RCU callbacks
4306 */
4307 if (elv_ioc_count_read(cfq_ioc_count))
4308 wait_for_completion(&all_gone);
4309 ida_destroy(&cic_index_ida);
4310 cfq_slab_kill();
4234} 4311}
4235 4312
4236module_init(cfq_init); 4313module_init(cfq_init);
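Editor's note: the cfq-iosched.c hunks above restore the old per-process cfq_io_context machinery, including cfq_cic_lookup(), which consults a one-entry last-hit cache (ioc->ioc_data) before walking the radix tree, as its own comment says. Below is a minimal userspace sketch of that lookup-cache pattern, assuming nothing beyond standard C: it is not kernel code, every name in it is made up, and a sorted array with bsearch() stands in for the radix tree.

/*
 * Editor's sketch (not kernel code): a one-entry "last hit" cache in
 * front of a slower lookup, the pattern the restored cfq_cic_lookup()
 * uses via ioc->ioc_data.  All names here are hypothetical.
 */
#include <stdio.h>
#include <stdlib.h>

struct entry {
        int key;
        const char *val;
};

struct table {
        struct entry *slots;    /* sorted by key */
        size_t nr;
        struct entry *last_hit; /* one-entry cache, may be NULL */
};

static int cmp_key(const void *a, const void *b)
{
        const int *key = a;
        const struct entry *e = b;

        return (*key > e->key) - (*key < e->key);
}

static struct entry *table_lookup(struct table *t, int key)
{
        struct entry *e;

        /* fast path: the last lookup hit the same key */
        if (t->last_hit && t->last_hit->key == key)
                return t->last_hit;

        /* slow path: binary search stands in for the radix-tree walk */
        e = bsearch(&key, t->slots, t->nr, sizeof(*t->slots), cmp_key);
        if (e)
                t->last_hit = e;        /* remember for next time */
        return e;
}

int main(void)
{
        struct entry slots[] = { {1, "one"}, {3, "three"}, {7, "seven"} };
        struct table t = { slots, 3, NULL };

        printf("%s\n", table_lookup(&t, 3)->val);       /* slow path */
        printf("%s\n", table_lookup(&t, 3)->val);       /* cached fast path */
        return 0;
}

The point of the cache is simply that repeated lookups for the same key skip the slower search entirely, which is the common case when one process keeps issuing I/O to the same device.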
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 7c668c8a6f9..7b725020823 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -719,9 +719,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
719 case BLKSECTGET: 719 case BLKSECTGET:
720 return compat_put_ushort(arg, 720 return compat_put_ushort(arg,
721 queue_max_sectors(bdev_get_queue(bdev))); 721 queue_max_sectors(bdev_get_queue(bdev)));
722 case BLKROTATIONAL:
723 return compat_put_ushort(arg,
724 !blk_queue_nonrot(bdev_get_queue(bdev)));
725 case BLKRASET: /* compatible, but no compat_ptr (!) */ 722 case BLKRASET: /* compatible, but no compat_ptr (!) */
726 case BLKFRASET: 723 case BLKFRASET:
727 if (!capable(CAP_SYS_ADMIN)) 724 if (!capable(CAP_SYS_ADMIN))
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 90037b5eb17..c644137d9cd 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -230,7 +230,7 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
230 /* 230 /*
231 * rq is expired! 231 * rq is expired!
232 */ 232 */
233 if (time_after_eq(jiffies, rq_fifo_time(rq))) 233 if (time_after(jiffies, rq_fifo_time(rq)))
234 return 1; 234 return 1;
235 235
236 return 0; 236 return 0;
@@ -337,13 +337,13 @@ static void deadline_exit_queue(struct elevator_queue *e)
337/* 337/*
338 * initialize elevator private data (deadline_data). 338 * initialize elevator private data (deadline_data).
339 */ 339 */
340static int deadline_init_queue(struct request_queue *q) 340static void *deadline_init_queue(struct request_queue *q)
341{ 341{
342 struct deadline_data *dd; 342 struct deadline_data *dd;
343 343
344 dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node); 344 dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node);
345 if (!dd) 345 if (!dd)
346 return -ENOMEM; 346 return NULL;
347 347
348 INIT_LIST_HEAD(&dd->fifo_list[READ]); 348 INIT_LIST_HEAD(&dd->fifo_list[READ]);
349 INIT_LIST_HEAD(&dd->fifo_list[WRITE]); 349 INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
@@ -354,9 +354,7 @@ static int deadline_init_queue(struct request_queue *q)
354 dd->writes_starved = writes_starved; 354 dd->writes_starved = writes_starved;
355 dd->front_merges = 1; 355 dd->front_merges = 1;
356 dd->fifo_batch = fifo_batch; 356 dd->fifo_batch = fifo_batch;
357 357 return dd;
358 q->elevator->elevator_data = dd;
359 return 0;
360} 358}
361 359
362/* 360/*
@@ -450,7 +448,9 @@ static struct elevator_type iosched_deadline = {
450 448
451static int __init deadline_init(void) 449static int __init deadline_init(void)
452{ 450{
453 return elv_register(&iosched_deadline); 451 elv_register(&iosched_deadline);
452
453 return 0;
454} 454}
455 455
456static void __exit deadline_exit(void) 456static void __exit deadline_exit(void)
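Editor's note: the deadline-iosched.c hunk above swaps time_after_eq() for time_after() in the FIFO expiry check. Both macros compare jiffies-style free-running counters in a wraparound-safe way by testing the sign of their difference rather than comparing magnitudes. A small userspace sketch of that idea follows; counter_after() is a hypothetical name and the real kernel macros differ in detail.

/*
 * Editor's sketch (not kernel code): wraparound-safe "is a later than b?"
 * for free-running counters, the idea behind time_after()/time_after_eq().
 */
#include <stdio.h>
#include <limits.h>

/* true if counter value a is later than b, even across wraparound;
 * relies on the usual two's-complement conversion of the unsigned
 * difference, the same trick the kernel macros depend on */
static int counter_after(unsigned long a, unsigned long b)
{
        return (long)(b - a) < 0;
}

int main(void)
{
        unsigned long before_wrap = ULONG_MAX - 5;
        unsigned long after_wrap = 10;  /* counter has wrapped past 0 */

        /* a naive magnitude compare gets this wrong; the signed
         * difference does not */
        printf("naive:  %d\n", after_wrap > before_wrap);               /* 0 */
        printf("robust: %d\n", counter_after(after_wrap, before_wrap)); /* 1 */
        return 0;
}

With a plain "a > b" the expiry test would misfire for a full counter period after jiffies wraps; the signed-difference form keeps working as long as the two timestamps are less than half the counter range apart.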
diff --git a/block/elevator.c b/block/elevator.c
index 9edba1b8323..a3b64bc71d8 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -31,6 +31,7 @@
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/init.h> 32#include <linux/init.h>
33#include <linux/compiler.h> 33#include <linux/compiler.h>
34#include <linux/delay.h>
34#include <linux/blktrace_api.h> 35#include <linux/blktrace_api.h>
35#include <linux/hash.h> 36#include <linux/hash.h>
36#include <linux/uaccess.h> 37#include <linux/uaccess.h>
@@ -38,7 +39,6 @@
38#include <trace/events/block.h> 39#include <trace/events/block.h>
39 40
40#include "blk.h" 41#include "blk.h"
41#include "blk-cgroup.h"
42 42
43static DEFINE_SPINLOCK(elv_list_lock); 43static DEFINE_SPINLOCK(elv_list_lock);
44static LIST_HEAD(elv_list); 44static LIST_HEAD(elv_list);
@@ -62,8 +62,8 @@ static int elv_iosched_allow_merge(struct request *rq, struct bio *bio)
62 struct request_queue *q = rq->q; 62 struct request_queue *q = rq->q;
63 struct elevator_queue *e = q->elevator; 63 struct elevator_queue *e = q->elevator;
64 64
65 if (e->type->ops.elevator_allow_merge_fn) 65 if (e->ops->elevator_allow_merge_fn)
66 return e->type->ops.elevator_allow_merge_fn(q, rq, bio); 66 return e->ops->elevator_allow_merge_fn(q, rq, bio);
67 67
68 return 1; 68 return 1;
69} 69}
@@ -71,9 +71,39 @@ static int elv_iosched_allow_merge(struct request *rq, struct bio *bio)
71/* 71/*
72 * can we safely merge with this request? 72 * can we safely merge with this request?
73 */ 73 */
74bool elv_rq_merge_ok(struct request *rq, struct bio *bio) 74int elv_rq_merge_ok(struct request *rq, struct bio *bio)
75{ 75{
76 if (!blk_rq_merge_ok(rq, bio)) 76 if (!rq_mergeable(rq))
77 return 0;
78
79 /*
80 * Don't merge file system requests and discard requests
81 */
82 if ((bio->bi_rw & REQ_DISCARD) != (rq->bio->bi_rw & REQ_DISCARD))
83 return 0;
84
85 /*
86 * Don't merge discard requests and secure discard requests
87 */
88 if ((bio->bi_rw & REQ_SECURE) != (rq->bio->bi_rw & REQ_SECURE))
89 return 0;
90
91 /*
92 * different data direction or already started, don't merge
93 */
94 if (bio_data_dir(bio) != rq_data_dir(rq))
95 return 0;
96
97 /*
98 * must be same device and not a special request
99 */
100 if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special)
101 return 0;
102
103 /*
104 * only merge integrity protected bio into ditto rq
105 */
106 if (bio_integrity(bio) != blk_integrity_rq(rq))
77 return 0; 107 return 0;
78 108
79 if (!elv_iosched_allow_merge(rq, bio)) 109 if (!elv_iosched_allow_merge(rq, bio))
@@ -83,6 +113,23 @@ bool elv_rq_merge_ok(struct request *rq, struct bio *bio)
83} 113}
84EXPORT_SYMBOL(elv_rq_merge_ok); 114EXPORT_SYMBOL(elv_rq_merge_ok);
85 115
116int elv_try_merge(struct request *__rq, struct bio *bio)
117{
118 int ret = ELEVATOR_NO_MERGE;
119
120 /*
121 * we can merge and sequence is ok, check if it's possible
122 */
123 if (elv_rq_merge_ok(__rq, bio)) {
124 if (blk_rq_pos(__rq) + blk_rq_sectors(__rq) == bio->bi_sector)
125 ret = ELEVATOR_BACK_MERGE;
126 else if (blk_rq_pos(__rq) - bio_sectors(bio) == bio->bi_sector)
127 ret = ELEVATOR_FRONT_MERGE;
128 }
129
130 return ret;
131}
132
86static struct elevator_type *elevator_find(const char *name) 133static struct elevator_type *elevator_find(const char *name)
87{ 134{
88 struct elevator_type *e; 135 struct elevator_type *e;
@@ -122,7 +169,20 @@ static struct elevator_type *elevator_get(const char *name)
122 return e; 169 return e;
123} 170}
124 171
125static char chosen_elevator[ELV_NAME_MAX]; 172static void *elevator_init_queue(struct request_queue *q,
173 struct elevator_queue *eq)
174{
175 return eq->ops->elevator_init_fn(q);
176}
177
178static void elevator_attach(struct request_queue *q, struct elevator_queue *eq,
179 void *data)
180{
181 q->elevator = eq;
182 eq->elevator_data = data;
183}
184
185static char chosen_elevator[16];
126 186
127static int __init elevator_setup(char *str) 187static int __init elevator_setup(char *str)
128{ 188{
@@ -148,7 +208,8 @@ static struct elevator_queue *elevator_alloc(struct request_queue *q,
148 if (unlikely(!eq)) 208 if (unlikely(!eq))
149 goto err; 209 goto err;
150 210
151 eq->type = e; 211 eq->ops = &e->ops;
212 eq->elevator_type = e;
152 kobject_init(&eq->kobj, &elv_ktype); 213 kobject_init(&eq->kobj, &elv_ktype);
153 mutex_init(&eq->sysfs_lock); 214 mutex_init(&eq->sysfs_lock);
154 215
@@ -172,7 +233,7 @@ static void elevator_release(struct kobject *kobj)
172 struct elevator_queue *e; 233 struct elevator_queue *e;
173 234
174 e = container_of(kobj, struct elevator_queue, kobj); 235 e = container_of(kobj, struct elevator_queue, kobj);
175 elevator_put(e->type); 236 elevator_put(e->elevator_type);
176 kfree(e->hash); 237 kfree(e->hash);
177 kfree(e); 238 kfree(e);
178} 239}
@@ -180,7 +241,8 @@ static void elevator_release(struct kobject *kobj)
180int elevator_init(struct request_queue *q, char *name) 241int elevator_init(struct request_queue *q, char *name)
181{ 242{
182 struct elevator_type *e = NULL; 243 struct elevator_type *e = NULL;
183 int err; 244 struct elevator_queue *eq;
245 void *data;
184 246
185 if (unlikely(q->elevator)) 247 if (unlikely(q->elevator))
186 return 0; 248 return 0;
@@ -213,16 +275,17 @@ int elevator_init(struct request_queue *q, char *name)
213 } 275 }
214 } 276 }
215 277
216 q->elevator = elevator_alloc(q, e); 278 eq = elevator_alloc(q, e);
217 if (!q->elevator) 279 if (!eq)
218 return -ENOMEM; 280 return -ENOMEM;
219 281
220 err = e->ops.elevator_init_fn(q); 282 data = elevator_init_queue(q, eq);
221 if (err) { 283 if (!data) {
222 kobject_put(&q->elevator->kobj); 284 kobject_put(&eq->kobj);
223 return err; 285 return -ENOMEM;
224 } 286 }
225 287
288 elevator_attach(q, eq, data);
226 return 0; 289 return 0;
227} 290}
228EXPORT_SYMBOL(elevator_init); 291EXPORT_SYMBOL(elevator_init);
@@ -230,8 +293,9 @@ EXPORT_SYMBOL(elevator_init);
230void elevator_exit(struct elevator_queue *e) 293void elevator_exit(struct elevator_queue *e)
231{ 294{
232 mutex_lock(&e->sysfs_lock); 295 mutex_lock(&e->sysfs_lock);
233 if (e->type->ops.elevator_exit_fn) 296 if (e->ops->elevator_exit_fn)
234 e->type->ops.elevator_exit_fn(e); 297 e->ops->elevator_exit_fn(e);
298 e->ops = NULL;
235 mutex_unlock(&e->sysfs_lock); 299 mutex_unlock(&e->sysfs_lock);
236 300
237 kobject_put(&e->kobj); 301 kobject_put(&e->kobj);
@@ -421,8 +485,8 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
421 /* 485 /*
422 * First try one-hit cache. 486 * First try one-hit cache.
423 */ 487 */
424 if (q->last_merge && elv_rq_merge_ok(q->last_merge, bio)) { 488 if (q->last_merge) {
425 ret = blk_try_merge(q->last_merge, bio); 489 ret = elv_try_merge(q->last_merge, bio);
426 if (ret != ELEVATOR_NO_MERGE) { 490 if (ret != ELEVATOR_NO_MERGE) {
427 *req = q->last_merge; 491 *req = q->last_merge;
428 return ret; 492 return ret;
@@ -441,8 +505,8 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
441 return ELEVATOR_BACK_MERGE; 505 return ELEVATOR_BACK_MERGE;
442 } 506 }
443 507
444 if (e->type->ops.elevator_merge_fn) 508 if (e->ops->elevator_merge_fn)
445 return e->type->ops.elevator_merge_fn(q, req, bio); 509 return e->ops->elevator_merge_fn(q, req, bio);
446 510
447 return ELEVATOR_NO_MERGE; 511 return ELEVATOR_NO_MERGE;
448} 512}
@@ -458,7 +522,6 @@ static bool elv_attempt_insert_merge(struct request_queue *q,
458 struct request *rq) 522 struct request *rq)
459{ 523{
460 struct request *__rq; 524 struct request *__rq;
461 bool ret;
462 525
463 if (blk_queue_nomerges(q)) 526 if (blk_queue_nomerges(q))
464 return false; 527 return false;
@@ -472,29 +535,22 @@ static bool elv_attempt_insert_merge(struct request_queue *q,
472 if (blk_queue_noxmerges(q)) 535 if (blk_queue_noxmerges(q))
473 return false; 536 return false;
474 537
475 ret = false;
476 /* 538 /*
477 * See if our hash lookup can find a potential backmerge. 539 * See if our hash lookup can find a potential backmerge.
478 */ 540 */
479 while (1) { 541 __rq = elv_rqhash_find(q, blk_rq_pos(rq));
480 __rq = elv_rqhash_find(q, blk_rq_pos(rq)); 542 if (__rq && blk_attempt_req_merge(q, __rq, rq))
481 if (!__rq || !blk_attempt_req_merge(q, __rq, rq)) 543 return true;
482 break;
483
484 /* The merged request could be merged with others, try again */
485 ret = true;
486 rq = __rq;
487 }
488 544
489 return ret; 545 return false;
490} 546}
491 547
492void elv_merged_request(struct request_queue *q, struct request *rq, int type) 548void elv_merged_request(struct request_queue *q, struct request *rq, int type)
493{ 549{
494 struct elevator_queue *e = q->elevator; 550 struct elevator_queue *e = q->elevator;
495 551
496 if (e->type->ops.elevator_merged_fn) 552 if (e->ops->elevator_merged_fn)
497 e->type->ops.elevator_merged_fn(q, rq, type); 553 e->ops->elevator_merged_fn(q, rq, type);
498 554
499 if (type == ELEVATOR_BACK_MERGE) 555 if (type == ELEVATOR_BACK_MERGE)
500 elv_rqhash_reposition(q, rq); 556 elv_rqhash_reposition(q, rq);
@@ -508,8 +564,8 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
508 struct elevator_queue *e = q->elevator; 564 struct elevator_queue *e = q->elevator;
509 const int next_sorted = next->cmd_flags & REQ_SORTED; 565 const int next_sorted = next->cmd_flags & REQ_SORTED;
510 566
511 if (next_sorted && e->type->ops.elevator_merge_req_fn) 567 if (next_sorted && e->ops->elevator_merge_req_fn)
512 e->type->ops.elevator_merge_req_fn(q, rq, next); 568 e->ops->elevator_merge_req_fn(q, rq, next);
513 569
514 elv_rqhash_reposition(q, rq); 570 elv_rqhash_reposition(q, rq);
515 571
@@ -526,8 +582,8 @@ void elv_bio_merged(struct request_queue *q, struct request *rq,
526{ 582{
527 struct elevator_queue *e = q->elevator; 583 struct elevator_queue *e = q->elevator;
528 584
529 if (e->type->ops.elevator_bio_merged_fn) 585 if (e->ops->elevator_bio_merged_fn)
530 e->type->ops.elevator_bio_merged_fn(q, rq, bio); 586 e->ops->elevator_bio_merged_fn(q, rq, bio);
531} 587}
532 588
533void elv_requeue_request(struct request_queue *q, struct request *rq) 589void elv_requeue_request(struct request_queue *q, struct request *rq)
@@ -550,18 +606,45 @@ void elv_requeue_request(struct request_queue *q, struct request *rq)
550void elv_drain_elevator(struct request_queue *q) 606void elv_drain_elevator(struct request_queue *q)
551{ 607{
552 static int printed; 608 static int printed;
553 609 while (q->elevator->ops->elevator_dispatch_fn(q, 1))
554 lockdep_assert_held(q->queue_lock);
555
556 while (q->elevator->type->ops.elevator_dispatch_fn(q, 1))
557 ; 610 ;
558 if (q->nr_sorted && printed++ < 10) { 611 if (q->nr_sorted == 0)
612 return;
613 if (printed++ < 10) {
559 printk(KERN_ERR "%s: forced dispatching is broken " 614 printk(KERN_ERR "%s: forced dispatching is broken "
560 "(nr_sorted=%u), please report this\n", 615 "(nr_sorted=%u), please report this\n",
561 q->elevator->type->elevator_name, q->nr_sorted); 616 q->elevator->elevator_type->elevator_name, q->nr_sorted);
617 }
618}
619
620/*
621 * Call with queue lock held, interrupts disabled
622 */
623void elv_quiesce_start(struct request_queue *q)
624{
625 if (!q->elevator)
626 return;
627
628 queue_flag_set(QUEUE_FLAG_ELVSWITCH, q);
629
630 /*
631 * make sure we don't have any requests in flight
632 */
633 elv_drain_elevator(q);
634 while (q->rq.elvpriv) {
635 __blk_run_queue(q);
636 spin_unlock_irq(q->queue_lock);
637 msleep(10);
638 spin_lock_irq(q->queue_lock);
639 elv_drain_elevator(q);
562 } 640 }
563} 641}
564 642
643void elv_quiesce_end(struct request_queue *q)
644{
645 queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
646}
647
565void __elv_add_request(struct request_queue *q, struct request *rq, int where) 648void __elv_add_request(struct request_queue *q, struct request *rq, int where)
566{ 649{
567 trace_block_rq_insert(q, rq); 650 trace_block_rq_insert(q, rq);
@@ -570,7 +653,8 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
570 653
571 if (rq->cmd_flags & REQ_SOFTBARRIER) { 654 if (rq->cmd_flags & REQ_SOFTBARRIER) {
572 /* barriers are scheduling boundary, update end_sector */ 655 /* barriers are scheduling boundary, update end_sector */
573 if (rq->cmd_type == REQ_TYPE_FS) { 656 if (rq->cmd_type == REQ_TYPE_FS ||
657 (rq->cmd_flags & REQ_DISCARD)) {
574 q->end_sector = rq_end_sector(rq); 658 q->end_sector = rq_end_sector(rq);
575 q->boundary_rq = rq; 659 q->boundary_rq = rq;
576 } 660 }
@@ -612,7 +696,8 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
612 if (elv_attempt_insert_merge(q, rq)) 696 if (elv_attempt_insert_merge(q, rq))
613 break; 697 break;
614 case ELEVATOR_INSERT_SORT: 698 case ELEVATOR_INSERT_SORT:
615 BUG_ON(rq->cmd_type != REQ_TYPE_FS); 699 BUG_ON(rq->cmd_type != REQ_TYPE_FS &&
700 !(rq->cmd_flags & REQ_DISCARD));
616 rq->cmd_flags |= REQ_SORTED; 701 rq->cmd_flags |= REQ_SORTED;
617 q->nr_sorted++; 702 q->nr_sorted++;
618 if (rq_mergeable(rq)) { 703 if (rq_mergeable(rq)) {
@@ -626,7 +711,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
626 * rq cannot be accessed after calling 711 * rq cannot be accessed after calling
627 * elevator_add_req_fn. 712 * elevator_add_req_fn.
628 */ 713 */
629 q->elevator->type->ops.elevator_add_req_fn(q, rq); 714 q->elevator->ops->elevator_add_req_fn(q, rq);
630 break; 715 break;
631 716
632 case ELEVATOR_INSERT_FLUSH: 717 case ELEVATOR_INSERT_FLUSH:
@@ -655,8 +740,8 @@ struct request *elv_latter_request(struct request_queue *q, struct request *rq)
655{ 740{
656 struct elevator_queue *e = q->elevator; 741 struct elevator_queue *e = q->elevator;
657 742
658 if (e->type->ops.elevator_latter_req_fn) 743 if (e->ops->elevator_latter_req_fn)
659 return e->type->ops.elevator_latter_req_fn(q, rq); 744 return e->ops->elevator_latter_req_fn(q, rq);
660 return NULL; 745 return NULL;
661} 746}
662 747
@@ -664,18 +749,19 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
664{ 749{
665 struct elevator_queue *e = q->elevator; 750 struct elevator_queue *e = q->elevator;
666 751
667 if (e->type->ops.elevator_former_req_fn) 752 if (e->ops->elevator_former_req_fn)
668 return e->type->ops.elevator_former_req_fn(q, rq); 753 return e->ops->elevator_former_req_fn(q, rq);
669 return NULL; 754 return NULL;
670} 755}
671 756
672int elv_set_request(struct request_queue *q, struct request *rq, 757int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
673 struct bio *bio, gfp_t gfp_mask)
674{ 758{
675 struct elevator_queue *e = q->elevator; 759 struct elevator_queue *e = q->elevator;
676 760
677 if (e->type->ops.elevator_set_req_fn) 761 if (e->ops->elevator_set_req_fn)
678 return e->type->ops.elevator_set_req_fn(q, rq, bio, gfp_mask); 762 return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
763
764 rq->elevator_private[0] = NULL;
679 return 0; 765 return 0;
680} 766}
681 767
@@ -683,16 +769,16 @@ void elv_put_request(struct request_queue *q, struct request *rq)
683{ 769{
684 struct elevator_queue *e = q->elevator; 770 struct elevator_queue *e = q->elevator;
685 771
686 if (e->type->ops.elevator_put_req_fn) 772 if (e->ops->elevator_put_req_fn)
687 e->type->ops.elevator_put_req_fn(rq); 773 e->ops->elevator_put_req_fn(rq);
688} 774}
689 775
690int elv_may_queue(struct request_queue *q, int rw) 776int elv_may_queue(struct request_queue *q, int rw)
691{ 777{
692 struct elevator_queue *e = q->elevator; 778 struct elevator_queue *e = q->elevator;
693 779
694 if (e->type->ops.elevator_may_queue_fn) 780 if (e->ops->elevator_may_queue_fn)
695 return e->type->ops.elevator_may_queue_fn(q, rw); 781 return e->ops->elevator_may_queue_fn(q, rw);
696 782
697 return ELV_MQUEUE_MAY; 783 return ELV_MQUEUE_MAY;
698} 784}
@@ -727,8 +813,8 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
727 if (blk_account_rq(rq)) { 813 if (blk_account_rq(rq)) {
728 q->in_flight[rq_is_sync(rq)]--; 814 q->in_flight[rq_is_sync(rq)]--;
729 if ((rq->cmd_flags & REQ_SORTED) && 815 if ((rq->cmd_flags & REQ_SORTED) &&
730 e->type->ops.elevator_completed_req_fn) 816 e->ops->elevator_completed_req_fn)
731 e->type->ops.elevator_completed_req_fn(q, rq); 817 e->ops->elevator_completed_req_fn(q, rq);
732 } 818 }
733} 819}
734 820
@@ -746,7 +832,7 @@ elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
746 832
747 e = container_of(kobj, struct elevator_queue, kobj); 833 e = container_of(kobj, struct elevator_queue, kobj);
748 mutex_lock(&e->sysfs_lock); 834 mutex_lock(&e->sysfs_lock);
749 error = e->type ? entry->show(e, page) : -ENOENT; 835 error = e->ops ? entry->show(e, page) : -ENOENT;
750 mutex_unlock(&e->sysfs_lock); 836 mutex_unlock(&e->sysfs_lock);
751 return error; 837 return error;
752} 838}
@@ -764,7 +850,7 @@ elv_attr_store(struct kobject *kobj, struct attribute *attr,
764 850
765 e = container_of(kobj, struct elevator_queue, kobj); 851 e = container_of(kobj, struct elevator_queue, kobj);
766 mutex_lock(&e->sysfs_lock); 852 mutex_lock(&e->sysfs_lock);
767 error = e->type ? entry->store(e, page, length) : -ENOENT; 853 error = e->ops ? entry->store(e, page, length) : -ENOENT;
768 mutex_unlock(&e->sysfs_lock); 854 mutex_unlock(&e->sysfs_lock);
769 return error; 855 return error;
770} 856}
@@ -786,7 +872,7 @@ int elv_register_queue(struct request_queue *q)
786 872
787 error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); 873 error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
788 if (!error) { 874 if (!error) {
789 struct elv_fs_entry *attr = e->type->elevator_attrs; 875 struct elv_fs_entry *attr = e->elevator_type->elevator_attrs;
790 if (attr) { 876 if (attr) {
791 while (attr->attr.name) { 877 while (attr->attr.name) {
792 if (sysfs_create_file(&e->kobj, &attr->attr)) 878 if (sysfs_create_file(&e->kobj, &attr->attr))
@@ -801,48 +887,29 @@ int elv_register_queue(struct request_queue *q)
801} 887}
802EXPORT_SYMBOL(elv_register_queue); 888EXPORT_SYMBOL(elv_register_queue);
803 889
804void elv_unregister_queue(struct request_queue *q) 890static void __elv_unregister_queue(struct elevator_queue *e)
805{ 891{
806 if (q) { 892 kobject_uevent(&e->kobj, KOBJ_REMOVE);
807 struct elevator_queue *e = q->elevator; 893 kobject_del(&e->kobj);
894 e->registered = 0;
895}
808 896
809 kobject_uevent(&e->kobj, KOBJ_REMOVE); 897void elv_unregister_queue(struct request_queue *q)
810 kobject_del(&e->kobj); 898{
811 e->registered = 0; 899 if (q)
812 } 900 __elv_unregister_queue(q->elevator);
813} 901}
814EXPORT_SYMBOL(elv_unregister_queue); 902EXPORT_SYMBOL(elv_unregister_queue);
815 903
816int elv_register(struct elevator_type *e) 904void elv_register(struct elevator_type *e)
817{ 905{
818 char *def = ""; 906 char *def = "";
819 907
820 /* create icq_cache if requested */
821 if (e->icq_size) {
822 if (WARN_ON(e->icq_size < sizeof(struct io_cq)) ||
823 WARN_ON(e->icq_align < __alignof__(struct io_cq)))
824 return -EINVAL;
825
826 snprintf(e->icq_cache_name, sizeof(e->icq_cache_name),
827 "%s_io_cq", e->elevator_name);
828 e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size,
829 e->icq_align, 0, NULL);
830 if (!e->icq_cache)
831 return -ENOMEM;
832 }
833
834 /* register, don't allow duplicate names */
835 spin_lock(&elv_list_lock); 908 spin_lock(&elv_list_lock);
836 if (elevator_find(e->elevator_name)) { 909 BUG_ON(elevator_find(e->elevator_name));
837 spin_unlock(&elv_list_lock);
838 if (e->icq_cache)
839 kmem_cache_destroy(e->icq_cache);
840 return -EBUSY;
841 }
842 list_add_tail(&e->list, &elv_list); 910 list_add_tail(&e->list, &elv_list);
843 spin_unlock(&elv_list_lock); 911 spin_unlock(&elv_list_lock);
844 912
845 /* print pretty message */
846 if (!strcmp(e->elevator_name, chosen_elevator) || 913 if (!strcmp(e->elevator_name, chosen_elevator) ||
847 (!*chosen_elevator && 914 (!*chosen_elevator &&
848 !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED))) 915 !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED)))
@@ -850,26 +917,30 @@ int elv_register(struct elevator_type *e)
850 917
851 printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, 918 printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name,
852 def); 919 def);
853 return 0;
854} 920}
855EXPORT_SYMBOL_GPL(elv_register); 921EXPORT_SYMBOL_GPL(elv_register);
856 922
857void elv_unregister(struct elevator_type *e) 923void elv_unregister(struct elevator_type *e)
858{ 924{
859 /* unregister */ 925 struct task_struct *g, *p;
860 spin_lock(&elv_list_lock);
861 list_del_init(&e->list);
862 spin_unlock(&elv_list_lock);
863 926
864 /* 927 /*
865 * Destroy icq_cache if it exists. icq's are RCU managed. Make 928 * Iterate every thread in the process to remove the io contexts.
866 * sure all RCU operations are complete before proceeding.
867 */ 929 */
868 if (e->icq_cache) { 930 if (e->ops.trim) {
869 rcu_barrier(); 931 read_lock(&tasklist_lock);
870 kmem_cache_destroy(e->icq_cache); 932 do_each_thread(g, p) {
871 e->icq_cache = NULL; 933 task_lock(p);
934 if (p->io_context)
935 e->ops.trim(p->io_context);
936 task_unlock(p);
937 } while_each_thread(g, p);
938 read_unlock(&tasklist_lock);
872 } 939 }
940
941 spin_lock(&elv_list_lock);
942 list_del_init(&e->list);
943 spin_unlock(&elv_list_lock);
873} 944}
874EXPORT_SYMBOL_GPL(elv_unregister); 945EXPORT_SYMBOL_GPL(elv_unregister);
875 946
@@ -881,60 +952,73 @@ EXPORT_SYMBOL_GPL(elv_unregister);
881 */ 952 */
882static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) 953static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
883{ 954{
884 struct elevator_queue *old = q->elevator; 955 struct elevator_queue *old_elevator, *e;
885 bool registered = old->registered; 956 void *data;
886 int err; 957 int err;
887 958
888 /* 959 /*
889 * Turn on BYPASS and drain all requests w/ elevator private data. 960 * Allocate new elevator
890 * Block layer doesn't call into a quiesced elevator - all requests
891 * are directly put on the dispatch list without elevator data
892 * using INSERT_BACK. All requests have SOFTBARRIER set and no
893 * merge happens either.
894 */ 961 */
895 blk_queue_bypass_start(q); 962 e = elevator_alloc(q, new_e);
963 if (!e)
964 return -ENOMEM;
896 965
897 /* unregister and clear all auxiliary data of the old elevator */ 966 data = elevator_init_queue(q, e);
898 if (registered) 967 if (!data) {
899 elv_unregister_queue(q); 968 kobject_put(&e->kobj);
969 return -ENOMEM;
970 }
900 971
972 /*
973 * Turn on BYPASS and drain all requests w/ elevator private data
974 */
901 spin_lock_irq(q->queue_lock); 975 spin_lock_irq(q->queue_lock);
902 ioc_clear_queue(q); 976 elv_quiesce_start(q);
903 spin_unlock_irq(q->queue_lock);
904 977
905 /* allocate, init and register new elevator */ 978 /*
906 err = -ENOMEM; 979 * Remember old elevator.
907 q->elevator = elevator_alloc(q, new_e); 980 */
908 if (!q->elevator) 981 old_elevator = q->elevator;
909 goto fail_init;
910 982
911 err = new_e->ops.elevator_init_fn(q); 983 /*
912 if (err) { 984 * attach and start new elevator
913 kobject_put(&q->elevator->kobj); 985 */
914 goto fail_init; 986 elevator_attach(q, e, data);
915 } 987
988 spin_unlock_irq(q->queue_lock);
989
990 if (old_elevator->registered) {
991 __elv_unregister_queue(old_elevator);
916 992
917 if (registered) {
918 err = elv_register_queue(q); 993 err = elv_register_queue(q);
919 if (err) 994 if (err)
920 goto fail_register; 995 goto fail_register;
921 } 996 }
922 997
923 /* done, kill the old one and finish */ 998 /*
924 elevator_exit(old); 999 * finally exit old elevator and turn off BYPASS.
925 blk_queue_bypass_end(q); 1000 */
1001 elevator_exit(old_elevator);
1002 spin_lock_irq(q->queue_lock);
1003 elv_quiesce_end(q);
1004 spin_unlock_irq(q->queue_lock);
926 1005
927 blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); 1006 blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name);
928 1007
929 return 0; 1008 return 0;
930 1009
931fail_register: 1010fail_register:
932 elevator_exit(q->elevator); 1011 /*
933fail_init: 1012 * switch failed, exit the new io scheduler and reattach the old
934 /* switch failed, restore and re-register old elevator */ 1013 * one again (along with re-adding the sysfs dir)
935 q->elevator = old; 1014 */
1015 elevator_exit(e);
1016 q->elevator = old_elevator;
936 elv_register_queue(q); 1017 elv_register_queue(q);
937 blk_queue_bypass_end(q); 1018
1019 spin_lock_irq(q->queue_lock);
1020 queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
1021 spin_unlock_irq(q->queue_lock);
938 1022
939 return err; 1023 return err;
940} 1024}
@@ -957,7 +1041,7 @@ int elevator_change(struct request_queue *q, const char *name)
957 return -EINVAL; 1041 return -EINVAL;
958 } 1042 }
959 1043
960 if (!strcmp(elevator_name, q->elevator->type->elevator_name)) { 1044 if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) {
961 elevator_put(e); 1045 elevator_put(e);
962 return 0; 1046 return 0;
963 } 1047 }
@@ -992,7 +1076,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
992 if (!q->elevator || !blk_queue_stackable(q)) 1076 if (!q->elevator || !blk_queue_stackable(q))
993 return sprintf(name, "none\n"); 1077 return sprintf(name, "none\n");
994 1078
995 elv = e->type; 1079 elv = e->elevator_type;
996 1080
997 spin_lock(&elv_list_lock); 1081 spin_lock(&elv_list_lock);
998 list_for_each_entry(__e, &elv_list, list) { 1082 list_for_each_entry(__e, &elv_list, list) {
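The elv_iosched_show()/elevator_change() pair above is what backs the per-queue scheduler attribute in sysfs, so the registered elevators and the active one are visible from user space. A minimal C sketch of the reader side, assuming a disk named sda and sysfs mounted at /sys (both assumptions; pass another device name as argv[1]):

/* Print the I/O schedulers a queue offers; the active one is shown in brackets. */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
        const char *dev = argc > 1 ? argv[1] : "sda";   /* assumed default */
        char path[128], line[256];
        FILE *f;

        snprintf(path, sizeof(path), "/sys/block/%s/queue/scheduler", dev);
        f = fopen(path, "r");
        if (!f) {
                perror(path);
                return EXIT_FAILURE;
        }
        if (fgets(line, sizeof(line), f))
                printf("%s: %s", dev, line);    /* e.g. "noop deadline [cfq]" */
        fclose(f);
        return EXIT_SUCCESS;
}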
diff --git a/block/genhd.c b/block/genhd.c
index 9a289d7c84b..d3834710b95 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -15,6 +15,7 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/kmod.h> 16#include <linux/kmod.h>
17#include <linux/kobj_map.h> 17#include <linux/kobj_map.h>
18#include <linux/buffer_head.h>
18#include <linux/mutex.h> 19#include <linux/mutex.h>
19#include <linux/idr.h> 20#include <linux/idr.h>
20#include <linux/log2.h> 21#include <linux/log2.h>
@@ -35,7 +36,6 @@ static DEFINE_IDR(ext_devt_idr);
35 36
36static struct device_type disk_type; 37static struct device_type disk_type;
37 38
38static void disk_alloc_events(struct gendisk *disk);
39static void disk_add_events(struct gendisk *disk); 39static void disk_add_events(struct gendisk *disk);
40static void disk_del_events(struct gendisk *disk); 40static void disk_del_events(struct gendisk *disk);
41static void disk_release_events(struct gendisk *disk); 41static void disk_release_events(struct gendisk *disk);
@@ -154,7 +154,7 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter)
154 part = rcu_dereference(ptbl->part[piter->idx]); 154 part = rcu_dereference(ptbl->part[piter->idx]);
155 if (!part) 155 if (!part)
156 continue; 156 continue;
157 if (!part_nr_sects_read(part) && 157 if (!part->nr_sects &&
158 !(piter->flags & DISK_PITER_INCL_EMPTY) && 158 !(piter->flags & DISK_PITER_INCL_EMPTY) &&
159 !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && 159 !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 &&
160 piter->idx == 0)) 160 piter->idx == 0))
@@ -191,7 +191,7 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit);
191static inline int sector_in_part(struct hd_struct *part, sector_t sector) 191static inline int sector_in_part(struct hd_struct *part, sector_t sector)
192{ 192{
193 return part->start_sect <= sector && 193 return part->start_sect <= sector &&
194 sector < part->start_sect + part_nr_sects_read(part); 194 sector < part->start_sect + part->nr_sects;
195} 195}
196 196
197/** 197/**
@@ -507,7 +507,7 @@ static int exact_lock(dev_t devt, void *data)
507 return 0; 507 return 0;
508} 508}
509 509
510static void register_disk(struct gendisk *disk) 510void register_disk(struct gendisk *disk)
511{ 511{
512 struct device *ddev = disk_to_dev(disk); 512 struct device *ddev = disk_to_dev(disk);
513 struct block_device *bdev; 513 struct block_device *bdev;
@@ -536,7 +536,7 @@ static void register_disk(struct gendisk *disk)
536 disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); 536 disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
537 537
538 /* No minors to use for partitions */ 538 /* No minors to use for partitions */
539 if (!disk_part_scan_enabled(disk)) 539 if (!disk_partitionable(disk))
540 goto exit; 540 goto exit;
541 541
542 /* No such device (e.g., media were just removed) */ 542 /* No such device (e.g., media were just removed) */
@@ -602,8 +602,6 @@ void add_disk(struct gendisk *disk)
602 disk->major = MAJOR(devt); 602 disk->major = MAJOR(devt);
603 disk->first_minor = MINOR(devt); 603 disk->first_minor = MINOR(devt);
604 604
605 disk_alloc_events(disk);
606
607 /* Register BDI before referencing it from bdev */ 605 /* Register BDI before referencing it from bdev */
608 bdi = &disk->queue->backing_dev_info; 606 bdi = &disk->queue->backing_dev_info;
609 bdi_register_dev(bdi, disk_devt(disk)); 607 bdi_register_dev(bdi, disk_devt(disk));
@@ -617,7 +615,7 @@ void add_disk(struct gendisk *disk)
617 * Take an extra ref on queue which will be put on disk_release() 615 * Take an extra ref on queue which will be put on disk_release()
618 * so that it sticks around as long as @disk is there. 616 * so that it sticks around as long as @disk is there.
619 */ 617 */
620 WARN_ON_ONCE(!blk_get_queue(disk->queue)); 618 WARN_ON_ONCE(blk_get_queue(disk->queue));
621 619
622 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, 620 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
623 "bdi"); 621 "bdi");
@@ -743,6 +741,7 @@ void __init printk_all_partitions(void)
743 struct hd_struct *part; 741 struct hd_struct *part;
744 char name_buf[BDEVNAME_SIZE]; 742 char name_buf[BDEVNAME_SIZE];
745 char devt_buf[BDEVT_SIZE]; 743 char devt_buf[BDEVT_SIZE];
744 u8 uuid[PARTITION_META_INFO_UUIDLTH * 2 + 1];
746 745
747 /* 746 /*
748 * Don't show empty devices or things that have been 747 * Don't show empty devices or things that have been
@@ -761,11 +760,14 @@ void __init printk_all_partitions(void)
761 while ((part = disk_part_iter_next(&piter))) { 760 while ((part = disk_part_iter_next(&piter))) {
762 bool is_part0 = part == &disk->part0; 761 bool is_part0 = part == &disk->part0;
763 762
763 uuid[0] = 0;
764 if (part->info)
765 part_unpack_uuid(part->info->uuid, uuid);
766
764 printk("%s%s %10llu %s %s", is_part0 ? "" : " ", 767 printk("%s%s %10llu %s %s", is_part0 ? "" : " ",
765 bdevt_str(part_devt(part), devt_buf), 768 bdevt_str(part_devt(part), devt_buf),
766 (unsigned long long)part_nr_sects_read(part) >> 1 769 (unsigned long long)part->nr_sects >> 1,
767 , disk_name(disk, part->partno, name_buf), 770 disk_name(disk, part->partno, name_buf), uuid);
768 part->info ? part->info->uuid : "");
769 if (is_part0) { 771 if (is_part0) {
770 if (disk->driverfs_dev != NULL && 772 if (disk->driverfs_dev != NULL &&
771 disk->driverfs_dev->driver != NULL) 773 disk->driverfs_dev->driver != NULL)
@@ -829,7 +831,7 @@ static void disk_seqf_stop(struct seq_file *seqf, void *v)
829 831
830static void *show_partition_start(struct seq_file *seqf, loff_t *pos) 832static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
831{ 833{
832 void *p; 834 static void *p;
833 835
834 p = disk_seqf_start(seqf, pos); 836 p = disk_seqf_start(seqf, pos);
835 if (!IS_ERR_OR_NULL(p) && !*pos) 837 if (!IS_ERR_OR_NULL(p) && !*pos)
@@ -845,7 +847,7 @@ static int show_partition(struct seq_file *seqf, void *v)
845 char buf[BDEVNAME_SIZE]; 847 char buf[BDEVNAME_SIZE];
846 848
847 /* Don't show non-partitionable removable devices or empty devices */ 849
848 if (!get_capacity(sgp) || (!disk_max_parts(sgp) && 850 if (!get_capacity(sgp) || (!disk_partitionable(sgp) &&
849 (sgp->flags & GENHD_FL_REMOVABLE))) 851 (sgp->flags & GENHD_FL_REMOVABLE)))
850 return 0; 852 return 0;
851 if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) 853 if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
@@ -856,7 +858,7 @@ static int show_partition(struct seq_file *seqf, void *v)
856 while ((part = disk_part_iter_next(&piter))) 858 while ((part = disk_part_iter_next(&piter)))
857 seq_printf(seqf, "%4d %7d %10llu %s\n", 859 seq_printf(seqf, "%4d %7d %10llu %s\n",
858 MAJOR(part_devt(part)), MINOR(part_devt(part)), 860 MAJOR(part_devt(part)), MINOR(part_devt(part)),
859 (unsigned long long)part_nr_sects_read(part) >> 1, 861 (unsigned long long)part->nr_sects >> 1,
860 disk_name(sgp, part->partno, buf)); 862 disk_name(sgp, part->partno, buf));
861 disk_part_iter_exit(&piter); 863 disk_part_iter_exit(&piter);
862 864
@@ -1103,11 +1105,27 @@ static void disk_release(struct device *dev)
1103 blk_put_queue(disk->queue); 1105 blk_put_queue(disk->queue);
1104 kfree(disk); 1106 kfree(disk);
1105} 1107}
1108
1109static int disk_uevent(struct device *dev, struct kobj_uevent_env *env)
1110{
1111 struct gendisk *disk = dev_to_disk(dev);
1112 struct disk_part_iter piter;
1113 struct hd_struct *part;
1114 int cnt = 0;
1115
1116 disk_part_iter_init(&piter, disk, 0);
1117 while((part = disk_part_iter_next(&piter)))
1118 cnt++;
1119 disk_part_iter_exit(&piter);
1120 add_uevent_var(env, "NPARTS=%u", cnt);
1121 return 0;
1122}
1123
1106struct class block_class = { 1124struct class block_class = {
1107 .name = "block", 1125 .name = "block",
1108}; 1126};
1109 1127
1110static char *block_devnode(struct device *dev, umode_t *mode) 1128static char *block_devnode(struct device *dev, mode_t *mode)
1111{ 1129{
1112 struct gendisk *disk = dev_to_disk(dev); 1130 struct gendisk *disk = dev_to_disk(dev);
1113 1131
@@ -1121,6 +1139,7 @@ static struct device_type disk_type = {
1121 .groups = disk_attr_groups, 1139 .groups = disk_attr_groups,
1122 .release = disk_release, 1140 .release = disk_release,
1123 .devnode = block_devnode, 1141 .devnode = block_devnode,
1142 .uevent = disk_uevent,
1124}; 1143};
1125 1144
1126#ifdef CONFIG_PROC_FS 1145#ifdef CONFIG_PROC_FS
@@ -1239,7 +1258,7 @@ EXPORT_SYMBOL(blk_lookup_devt);
1239 1258
1240struct gendisk *alloc_disk(int minors) 1259struct gendisk *alloc_disk(int minors)
1241{ 1260{
1242 return alloc_disk_node(minors, NUMA_NO_NODE); 1261 return alloc_disk_node(minors, -1);
1243} 1262}
1244EXPORT_SYMBOL(alloc_disk); 1263EXPORT_SYMBOL(alloc_disk);
1245 1264
@@ -1262,16 +1281,6 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
1262 } 1281 }
1263 disk->part_tbl->part[0] = &disk->part0; 1282 disk->part_tbl->part[0] = &disk->part0;
1264 1283
1265 /*
1266 * set_capacity() and get_capacity() currently don't use
1267 * seqcounter to read/update the part0->nr_sects. Still init
1268 * the counter as we can read the sectors in IO submission
1269 * path using sequence counters.
1270 *
1271 * TODO: Ideally set_capacity() and get_capacity() should be
1272 * converted to make use of bd_mutex and sequence counters.
1273 */
1274 seqcount_init(&disk->part0.nr_sects_seq);
1275 hd_ref_init(&disk->part0); 1284 hd_ref_init(&disk->part0);
1276 1285
1277 disk->minors = minors; 1286 disk->minors = minors;
@@ -1484,9 +1493,9 @@ static void __disk_unblock_events(struct gendisk *disk, bool check_now)
1484 intv = disk_events_poll_jiffies(disk); 1493 intv = disk_events_poll_jiffies(disk);
1485 set_timer_slack(&ev->dwork.timer, intv / 4); 1494 set_timer_slack(&ev->dwork.timer, intv / 4);
1486 if (check_now) 1495 if (check_now)
1487 queue_delayed_work(system_freezable_wq, &ev->dwork, 0); 1496 queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
1488 else if (intv) 1497 else if (intv)
1489 queue_delayed_work(system_freezable_wq, &ev->dwork, intv); 1498 queue_delayed_work(system_nrt_wq, &ev->dwork, intv);
1490out_unlock: 1499out_unlock:
1491 spin_unlock_irqrestore(&ev->lock, flags); 1500 spin_unlock_irqrestore(&ev->lock, flags);
1492} 1501}
@@ -1528,8 +1537,10 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask)
1528 1537
1529 spin_lock_irq(&ev->lock); 1538 spin_lock_irq(&ev->lock);
1530 ev->clearing |= mask; 1539 ev->clearing |= mask;
1531 if (!ev->block) 1540 if (!ev->block) {
1532 mod_delayed_work(system_freezable_wq, &ev->dwork, 0); 1541 cancel_delayed_work(&ev->dwork);
1542 queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
1543 }
1533 spin_unlock_irq(&ev->lock); 1544 spin_unlock_irq(&ev->lock);
1534} 1545}
1535 1546
@@ -1565,7 +1576,7 @@ unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
1565 1576
1566 /* unconditionally schedule event check and wait for it to finish */ 1577
1567 disk_block_events(disk); 1578 disk_block_events(disk);
1568 queue_delayed_work(system_freezable_wq, &ev->dwork, 0); 1579 queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
1569 flush_delayed_work(&ev->dwork); 1580 flush_delayed_work(&ev->dwork);
1570 __disk_unblock_events(disk, false); 1581 __disk_unblock_events(disk, false);
1571 1582
@@ -1602,7 +1613,7 @@ static void disk_events_workfn(struct work_struct *work)
1602 1613
1603 intv = disk_events_poll_jiffies(disk); 1614 intv = disk_events_poll_jiffies(disk);
1604 if (!ev->block && intv) 1615 if (!ev->block && intv)
1605 queue_delayed_work(system_freezable_wq, &ev->dwork, intv); 1616 queue_delayed_work(system_nrt_wq, &ev->dwork, intv);
1606 1617
1607 spin_unlock_irq(&ev->lock); 1618 spin_unlock_irq(&ev->lock);
1608 1619
@@ -1740,9 +1751,9 @@ module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
1740 &disk_events_dfl_poll_msecs, 0644); 1751 &disk_events_dfl_poll_msecs, 0644);
1741 1752
1742/* 1753/*
1743 * disk_{alloc|add|del|release}_events - initialize and destroy disk_events. 1754 * disk_{add|del|release}_events - initialize and destroy disk_events.
1744 */ 1755 */
1745static void disk_alloc_events(struct gendisk *disk) 1756static void disk_add_events(struct gendisk *disk)
1746{ 1757{
1747 struct disk_events *ev; 1758 struct disk_events *ev;
1748 1759
@@ -1755,6 +1766,16 @@ static void disk_alloc_events(struct gendisk *disk)
1755 return; 1766 return;
1756 } 1767 }
1757 1768
1769 if (sysfs_create_files(&disk_to_dev(disk)->kobj,
1770 disk_events_attrs) < 0) {
1771 pr_warn("%s: failed to create sysfs files for events\n",
1772 disk->disk_name);
1773 kfree(ev);
1774 return;
1775 }
1776
1777 disk->ev = ev;
1778
1758 INIT_LIST_HEAD(&ev->node); 1779 INIT_LIST_HEAD(&ev->node);
1759 ev->disk = disk; 1780 ev->disk = disk;
1760 spin_lock_init(&ev->lock); 1781 spin_lock_init(&ev->lock);
@@ -1763,21 +1784,8 @@ static void disk_alloc_events(struct gendisk *disk)
1763 ev->poll_msecs = -1; 1784 ev->poll_msecs = -1;
1764 INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); 1785 INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);
1765 1786
1766 disk->ev = ev;
1767}
1768
1769static void disk_add_events(struct gendisk *disk)
1770{
1771 if (!disk->ev)
1772 return;
1773
1774 /* FIXME: error handling */
1775 if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0)
1776 pr_warn("%s: failed to create sysfs files for events\n",
1777 disk->disk_name);
1778
1779 mutex_lock(&disk_events_mutex); 1787 mutex_lock(&disk_events_mutex);
1780 list_add_tail(&disk->ev->node, &disk_events); 1788 list_add_tail(&ev->node, &disk_events);
1781 mutex_unlock(&disk_events_mutex); 1789 mutex_unlock(&disk_events_mutex);
1782 1790
1783 /* 1791 /*
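In genhd.c, show_partition() above is the generator behind /proc/partitions, and the new disk_uevent() callback adds an NPARTS= count to the disk's uevent. A small user-space sketch of the consumer side of show_partition(), parsing the four columns it emits (major, minor, 1 KiB blocks, name); the header row simply fails the sscanf and is skipped:

/* Parse /proc/partitions: "major minor #blocks name", as emitted by show_partition(). */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/partitions", "r");
        char line[256], name[64];
        unsigned int major, minor;
        unsigned long long blocks;

        if (!f) {
                perror("/proc/partitions");
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                /* only data rows match all four fields; header and blank lines don't */
                if (sscanf(line, "%u %u %llu %63s",
                           &major, &minor, &blocks, name) == 4)
                        printf("%s: dev %u:%u, %llu KiB\n",
                               name, major, minor, blocks);
        }
        fclose(f);
        return 0;
}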
diff --git a/block/ioctl.c b/block/ioctl.c
index a31d91d9bc5..1124cd29726 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -1,11 +1,10 @@
1#include <linux/capability.h> 1#include <linux/capability.h>
2#include <linux/blkdev.h> 2#include <linux/blkdev.h>
3#include <linux/export.h>
4#include <linux/gfp.h> 3#include <linux/gfp.h>
5#include <linux/blkpg.h> 4#include <linux/blkpg.h>
6#include <linux/hdreg.h> 5#include <linux/hdreg.h>
7#include <linux/backing-dev.h> 6#include <linux/backing-dev.h>
8#include <linux/fs.h> 7#include <linux/buffer_head.h>
9#include <linux/blktrace_api.h> 8#include <linux/blktrace_api.h>
10#include <asm/uaccess.h> 9#include <asm/uaccess.h>
11 10
@@ -13,7 +12,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
13{ 12{
14 struct block_device *bdevp; 13 struct block_device *bdevp;
15 struct gendisk *disk; 14 struct gendisk *disk;
16 struct hd_struct *part, *lpart; 15 struct hd_struct *part;
17 struct blkpg_ioctl_arg a; 16 struct blkpg_ioctl_arg a;
18 struct blkpg_partition p; 17 struct blkpg_partition p;
19 struct disk_part_iter piter; 18 struct disk_part_iter piter;
@@ -36,12 +35,12 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
36 case BLKPG_ADD_PARTITION: 35 case BLKPG_ADD_PARTITION:
37 start = p.start >> 9; 36 start = p.start >> 9;
38 length = p.length >> 9; 37 length = p.length >> 9;
39 /* check for fit in a hd_struct */ 38 /* check for fit in a hd_struct */
40 if (sizeof(sector_t) == sizeof(long) && 39 if (sizeof(sector_t) == sizeof(long) &&
41 sizeof(long long) > sizeof(long)) { 40 sizeof(long long) > sizeof(long)) {
42 long pstart = start, plength = length; 41 long pstart = start, plength = length;
43 if (pstart != start || plength != length 42 if (pstart != start || plength != length
44 || pstart < 0 || plength < 0 || partno > 65535) 43 || pstart < 0 || plength < 0)
45 return -EINVAL; 44 return -EINVAL;
46 } 45 }
47 46
@@ -92,59 +91,6 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
92 bdput(bdevp); 91 bdput(bdevp);
93 92
94 return 0; 93 return 0;
95 case BLKPG_RESIZE_PARTITION:
96 start = p.start >> 9;
97 /* new length of partition in bytes */
98 length = p.length >> 9;
99 /* check for fit in a hd_struct */
100 if (sizeof(sector_t) == sizeof(long) &&
101 sizeof(long long) > sizeof(long)) {
102 long pstart = start, plength = length;
103 if (pstart != start || plength != length
104 || pstart < 0 || plength < 0)
105 return -EINVAL;
106 }
107 part = disk_get_part(disk, partno);
108 if (!part)
109 return -ENXIO;
110 bdevp = bdget(part_devt(part));
111 if (!bdevp) {
112 disk_put_part(part);
113 return -ENOMEM;
114 }
115 mutex_lock(&bdevp->bd_mutex);
116 mutex_lock_nested(&bdev->bd_mutex, 1);
117 if (start != part->start_sect) {
118 mutex_unlock(&bdevp->bd_mutex);
119 mutex_unlock(&bdev->bd_mutex);
120 bdput(bdevp);
121 disk_put_part(part);
122 return -EINVAL;
123 }
124 /* overlap? */
125 disk_part_iter_init(&piter, disk,
126 DISK_PITER_INCL_EMPTY);
127 while ((lpart = disk_part_iter_next(&piter))) {
128 if (lpart->partno != partno &&
129 !(start + length <= lpart->start_sect ||
130 start >= lpart->start_sect + lpart->nr_sects)
131 ) {
132 disk_part_iter_exit(&piter);
133 mutex_unlock(&bdevp->bd_mutex);
134 mutex_unlock(&bdev->bd_mutex);
135 bdput(bdevp);
136 disk_put_part(part);
137 return -EBUSY;
138 }
139 }
140 disk_part_iter_exit(&piter);
141 part_nr_sects_write(part, (sector_t)length);
142 i_size_write(bdevp->bd_inode, p.length);
143 mutex_unlock(&bdevp->bd_mutex);
144 mutex_unlock(&bdev->bd_mutex);
145 bdput(bdevp);
146 disk_put_part(part);
147 return 0;
148 default: 94 default:
149 return -EINVAL; 95 return -EINVAL;
150 } 96 }
@@ -155,7 +101,7 @@ static int blkdev_reread_part(struct block_device *bdev)
155 struct gendisk *disk = bdev->bd_disk; 101 struct gendisk *disk = bdev->bd_disk;
156 int res; 102 int res;
157 103
158 if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains) 104 if (!disk_partitionable(disk) || bdev != bdev->bd_contains)
159 return -EINVAL; 105 return -EINVAL;
160 if (!capable(CAP_SYS_ADMIN)) 106 if (!capable(CAP_SYS_ADMIN))
161 return -EACCES; 107 return -EACCES;
@@ -185,22 +131,6 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
185 return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); 131 return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags);
186} 132}
187 133
188static int blk_ioctl_zeroout(struct block_device *bdev, uint64_t start,
189 uint64_t len)
190{
191 if (start & 511)
192 return -EINVAL;
193 if (len & 511)
194 return -EINVAL;
195 start >>= 9;
196 len >>= 9;
197
198 if (start + len > (i_size_read(bdev->bd_inode) >> 9))
199 return -EINVAL;
200
201 return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL);
202}
203
204static int put_ushort(unsigned long arg, unsigned short val) 134static int put_ushort(unsigned long arg, unsigned short val)
205{ 135{
206 return put_user(val, (unsigned short __user *)arg); 136 return put_user(val, (unsigned short __user *)arg);
@@ -249,26 +179,6 @@ int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode,
249EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl); 179EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl);
250 180
251/* 181/*
252 * Is it an unrecognized ioctl? The correct returns are either
253 * ENOTTY (final) or ENOIOCTLCMD ("I don't know this one, try a
254 * fallback"). ENOIOCTLCMD gets turned into ENOTTY by the ioctl
255 * code before returning.
256 *
257 * Confused drivers sometimes return EINVAL, which is wrong. It
258 * means "I understood the ioctl command, but the parameters to
259 * it were wrong".
260 *
261 * We should aim to just fix the broken drivers, the EINVAL case
262 * should go away.
263 */
264static inline int is_unrecognized_ioctl(int ret)
265{
266 return ret == -EINVAL ||
267 ret == -ENOTTY ||
268 ret == -ENOIOCTLCMD;
269}
270
271/*
272 * always keep this in sync with compat_blkdev_ioctl() 182 * always keep this in sync with compat_blkdev_ioctl()
273 */ 183 */
274int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, 184int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
@@ -285,7 +195,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
285 return -EACCES; 195 return -EACCES;
286 196
287 ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); 197 ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
288 if (!is_unrecognized_ioctl(ret)) 198 /* -EINVAL to handle old uncorrected drivers */
199 if (ret != -EINVAL && ret != -ENOTTY)
289 return ret; 200 return ret;
290 201
291 fsync_bdev(bdev); 202 fsync_bdev(bdev);
@@ -294,7 +205,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
294 205
295 case BLKROSET: 206 case BLKROSET:
296 ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); 207 ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
297 if (!is_unrecognized_ioctl(ret)) 208 /* -EINVAL to handle old uncorrected drivers */
209 if (ret != -EINVAL && ret != -ENOTTY)
298 return ret; 210 return ret;
299 if (!capable(CAP_SYS_ADMIN)) 211 if (!capable(CAP_SYS_ADMIN))
300 return -EACCES; 212 return -EACCES;
@@ -316,17 +228,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
316 return blk_ioctl_discard(bdev, range[0], range[1], 228 return blk_ioctl_discard(bdev, range[0], range[1],
317 cmd == BLKSECDISCARD); 229 cmd == BLKSECDISCARD);
318 } 230 }
319 case BLKZEROOUT: {
320 uint64_t range[2];
321
322 if (!(mode & FMODE_WRITE))
323 return -EBADF;
324
325 if (copy_from_user(range, (void __user *)arg, sizeof(range)))
326 return -EFAULT;
327
328 return blk_ioctl_zeroout(bdev, range[0], range[1]);
329 }
330 231
331 case HDIO_GETGEO: { 232 case HDIO_GETGEO: {
332 struct hd_geometry geo; 233 struct hd_geometry geo;
@@ -376,8 +277,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
376 return put_uint(arg, bdev_discard_zeroes_data(bdev)); 277 return put_uint(arg, bdev_discard_zeroes_data(bdev));
377 case BLKSECTGET: 278 case BLKSECTGET:
378 return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev))); 279 return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev)));
379 case BLKROTATIONAL:
380 return put_ushort(arg, !blk_queue_nonrot(bdev_get_queue(bdev)));
381 case BLKRASET: 280 case BLKRASET:
382 case BLKFRASET: 281 case BLKFRASET:
383 if(!capable(CAP_SYS_ADMIN)) 282 if(!capable(CAP_SYS_ADMIN))
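blkdev_ioctl() above dispatches the legacy geometry query: HDIO_GETGEO fills a struct hd_geometry through the driver's getgeo method. A hedged user-space sketch of the calling side, assuming /dev/sda exists and is readable:

/* Query legacy CHS geometry via HDIO_GETGEO, as dispatched in blkdev_ioctl(). */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/hdreg.h>

int main(int argc, char **argv)
{
        const char *dev = argc > 1 ? argv[1] : "/dev/sda";      /* assumed default */
        struct hd_geometry geo;
        int fd = open(dev, O_RDONLY);

        if (fd < 0) {
                perror(dev);
                return 1;
        }
        if (ioctl(fd, HDIO_GETGEO, &geo) < 0) {
                perror("HDIO_GETGEO");
                close(fd);
                return 1;
        }
        printf("%s: heads=%u sectors=%u cylinders=%u start=%lu\n",
               dev, geo.heads, geo.sectors, geo.cylinders, geo.start);
        close(fd);
        return 0;
}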
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index 5d1bf70e33d..06389e9ef96 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -59,17 +59,15 @@ noop_latter_request(struct request_queue *q, struct request *rq)
59 return list_entry(rq->queuelist.next, struct request, queuelist); 59 return list_entry(rq->queuelist.next, struct request, queuelist);
60} 60}
61 61
62static int noop_init_queue(struct request_queue *q) 62static void *noop_init_queue(struct request_queue *q)
63{ 63{
64 struct noop_data *nd; 64 struct noop_data *nd;
65 65
66 nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node); 66 nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
67 if (!nd) 67 if (!nd)
68 return -ENOMEM; 68 return NULL;
69
70 INIT_LIST_HEAD(&nd->queue); 69 INIT_LIST_HEAD(&nd->queue);
71 q->elevator->elevator_data = nd; 70 return nd;
72 return 0;
73} 71}
74 72
75static void noop_exit_queue(struct elevator_queue *e) 73static void noop_exit_queue(struct elevator_queue *e)
@@ -96,7 +94,9 @@ static struct elevator_type elevator_noop = {
96 94
97static int __init noop_init(void) 95static int __init noop_init(void)
98{ 96{
99 return elv_register(&elevator_noop); 97 elv_register(&elevator_noop);
98
99 return 0;
100} 100}
101 101
102static void __exit noop_exit(void) 102static void __exit noop_exit(void)
diff --git a/block/partition-generic.c b/block/partition-generic.c
deleted file mode 100644
index f1d14519cc0..00000000000
--- a/block/partition-generic.c
+++ /dev/null
@@ -1,571 +0,0 @@
1/*
2 * Code extracted from drivers/block/genhd.c
3 * Copyright (C) 1991-1998 Linus Torvalds
4 * Re-organised Feb 1998 Russell King
5 *
6 * We now have independent partition support from the
7 * block drivers, which allows all the partition code to
8 * be grouped in one location, and it to be mostly self
9 * contained.
10 */
11
12#include <linux/init.h>
13#include <linux/module.h>
14#include <linux/fs.h>
15#include <linux/slab.h>
16#include <linux/kmod.h>
17#include <linux/ctype.h>
18#include <linux/genhd.h>
19#include <linux/blktrace_api.h>
20
21#include "partitions/check.h"
22
23#ifdef CONFIG_BLK_DEV_MD
24extern void md_autodetect_dev(dev_t dev);
25#endif
26
27/*
28 * disk_name() is used by partition check code and the genhd driver.
29 * It formats the devicename of the indicated disk into
30 * the supplied buffer (of size at least 32), and returns
31 * a pointer to that same buffer (for convenience).
32 */
33
34char *disk_name(struct gendisk *hd, int partno, char *buf)
35{
36 if (!partno)
37 snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
38 else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
39 snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
40 else
41 snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);
42
43 return buf;
44}
45
46const char *bdevname(struct block_device *bdev, char *buf)
47{
48 return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf);
49}
50
51EXPORT_SYMBOL(bdevname);
52
53/*
54 * There's very little reason to use this, you should really
55 * have a struct block_device just about everywhere and use
56 * bdevname() instead.
57 */
58const char *__bdevname(dev_t dev, char *buffer)
59{
60 scnprintf(buffer, BDEVNAME_SIZE, "unknown-block(%u,%u)",
61 MAJOR(dev), MINOR(dev));
62 return buffer;
63}
64
65EXPORT_SYMBOL(__bdevname);
66
67static ssize_t part_partition_show(struct device *dev,
68 struct device_attribute *attr, char *buf)
69{
70 struct hd_struct *p = dev_to_part(dev);
71
72 return sprintf(buf, "%d\n", p->partno);
73}
74
75static ssize_t part_start_show(struct device *dev,
76 struct device_attribute *attr, char *buf)
77{
78 struct hd_struct *p = dev_to_part(dev);
79
80 return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect);
81}
82
83ssize_t part_size_show(struct device *dev,
84 struct device_attribute *attr, char *buf)
85{
86 struct hd_struct *p = dev_to_part(dev);
87 return sprintf(buf, "%llu\n",(unsigned long long)part_nr_sects_read(p));
88}
89
90static ssize_t part_ro_show(struct device *dev,
91 struct device_attribute *attr, char *buf)
92{
93 struct hd_struct *p = dev_to_part(dev);
94 return sprintf(buf, "%d\n", p->policy ? 1 : 0);
95}
96
97static ssize_t part_alignment_offset_show(struct device *dev,
98 struct device_attribute *attr, char *buf)
99{
100 struct hd_struct *p = dev_to_part(dev);
101 return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset);
102}
103
104static ssize_t part_discard_alignment_show(struct device *dev,
105 struct device_attribute *attr, char *buf)
106{
107 struct hd_struct *p = dev_to_part(dev);
108 return sprintf(buf, "%u\n", p->discard_alignment);
109}
110
111ssize_t part_stat_show(struct device *dev,
112 struct device_attribute *attr, char *buf)
113{
114 struct hd_struct *p = dev_to_part(dev);
115 int cpu;
116
117 cpu = part_stat_lock();
118 part_round_stats(cpu, p);
119 part_stat_unlock();
120 return sprintf(buf,
121 "%8lu %8lu %8llu %8u "
122 "%8lu %8lu %8llu %8u "
123 "%8u %8u %8u"
124 "\n",
125 part_stat_read(p, ios[READ]),
126 part_stat_read(p, merges[READ]),
127 (unsigned long long)part_stat_read(p, sectors[READ]),
128 jiffies_to_msecs(part_stat_read(p, ticks[READ])),
129 part_stat_read(p, ios[WRITE]),
130 part_stat_read(p, merges[WRITE]),
131 (unsigned long long)part_stat_read(p, sectors[WRITE]),
132 jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
133 part_in_flight(p),
134 jiffies_to_msecs(part_stat_read(p, io_ticks)),
135 jiffies_to_msecs(part_stat_read(p, time_in_queue)));
136}
137
138ssize_t part_inflight_show(struct device *dev,
139 struct device_attribute *attr, char *buf)
140{
141 struct hd_struct *p = dev_to_part(dev);
142
143 return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]),
144 atomic_read(&p->in_flight[1]));
145}
146
147#ifdef CONFIG_FAIL_MAKE_REQUEST
148ssize_t part_fail_show(struct device *dev,
149 struct device_attribute *attr, char *buf)
150{
151 struct hd_struct *p = dev_to_part(dev);
152
153 return sprintf(buf, "%d\n", p->make_it_fail);
154}
155
156ssize_t part_fail_store(struct device *dev,
157 struct device_attribute *attr,
158 const char *buf, size_t count)
159{
160 struct hd_struct *p = dev_to_part(dev);
161 int i;
162
163 if (count > 0 && sscanf(buf, "%d", &i) > 0)
164 p->make_it_fail = (i == 0) ? 0 : 1;
165
166 return count;
167}
168#endif
169
170static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
171static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
172static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
173static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL);
174static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
175static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
176 NULL);
177static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
178static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
179#ifdef CONFIG_FAIL_MAKE_REQUEST
180static struct device_attribute dev_attr_fail =
181 __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
182#endif
183
184static struct attribute *part_attrs[] = {
185 &dev_attr_partition.attr,
186 &dev_attr_start.attr,
187 &dev_attr_size.attr,
188 &dev_attr_ro.attr,
189 &dev_attr_alignment_offset.attr,
190 &dev_attr_discard_alignment.attr,
191 &dev_attr_stat.attr,
192 &dev_attr_inflight.attr,
193#ifdef CONFIG_FAIL_MAKE_REQUEST
194 &dev_attr_fail.attr,
195#endif
196 NULL
197};
198
199static struct attribute_group part_attr_group = {
200 .attrs = part_attrs,
201};
202
203static const struct attribute_group *part_attr_groups[] = {
204 &part_attr_group,
205#ifdef CONFIG_BLK_DEV_IO_TRACE
206 &blk_trace_attr_group,
207#endif
208 NULL
209};
210
211static void part_release(struct device *dev)
212{
213 struct hd_struct *p = dev_to_part(dev);
214 free_part_stats(p);
215 free_part_info(p);
216 kfree(p);
217}
218
219struct device_type part_type = {
220 .name = "partition",
221 .groups = part_attr_groups,
222 .release = part_release,
223};
224
225static void delete_partition_rcu_cb(struct rcu_head *head)
226{
227 struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
228
229 part->start_sect = 0;
230 part->nr_sects = 0;
231 part_stat_set_all(part, 0);
232 put_device(part_to_dev(part));
233}
234
235void __delete_partition(struct hd_struct *part)
236{
237 call_rcu(&part->rcu_head, delete_partition_rcu_cb);
238}
239
240void delete_partition(struct gendisk *disk, int partno)
241{
242 struct disk_part_tbl *ptbl = disk->part_tbl;
243 struct hd_struct *part;
244
245 if (partno >= ptbl->len)
246 return;
247
248 part = ptbl->part[partno];
249 if (!part)
250 return;
251
252 blk_free_devt(part_devt(part));
253 rcu_assign_pointer(ptbl->part[partno], NULL);
254 rcu_assign_pointer(ptbl->last_lookup, NULL);
255 kobject_put(part->holder_dir);
256 device_del(part_to_dev(part));
257
258 hd_struct_put(part);
259}
260
261static ssize_t whole_disk_show(struct device *dev,
262 struct device_attribute *attr, char *buf)
263{
264 return 0;
265}
266static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
267 whole_disk_show, NULL);
268
269struct hd_struct *add_partition(struct gendisk *disk, int partno,
270 sector_t start, sector_t len, int flags,
271 struct partition_meta_info *info)
272{
273 struct hd_struct *p;
274 dev_t devt = MKDEV(0, 0);
275 struct device *ddev = disk_to_dev(disk);
276 struct device *pdev;
277 struct disk_part_tbl *ptbl;
278 const char *dname;
279 int err;
280
281 err = disk_expand_part_tbl(disk, partno);
282 if (err)
283 return ERR_PTR(err);
284 ptbl = disk->part_tbl;
285
286 if (ptbl->part[partno])
287 return ERR_PTR(-EBUSY);
288
289 p = kzalloc(sizeof(*p), GFP_KERNEL);
290 if (!p)
291 return ERR_PTR(-EBUSY);
292
293 if (!init_part_stats(p)) {
294 err = -ENOMEM;
295 goto out_free;
296 }
297
298 seqcount_init(&p->nr_sects_seq);
299 pdev = part_to_dev(p);
300
301 p->start_sect = start;
302 p->alignment_offset =
303 queue_limit_alignment_offset(&disk->queue->limits, start);
304 p->discard_alignment =
305 queue_limit_discard_alignment(&disk->queue->limits, start);
306 p->nr_sects = len;
307 p->partno = partno;
308 p->policy = get_disk_ro(disk);
309
310 if (info) {
311 struct partition_meta_info *pinfo = alloc_part_info(disk);
312 if (!pinfo)
313 goto out_free_stats;
314 memcpy(pinfo, info, sizeof(*info));
315 p->info = pinfo;
316 }
317
318 dname = dev_name(ddev);
319 if (isdigit(dname[strlen(dname) - 1]))
320 dev_set_name(pdev, "%sp%d", dname, partno);
321 else
322 dev_set_name(pdev, "%s%d", dname, partno);
323
324 device_initialize(pdev);
325 pdev->class = &block_class;
326 pdev->type = &part_type;
327 pdev->parent = ddev;
328
329 err = blk_alloc_devt(p, &devt);
330 if (err)
331 goto out_free_info;
332 pdev->devt = devt;
333
334 /* delay uevent until 'holders' subdir is created */
335 dev_set_uevent_suppress(pdev, 1);
336 err = device_add(pdev);
337 if (err)
338 goto out_put;
339
340 err = -ENOMEM;
341 p->holder_dir = kobject_create_and_add("holders", &pdev->kobj);
342 if (!p->holder_dir)
343 goto out_del;
344
345 dev_set_uevent_suppress(pdev, 0);
346 if (flags & ADDPART_FLAG_WHOLEDISK) {
347 err = device_create_file(pdev, &dev_attr_whole_disk);
348 if (err)
349 goto out_del;
350 }
351
352 /* everything is up and running, commence */
353 rcu_assign_pointer(ptbl->part[partno], p);
354
355 /* suppress uevent if the disk suppresses it */
356 if (!dev_get_uevent_suppress(ddev))
357 kobject_uevent(&pdev->kobj, KOBJ_ADD);
358
359 hd_ref_init(p);
360 return p;
361
362out_free_info:
363 free_part_info(p);
364out_free_stats:
365 free_part_stats(p);
366out_free:
367 kfree(p);
368 return ERR_PTR(err);
369out_del:
370 kobject_put(p->holder_dir);
371 device_del(pdev);
372out_put:
373 put_device(pdev);
374 blk_free_devt(devt);
375 return ERR_PTR(err);
376}
377
378static bool disk_unlock_native_capacity(struct gendisk *disk)
379{
380 const struct block_device_operations *bdops = disk->fops;
381
382 if (bdops->unlock_native_capacity &&
383 !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
384 printk(KERN_CONT "enabling native capacity\n");
385 bdops->unlock_native_capacity(disk);
386 disk->flags |= GENHD_FL_NATIVE_CAPACITY;
387 return true;
388 } else {
389 printk(KERN_CONT "truncated\n");
390 return false;
391 }
392}
393
394static int drop_partitions(struct gendisk *disk, struct block_device *bdev)
395{
396 struct disk_part_iter piter;
397 struct hd_struct *part;
398 int res;
399
400 if (bdev->bd_part_count)
401 return -EBUSY;
402 res = invalidate_partition(disk, 0);
403 if (res)
404 return res;
405
406 disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
407 while ((part = disk_part_iter_next(&piter)))
408 delete_partition(disk, part->partno);
409 disk_part_iter_exit(&piter);
410
411 return 0;
412}
413
414int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
415{
416 struct parsed_partitions *state = NULL;
417 struct hd_struct *part;
418 int p, highest, res;
419rescan:
420 if (state && !IS_ERR(state)) {
421 kfree(state);
422 state = NULL;
423 }
424
425 res = drop_partitions(disk, bdev);
426 if (res)
427 return res;
428
429 if (disk->fops->revalidate_disk)
430 disk->fops->revalidate_disk(disk);
431 check_disk_size_change(disk, bdev);
432 bdev->bd_invalidated = 0;
433 if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
434 return 0;
435 if (IS_ERR(state)) {
436 /*
437 * I/O error reading the partition table. If any
438 * partition code tried to read beyond EOD, retry
439 * after unlocking native capacity.
440 */
441 if (PTR_ERR(state) == -ENOSPC) {
442 printk(KERN_WARNING "%s: partition table beyond EOD, ",
443 disk->disk_name);
444 if (disk_unlock_native_capacity(disk))
445 goto rescan;
446 }
447 return -EIO;
448 }
449 /*
450 * If any partition code tried to read beyond EOD, try
451 * unlocking native capacity even if partition table is
452 * successfully read as we could be missing some partitions.
453 */
454 if (state->access_beyond_eod) {
455 printk(KERN_WARNING
456 "%s: partition table partially beyond EOD, ",
457 disk->disk_name);
458 if (disk_unlock_native_capacity(disk))
459 goto rescan;
460 }
461
462 /* tell userspace that the media / partition table may have changed */
463 kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
464
465 /* Detect the highest partition number and preallocate
466 * disk->part_tbl. This is an optimization and not strictly
467 * necessary.
468 */
469 for (p = 1, highest = 0; p < state->limit; p++)
470 if (state->parts[p].size)
471 highest = p;
472
473 disk_expand_part_tbl(disk, highest);
474
475 /* add partitions */
476 for (p = 1; p < state->limit; p++) {
477 sector_t size, from;
478 struct partition_meta_info *info = NULL;
479
480 size = state->parts[p].size;
481 if (!size)
482 continue;
483
484 from = state->parts[p].from;
485 if (from >= get_capacity(disk)) {
486 printk(KERN_WARNING
487 "%s: p%d start %llu is beyond EOD, ",
488 disk->disk_name, p, (unsigned long long) from);
489 if (disk_unlock_native_capacity(disk))
490 goto rescan;
491 continue;
492 }
493
494 if (from + size > get_capacity(disk)) {
495 printk(KERN_WARNING
496 "%s: p%d size %llu extends beyond EOD, ",
497 disk->disk_name, p, (unsigned long long) size);
498
499 if (disk_unlock_native_capacity(disk)) {
500 /* free state and restart */
501 goto rescan;
502 } else {
503 /*
504 * we can not ignore partitions of broken tables
505 * created by for example camera firmware, but
506 * we limit them to the end of the disk to avoid
507 * creating invalid block devices
508 */
509 size = get_capacity(disk) - from;
510 }
511 }
512
513 if (state->parts[p].has_info)
514 info = &state->parts[p].info;
515 part = add_partition(disk, p, from, size,
516 state->parts[p].flags,
517 &state->parts[p].info);
518 if (IS_ERR(part)) {
519 printk(KERN_ERR " %s: p%d could not be added: %ld\n",
520 disk->disk_name, p, -PTR_ERR(part));
521 continue;
522 }
523#ifdef CONFIG_BLK_DEV_MD
524 if (state->parts[p].flags & ADDPART_FLAG_RAID)
525 md_autodetect_dev(part_to_dev(part)->devt);
526#endif
527 }
528 kfree(state);
529 return 0;
530}
531
532int invalidate_partitions(struct gendisk *disk, struct block_device *bdev)
533{
534 int res;
535
536 if (!bdev->bd_invalidated)
537 return 0;
538
539 res = drop_partitions(disk, bdev);
540 if (res)
541 return res;
542
543 set_capacity(disk, 0);
544 check_disk_size_change(disk, bdev);
545 bdev->bd_invalidated = 0;
546 /* tell userspace that the media / partition table may have changed */
547 kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
548
549 return 0;
550}
551
552unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
553{
554 struct address_space *mapping = bdev->bd_inode->i_mapping;
555 struct page *page;
556
557 page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
558 NULL);
559 if (!IS_ERR(page)) {
560 if (PageError(page))
561 goto fail;
562 p->v = page;
563 return (unsigned char *)page_address(page) + ((n & ((1 << (PAGE_CACHE_SHIFT - 9)) - 1)) << 9);
564fail:
565 page_cache_release(page);
566 }
567 p->v = NULL;
568 return NULL;
569}
570
571EXPORT_SYMBOL(read_dev_sector);
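add_partition() and delete_partition() in the file above are the kernel half of the BLKPG ioctl handled earlier in blkpg_ioctl(); note that start and length arrive in bytes and are shifted down to 512-byte sectors. A user-space sketch of the caller side, where the device node and the offsets are made-up example values and the call requires CAP_SYS_ADMIN:

/* Add partition 1 on a whole-disk device via BLKPG, matching blkpg_ioctl(). */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/blkpg.h>

int main(void)
{
        struct blkpg_partition part;
        struct blkpg_ioctl_arg arg;
        int fd = open("/dev/sdb", O_RDONLY);    /* example device, adjust */

        if (fd < 0) {
                perror("/dev/sdb");
                return 1;
        }
        memset(&part, 0, sizeof(part));
        part.pno = 1;                           /* partition number */
        part.start = 1ULL * 1024 * 1024;        /* byte offset, example value */
        part.length = 64ULL * 1024 * 1024;      /* byte length, example value */

        memset(&arg, 0, sizeof(arg));
        arg.op = BLKPG_ADD_PARTITION;
        arg.datalen = sizeof(part);
        arg.data = &part;

        if (ioctl(fd, BLKPG, &arg) < 0)
                perror("BLKPG_ADD_PARTITION");
        close(fd);
        return 0;
}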
diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig
deleted file mode 100644
index 75a54e1adbb..00000000000
--- a/block/partitions/Kconfig
+++ /dev/null
@@ -1,251 +0,0 @@
1#
2# Partition configuration
3#
4config PARTITION_ADVANCED
5 bool "Advanced partition selection"
6 help
7 Say Y here if you would like to use hard disks under Linux which
8 were partitioned under an operating system running on a different
9 architecture than your Linux system.
10
11 Note that the answer to this question won't directly affect the
12 kernel: saying N will just cause the configurator to skip all
13 the questions about foreign partitioning schemes.
14
15 If unsure, say N.
16
17config ACORN_PARTITION
18 bool "Acorn partition support" if PARTITION_ADVANCED
19 default y if ARCH_ACORN
20 help
21 Support hard disks partitioned under Acorn operating systems.
22
23config ACORN_PARTITION_CUMANA
24 bool "Cumana partition support" if PARTITION_ADVANCED
25 default y if ARCH_ACORN
26 depends on ACORN_PARTITION
27 help
28 Say Y here if you would like to use hard disks under Linux which
29 were partitioned using the Cumana interface on Acorn machines.
30
31config ACORN_PARTITION_EESOX
32 bool "EESOX partition support" if PARTITION_ADVANCED
33 default y if ARCH_ACORN
34 depends on ACORN_PARTITION
35
36config ACORN_PARTITION_ICS
37 bool "ICS partition support" if PARTITION_ADVANCED
38 default y if ARCH_ACORN
39 depends on ACORN_PARTITION
40 help
41 Say Y here if you would like to use hard disks under Linux which
42 were partitioned using the ICS interface on Acorn machines.
43
44config ACORN_PARTITION_ADFS
45 bool "Native filecore partition support" if PARTITION_ADVANCED
46 default y if ARCH_ACORN
47 depends on ACORN_PARTITION
48 help
49 The Acorn Disc Filing System is the standard file system of the
50 RiscOS operating system which runs on Acorn's ARM-based Risc PC
51 systems and the Acorn Archimedes range of machines. If you say
52 `Y' here, Linux will support disk partitions created under ADFS.
53
54config ACORN_PARTITION_POWERTEC
55 bool "PowerTec partition support" if PARTITION_ADVANCED
56 default y if ARCH_ACORN
57 depends on ACORN_PARTITION
58 help
59 Support reading partition tables created on Acorn machines using
60 the PowerTec SCSI drive.
61
62config ACORN_PARTITION_RISCIX
63 bool "RISCiX partition support" if PARTITION_ADVANCED
64 default y if ARCH_ACORN
65 depends on ACORN_PARTITION
66 help
67 Once upon a time, there was a native Unix port for the Acorn series
68 of machines called RISCiX. If you say 'Y' here, Linux will be able
69 to read disks partitioned under RISCiX.
70
71config OSF_PARTITION
72 bool "Alpha OSF partition support" if PARTITION_ADVANCED
73 default y if ALPHA
74 help
75 Say Y here if you would like to use hard disks under Linux which
76 were partitioned on an Alpha machine.
77
78config AMIGA_PARTITION
79 bool "Amiga partition table support" if PARTITION_ADVANCED
80 default y if (AMIGA || AFFS_FS=y)
81 help
82 Say Y here if you would like to use hard disks under Linux which
83 were partitioned under AmigaOS.
84
85config ATARI_PARTITION
86 bool "Atari partition table support" if PARTITION_ADVANCED
87 default y if ATARI
88 help
89 Say Y here if you would like to use hard disks under Linux which
90 were partitioned under the Atari OS.
91
92config IBM_PARTITION
93 bool "IBM disk label and partition support"
94 depends on PARTITION_ADVANCED && S390
95 help
96 Say Y here if you would like to be able to read the hard disk
97 partition table format used by IBM DASD disks operating under CMS.
98 Otherwise, say N.
99
100config MAC_PARTITION
101 bool "Macintosh partition map support" if PARTITION_ADVANCED
102 default y if (MAC || PPC_PMAC)
103 help
104 Say Y here if you would like to use hard disks under Linux which
105 were partitioned on a Macintosh.
106
107config MSDOS_PARTITION
108 bool "PC BIOS (MSDOS partition tables) support" if PARTITION_ADVANCED
109 default y
110 help
111 Say Y here.
112
113config BSD_DISKLABEL
114 bool "BSD disklabel (FreeBSD partition tables) support"
115 depends on PARTITION_ADVANCED && MSDOS_PARTITION
116 help
117 FreeBSD uses its own hard disk partition scheme on your PC. It
118 requires only one entry in the primary partition table of your disk
119 and manages it similarly to DOS extended partitions, putting in its
120 first sector a new partition table in BSD disklabel format. Saying Y
121 here allows you to read these disklabels and further mount FreeBSD
122 partitions from within Linux if you have also said Y to "UFS
123 file system support", above. If you don't know what all this is
124 about, say N.
125
126config MINIX_SUBPARTITION
127 bool "Minix subpartition support"
128 depends on PARTITION_ADVANCED && MSDOS_PARTITION
129 help
130 Minix 2.0.0/2.0.2 subpartition table support for Linux.
131 Say Y here if you want to mount and use Minix 2.0.0/2.0.2
132 subpartitions.
133
134config SOLARIS_X86_PARTITION
135 bool "Solaris (x86) partition table support"
136 depends on PARTITION_ADVANCED && MSDOS_PARTITION
137 help
138 Like most systems, Solaris x86 uses its own hard disk partition
139 table format, incompatible with all others. Saying Y here allows you
140 to read these partition tables and further mount Solaris x86
141 partitions from within Linux if you have also said Y to "UFS
142 file system support", above.
143
144config UNIXWARE_DISKLABEL
145 bool "Unixware slices support"
146 depends on PARTITION_ADVANCED && MSDOS_PARTITION
147 ---help---
148 Like some systems, UnixWare uses its own slice table inside a
149 partition (VTOC - Virtual Table of Contents). Its format is
150 incompatible with all other OSes. Saying Y here allows you to read
151 VTOC and further mount UnixWare partitions read-only from within
152 Linux if you have also said Y to "UFS file system support" or
153 "System V and Coherent file system support", above.
154
155 This is mainly used to carry data from a UnixWare box to your
156 Linux box via a removable medium like magneto-optical, ZIP or
157 removable IDE drives. Note, however, that a good portable way to
158 transport files and directories between unixes (and even other
159 operating systems) is given by the tar program ("man tar" or
160 preferably "info tar").
161
162 If you don't know what all this is about, say N.
163
164config LDM_PARTITION
165 bool "Windows Logical Disk Manager (Dynamic Disk) support"
166 depends on PARTITION_ADVANCED
167 ---help---
168 Say Y here if you would like to use hard disks under Linux which
169 were partitioned using Windows 2000's/XP's or Vista's Logical Disk
170 Manager. They are also known as "Dynamic Disks".
171
172 Note this driver only supports Dynamic Disks with a protective MBR
173 label, i.e. DOS partition table. It does not support GPT labelled
174 Dynamic Disks yet as can be created with Vista.
175
176 Windows 2000 introduced the concept of Dynamic Disks to get around
177 the limitations of the PC's partitioning scheme. The Logical Disk
178 Manager allows the user to repartition a disk and create spanned,
179 mirrored, striped or RAID volumes, all without the need for
180 rebooting.
181
182 Normal partitions are now called Basic Disks under Windows 2000, XP,
183 and Vista.
184
185 For a fuller description read <file:Documentation/ldm.txt>.
186
187 If unsure, say N.
188
189config LDM_DEBUG
190 bool "Windows LDM extra logging"
191 depends on LDM_PARTITION
192 help
193 Say Y here if you would like LDM to log verbosely. This could be
194 helpful if the driver doesn't work as expected and you'd like to
195 report a bug.
196
197 If unsure, say N.
198
199config SGI_PARTITION
200 bool "SGI partition support" if PARTITION_ADVANCED
201 default y if DEFAULT_SGI_PARTITION
202 help
203 Say Y here if you would like to be able to read the hard disk
204 partition table format used by SGI machines.
205
206config ULTRIX_PARTITION
207 bool "Ultrix partition table support" if PARTITION_ADVANCED
208 default y if MACH_DECSTATION
209 help
210 Say Y here if you would like to be able to read the hard disk
211 partition table format used by DEC (now Compaq) Ultrix machines.
212 Otherwise, say N.
213
214config SUN_PARTITION
215 bool "Sun partition tables support" if PARTITION_ADVANCED
216 default y if (SPARC || SUN3 || SUN3X)
217 ---help---
218 Like most systems, SunOS uses its own hard disk partition table
219 format, incompatible with all others. Saying Y here allows you to
220 read these partition tables and further mount SunOS partitions from
221 within Linux if you have also said Y to "UFS file system support",
222 above. This is mainly used to carry data from a SPARC under SunOS to
223 your Linux box via a removable medium like magneto-optical or ZIP
224 drives; note however that a good portable way to transport files and
225 directories between unixes (and even other operating systems) is
226 given by the tar program ("man tar" or preferably "info tar"). If
227 you don't know what all this is about, say N.
228
229config KARMA_PARTITION
230 bool "Karma Partition support"
231 depends on PARTITION_ADVANCED
232 help
233 Say Y here if you would like to mount the Rio Karma MP3 player, as it
234 uses a proprietary partition table.
235
236config EFI_PARTITION
237 bool "EFI GUID Partition support" if PARTITION_ADVANCED
238 default y
239 select CRC32
240 help
241 Say Y here if you would like to use hard disks under Linux which
242 were partitioned using EFI GPT.
243
244config SYSV68_PARTITION
245 bool "SYSV68 partition table support" if PARTITION_ADVANCED
246 default y if VME
247 help
248 Say Y here if you would like to be able to read the hard disk
249 partition table format used by Motorola Delta machines (using
250 sysv68).
251 Otherwise, say N.
diff --git a/block/partitions/Makefile b/block/partitions/Makefile
deleted file mode 100644
index 03af8eac51d..00000000000
--- a/block/partitions/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
1#
2# Makefile for the linux kernel.
3#
4
5obj-$(CONFIG_BLOCK) := check.o
6
7obj-$(CONFIG_ACORN_PARTITION) += acorn.o
8obj-$(CONFIG_AMIGA_PARTITION) += amiga.o
9obj-$(CONFIG_ATARI_PARTITION) += atari.o
10obj-$(CONFIG_MAC_PARTITION) += mac.o
11obj-$(CONFIG_LDM_PARTITION) += ldm.o
12obj-$(CONFIG_MSDOS_PARTITION) += msdos.o
13obj-$(CONFIG_OSF_PARTITION) += osf.o
14obj-$(CONFIG_SGI_PARTITION) += sgi.o
15obj-$(CONFIG_SUN_PARTITION) += sun.o
16obj-$(CONFIG_ULTRIX_PARTITION) += ultrix.o
17obj-$(CONFIG_IBM_PARTITION) += ibm.o
18obj-$(CONFIG_EFI_PARTITION) += efi.o
19obj-$(CONFIG_KARMA_PARTITION) += karma.o
20obj-$(CONFIG_SYSV68_PARTITION) += sysv68.o
diff --git a/block/partitions/acorn.c b/block/partitions/acorn.c
deleted file mode 100644
index fbeb697374d..00000000000
--- a/block/partitions/acorn.c
+++ /dev/null
@@ -1,556 +0,0 @@
1/*
2 * linux/fs/partitions/acorn.c
3 *
4 * Copyright (c) 1996-2000 Russell King.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * Scan ADFS partitions on hard disk drives. Unfortunately, there
11 * isn't a standard for partitioning drives on Acorn machines, so
12 * every single manufacturer of SCSI and IDE cards created their own
13 * method.
14 */
15#include <linux/buffer_head.h>
16#include <linux/adfs_fs.h>
17
18#include "check.h"
19#include "acorn.h"
20
21/*
22 * Partition types. (Oh for reusability)
23 */
24#define PARTITION_RISCIX_MFM 1
25#define PARTITION_RISCIX_SCSI 2
26#define PARTITION_LINUX 9
27
28#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
29 defined(CONFIG_ACORN_PARTITION_ADFS)
30static struct adfs_discrecord *
31adfs_partition(struct parsed_partitions *state, char *name, char *data,
32 unsigned long first_sector, int slot)
33{
34 struct adfs_discrecord *dr;
35 unsigned int nr_sects;
36
37 if (adfs_checkbblk(data))
38 return NULL;
39
40 dr = (struct adfs_discrecord *)(data + 0x1c0);
41
42 if (dr->disc_size == 0 && dr->disc_size_high == 0)
43 return NULL;
44
45 nr_sects = (le32_to_cpu(dr->disc_size_high) << 23) |
46 (le32_to_cpu(dr->disc_size) >> 9);
47
48 if (name) {
49 strlcat(state->pp_buf, " [", PAGE_SIZE);
50 strlcat(state->pp_buf, name, PAGE_SIZE);
51 strlcat(state->pp_buf, "]", PAGE_SIZE);
52 }
53 put_partition(state, slot, first_sector, nr_sects);
54 return dr;
55}
56#endif
57
58#ifdef CONFIG_ACORN_PARTITION_RISCIX
59
60struct riscix_part {
61 __le32 start;
62 __le32 length;
63 __le32 one;
64 char name[16];
65};
66
67struct riscix_record {
68 __le32 magic;
69#define RISCIX_MAGIC cpu_to_le32(0x4a657320)
70 __le32 date;
71 struct riscix_part part[8];
72};
73
74#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
75 defined(CONFIG_ACORN_PARTITION_ADFS)
76static int riscix_partition(struct parsed_partitions *state,
77 unsigned long first_sect, int slot,
78 unsigned long nr_sects)
79{
80 Sector sect;
81 struct riscix_record *rr;
82
83 rr = read_part_sector(state, first_sect, &sect);
84 if (!rr)
85 return -1;
86
87 strlcat(state->pp_buf, " [RISCiX]", PAGE_SIZE);
88
89
90 if (rr->magic == RISCIX_MAGIC) {
91 unsigned long size = nr_sects > 2 ? 2 : nr_sects;
92 int part;
93
94 strlcat(state->pp_buf, " <", PAGE_SIZE);
95
96 put_partition(state, slot++, first_sect, size);
97 for (part = 0; part < 8; part++) {
98 if (rr->part[part].one &&
99 memcmp(rr->part[part].name, "All\0", 4)) {
100 put_partition(state, slot++,
101 le32_to_cpu(rr->part[part].start),
102 le32_to_cpu(rr->part[part].length));
103 strlcat(state->pp_buf, "(", PAGE_SIZE);
104 strlcat(state->pp_buf, rr->part[part].name, PAGE_SIZE);
105 strlcat(state->pp_buf, ")", PAGE_SIZE);
106 }
107 }
108
109 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
110 } else {
111 put_partition(state, slot++, first_sect, nr_sects);
112 }
113
114 put_dev_sector(sect);
115 return slot;
116}
117#endif
118#endif
119
120#define LINUX_NATIVE_MAGIC 0xdeafa1de
121#define LINUX_SWAP_MAGIC 0xdeafab1e
122
123struct linux_part {
124 __le32 magic;
125 __le32 start_sect;
126 __le32 nr_sects;
127};
128
129#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
130 defined(CONFIG_ACORN_PARTITION_ADFS)
131static int linux_partition(struct parsed_partitions *state,
132 unsigned long first_sect, int slot,
133 unsigned long nr_sects)
134{
135 Sector sect;
136 struct linux_part *linuxp;
137 unsigned long size = nr_sects > 2 ? 2 : nr_sects;
138
139 strlcat(state->pp_buf, " [Linux]", PAGE_SIZE);
140
141 put_partition(state, slot++, first_sect, size);
142
143 linuxp = read_part_sector(state, first_sect, &sect);
144 if (!linuxp)
145 return -1;
146
147 strlcat(state->pp_buf, " <", PAGE_SIZE);
148 while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) ||
149 linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) {
150 if (slot == state->limit)
151 break;
152 put_partition(state, slot++, first_sect +
153 le32_to_cpu(linuxp->start_sect),
154 le32_to_cpu(linuxp->nr_sects));
155 linuxp ++;
156 }
157 strlcat(state->pp_buf, " >", PAGE_SIZE);
158
159 put_dev_sector(sect);
160 return slot;
161}
162#endif
163
164#ifdef CONFIG_ACORN_PARTITION_CUMANA
165int adfspart_check_CUMANA(struct parsed_partitions *state)
166{
167 unsigned long first_sector = 0;
168 unsigned int start_blk = 0;
169 Sector sect;
170 unsigned char *data;
171 char *name = "CUMANA/ADFS";
172 int first = 1;
173 int slot = 1;
174
175 /*
176 * Try Cumana style partitions - sector 6 contains ADFS boot block
177 * with pointer to next 'drive'.
178 *
179 * There are unknowns in this code - is the 'cylinder number' of the
180 * next partition relative to the start of this one? I'm assuming
181 * it is.
182 *
183 * Also, which ID did Cumana use?
184 *
185 * This is totally unfinished, and will require more work to get it
186 * going. Hence it is totally untested.
187 */
188 do {
189 struct adfs_discrecord *dr;
190 unsigned int nr_sects;
191
192 data = read_part_sector(state, start_blk * 2 + 6, &sect);
193 if (!data)
194 return -1;
195
196 if (slot == state->limit)
197 break;
198
199 dr = adfs_partition(state, name, data, first_sector, slot++);
200 if (!dr)
201 break;
202
203 name = NULL;
204
205 nr_sects = (data[0x1fd] + (data[0x1fe] << 8)) *
206 (dr->heads + (dr->lowsector & 0x40 ? 1 : 0)) *
207 dr->secspertrack;
208
209 if (!nr_sects)
210 break;
211
212 first = 0;
213 first_sector += nr_sects;
214 start_blk += nr_sects >> (BLOCK_SIZE_BITS - 9);
215 nr_sects = 0; /* hmm - should be partition size */
216
217 switch (data[0x1fc] & 15) {
218 case 0: /* No partition / ADFS? */
219 break;
220
221#ifdef CONFIG_ACORN_PARTITION_RISCIX
222 case PARTITION_RISCIX_SCSI:
223 /* RISCiX - we don't know how to find the next one. */
224 slot = riscix_partition(state, first_sector, slot,
225 nr_sects);
226 break;
227#endif
228
229 case PARTITION_LINUX:
230 slot = linux_partition(state, first_sector, slot,
231 nr_sects);
232 break;
233 }
234 put_dev_sector(sect);
235 if (slot == -1)
236 return -1;
237 } while (1);
238 put_dev_sector(sect);
239 return first ? 0 : 1;
240}
241#endif
242
243#ifdef CONFIG_ACORN_PARTITION_ADFS
244/*
245 * Purpose: allocate ADFS partitions.
246 *
247 * Params : hd - pointer to gendisk structure to store partition info.
248 * dev - device number to access.
249 *
250 * Returns: -1 on error, 0 for no ADFS boot sector, 1 for ok.
251 *
252 * Alloc : hda = whole drive
253 * hda1 = ADFS partition on first drive.
254 * hda2 = non-ADFS partition.
255 */
256int adfspart_check_ADFS(struct parsed_partitions *state)
257{
258 unsigned long start_sect, nr_sects, sectscyl, heads;
259 Sector sect;
260 unsigned char *data;
261 struct adfs_discrecord *dr;
262 unsigned char id;
263 int slot = 1;
264
265 data = read_part_sector(state, 6, &sect);
266 if (!data)
267 return -1;
268
269 dr = adfs_partition(state, "ADFS", data, 0, slot++);
270 if (!dr) {
271 put_dev_sector(sect);
272 return 0;
273 }
274
275 heads = dr->heads + ((dr->lowsector >> 6) & 1);
276 sectscyl = dr->secspertrack * heads;
277 start_sect = ((data[0x1fe] << 8) + data[0x1fd]) * sectscyl;
278 id = data[0x1fc] & 15;
279 put_dev_sector(sect);
280
281 /*
282 * Work out start of non-adfs partition.
283 */
284 nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect;
285
286 if (start_sect) {
287 switch (id) {
288#ifdef CONFIG_ACORN_PARTITION_RISCIX
289 case PARTITION_RISCIX_SCSI:
290 case PARTITION_RISCIX_MFM:
291 slot = riscix_partition(state, start_sect, slot,
292 nr_sects);
293 break;
294#endif
295
296 case PARTITION_LINUX:
297 slot = linux_partition(state, start_sect, slot,
298 nr_sects);
299 break;
300 }
301 }
302 strlcat(state->pp_buf, "\n", PAGE_SIZE);
303 return 1;
304}
305#endif
306
307#ifdef CONFIG_ACORN_PARTITION_ICS
308
309struct ics_part {
310 __le32 start;
311 __le32 size;
312};
313
314static int adfspart_check_ICSLinux(struct parsed_partitions *state,
315 unsigned long block)
316{
317 Sector sect;
318 unsigned char *data = read_part_sector(state, block, &sect);
319 int result = 0;
320
321 if (data) {
322 if (memcmp(data, "LinuxPart", 9) == 0)
323 result = 1;
324 put_dev_sector(sect);
325 }
326
327 return result;
328}
329
330/*
331 * Check for a valid ICS partition using the checksum.
332 */
333static inline int valid_ics_sector(const unsigned char *data)
334{
335 unsigned long sum;
336 int i;
337
338 for (i = 0, sum = 0x50617274; i < 508; i++)
339 sum += data[i];
340
341 sum -= le32_to_cpu(*(__le32 *)(&data[508]));
342
343 return sum == 0;
344}
345
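As an illustrative aside (not part of this patch), the rule enforced by valid_ics_sector() above can be read the other way around: the little-endian word stored at offset 508 must equal 0x50617274 ("Part") plus the sum of the first 508 bytes. A minimal userspace sketch that writes such a checksum, with the seed and offsets taken from the code above:

#include <stdint.h>
#include <stddef.h>

/* Illustrative only: store an ICS checksum that valid_ics_sector() above
 * would accept (seed 0x50617274, sum over bytes 0..507, little-endian
 * result at offset 508).
 */
static void ics_write_checksum(uint8_t sector[512])
{
	uint32_t sum = 0x50617274;	/* "Part" seed, as in the kernel code */
	size_t i;

	for (i = 0; i < 508; i++)
		sum += sector[i];

	sector[508] = sum & 0xff;	/* little-endian store */
	sector[509] = (sum >> 8) & 0xff;
	sector[510] = (sum >> 16) & 0xff;
	sector[511] = (sum >> 24) & 0xff;
}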
346/*
347 * Purpose: allocate ICS partitions.
348 * Params : hd - pointer to gendisk structure to store partition info.
349 * dev - device number to access.
350 * Returns: -1 on error, 0 for no ICS table, 1 for partitions ok.
351 * Alloc : hda = whole drive
352 * hda1 = ADFS partition 0 on first drive.
353 * hda2 = ADFS partition 1 on first drive.
354 * ..etc..
355 */
356int adfspart_check_ICS(struct parsed_partitions *state)
357{
358 const unsigned char *data;
359 const struct ics_part *p;
360 int slot;
361 Sector sect;
362
363 /*
364 * Try ICS style partitions - sector 0 contains partition info.
365 */
366 data = read_part_sector(state, 0, &sect);
367 if (!data)
368 return -1;
369
370 if (!valid_ics_sector(data)) {
371 put_dev_sector(sect);
372 return 0;
373 }
374
375 strlcat(state->pp_buf, " [ICS]", PAGE_SIZE);
376
377 for (slot = 1, p = (const struct ics_part *)data; p->size; p++) {
378 u32 start = le32_to_cpu(p->start);
379 s32 size = le32_to_cpu(p->size); /* yes, it's signed. */
380
381 if (slot == state->limit)
382 break;
383
384 /*
385 * Negative sizes tell the RISC OS ICS driver to ignore
386 * this partition - in effect it says that this does not
387 * contain an ADFS filesystem.
388 */
389 if (size < 0) {
390 size = -size;
391
392 /*
393 * Our own extension - We use the first sector
394 * of the partition to identify what type this
395 * partition is. We must not make this visible
396 * to the filesystem.
397 */
398 if (size > 1 && adfspart_check_ICSLinux(state, start)) {
399 start += 1;
400 size -= 1;
401 }
402 }
403
404 if (size)
405 put_partition(state, slot++, start, size);
406 }
407
408 put_dev_sector(sect);
409 strlcat(state->pp_buf, "\n", PAGE_SIZE);
410 return 1;
411}
412#endif
413
414#ifdef CONFIG_ACORN_PARTITION_POWERTEC
415struct ptec_part {
416 __le32 unused1;
417 __le32 unused2;
418 __le32 start;
419 __le32 size;
420 __le32 unused5;
421 char type[8];
422};
423
424static inline int valid_ptec_sector(const unsigned char *data)
425{
426 unsigned char checksum = 0x2a;
427 int i;
428
429 /*
430 * If it looks like a PC/BIOS partition, then it
431 * probably isn't PowerTec.
432 */
433 if (data[510] == 0x55 && data[511] == 0xaa)
434 return 0;
435
436 for (i = 0; i < 511; i++)
437 checksum += data[i];
438
439 return checksum == data[511];
440}
441
442/*
443 * Purpose: allocate PowerTec partitions.
444 * Params : hd - pointer to gendisk structure to store partition info.
445 * dev - device number to access.
446 * Returns: -1 on error, 0 for no PowerTec table, 1 for partitions ok.
447 * Alloc : hda = whole drive
448 * hda1 = ADFS partition 0 on first drive.
449 * hda2 = ADFS partition 1 on first drive.
450 * ..etc..
451 */
452int adfspart_check_POWERTEC(struct parsed_partitions *state)
453{
454 Sector sect;
455 const unsigned char *data;
456 const struct ptec_part *p;
457 int slot = 1;
458 int i;
459
460 data = read_part_sector(state, 0, &sect);
461 if (!data)
462 return -1;
463
464 if (!valid_ptec_sector(data)) {
465 put_dev_sector(sect);
466 return 0;
467 }
468
469 strlcat(state->pp_buf, " [POWERTEC]", PAGE_SIZE);
470
471 for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) {
472 u32 start = le32_to_cpu(p->start);
473 u32 size = le32_to_cpu(p->size);
474
475 if (size)
476 put_partition(state, slot++, start, size);
477 }
478
479 put_dev_sector(sect);
480 strlcat(state->pp_buf, "\n", PAGE_SIZE);
481 return 1;
482}
483#endif
484
485#ifdef CONFIG_ACORN_PARTITION_EESOX
486struct eesox_part {
487 char magic[6];
488 char name[10];
489 __le32 start;
490 __le32 unused6;
491 __le32 unused7;
492 __le32 unused8;
493};
494
495/*
496 * Guess who created this format?
497 */
498static const char eesox_name[] = {
499 'N', 'e', 'i', 'l', ' ',
500 'C', 'r', 'i', 't', 'c', 'h', 'e', 'l', 'l', ' ', ' '
501};
502
503/*
504 * EESOX SCSI partition format.
505 *
506 * This is a goddamned awful partition format. We don't seem to store
507 * the size of the partition in this table, only the start addresses.
508 *
509 * There are two possibilities where the size comes from:
510 * 1. The individual ADFS boot block entries that are placed on the disk.
511 * 2. The start address of the next entry.
512 */
513int adfspart_check_EESOX(struct parsed_partitions *state)
514{
515 Sector sect;
516 const unsigned char *data;
517 unsigned char buffer[256];
518 struct eesox_part *p;
519 sector_t start = 0;
520 int i, slot = 1;
521
522 data = read_part_sector(state, 7, &sect);
523 if (!data)
524 return -1;
525
526 /*
527 * "Decrypt" the partition table. God knows why...
528 */
529 for (i = 0; i < 256; i++)
530 buffer[i] = data[i] ^ eesox_name[i & 15];
531
532 put_dev_sector(sect);
533
534 for (i = 0, p = (struct eesox_part *)buffer; i < 8; i++, p++) {
535 sector_t next;
536
537 if (memcmp(p->magic, "Eesox", 6))
538 break;
539
540 next = le32_to_cpu(p->start);
541 if (i)
542 put_partition(state, slot++, start, next - start);
543 start = next;
544 }
545
546 if (i != 0) {
547 sector_t size;
548
549 size = get_capacity(state->bdev->bd_disk);
550 put_partition(state, slot++, start, size - start);
551 strlcat(state->pp_buf, "\n", PAGE_SIZE);
552 }
553
554 return i ? 1 : 0;
555}
556#endif
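As an aside (illustrative, not part of this patch), the "decryption" performed above is a plain repeating-key XOR with the 16-byte eesox_name key. A self-contained userspace sketch of the same step:

#include <stdint.h>
#include <stddef.h>

/* Illustrative only: undo the XOR obfuscation of the EESOX partition
 * table (the first 256 bytes of sector 7), using the same 16-byte key
 * as the kernel code above.
 */
static const char eesox_key[16] = {
	'N', 'e', 'i', 'l', ' ',
	'C', 'r', 'i', 't', 'c', 'h', 'e', 'l', 'l', ' ', ' '
};

static void eesox_decode(const uint8_t raw[256], uint8_t out[256])
{
	size_t i;

	for (i = 0; i < 256; i++)
		out[i] = raw[i] ^ eesox_key[i & 15];
}

The decoded buffer is then simply an array of eesox_part records; since only start addresses are stored, each partition's size is the gap to the next record's start, with the last one running to the end of the disk, exactly as the loop above does.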
diff --git a/block/partitions/acorn.h b/block/partitions/acorn.h
deleted file mode 100644
index ede82852969..00000000000
--- a/block/partitions/acorn.h
+++ /dev/null
@@ -1,14 +0,0 @@
1/*
2 * linux/fs/partitions/acorn.h
3 *
4 * Copyright (C) 1996-2001 Russell King.
5 *
6 * I _hate_ this partitioning mess - why can't we have one defined
7 * format, and everyone stick to it?
8 */
9
10int adfspart_check_CUMANA(struct parsed_partitions *state);
11int adfspart_check_ADFS(struct parsed_partitions *state);
12int adfspart_check_ICS(struct parsed_partitions *state);
13int adfspart_check_POWERTEC(struct parsed_partitions *state);
14int adfspart_check_EESOX(struct parsed_partitions *state);
diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c
deleted file mode 100644
index 70cbf44a156..00000000000
--- a/block/partitions/amiga.c
+++ /dev/null
@@ -1,139 +0,0 @@
1/*
2 * fs/partitions/amiga.c
3 *
4 * Code extracted from drivers/block/genhd.c
5 *
6 * Copyright (C) 1991-1998 Linus Torvalds
7 * Re-organised Feb 1998 Russell King
8 */
9
10#include <linux/types.h>
11#include <linux/affs_hardblocks.h>
12
13#include "check.h"
14#include "amiga.h"
15
16static __inline__ u32
17checksum_block(__be32 *m, int size)
18{
19 u32 sum = 0;
20
21 while (size--)
22 sum += be32_to_cpu(*m++);
23 return sum;
24}
25
26int amiga_partition(struct parsed_partitions *state)
27{
28 Sector sect;
29 unsigned char *data;
30 struct RigidDiskBlock *rdb;
31 struct PartitionBlock *pb;
32 int start_sect, nr_sects, blk, part, res = 0;
33 int blksize = 1; /* Multiplier for disk block size */
34 int slot = 1;
35 char b[BDEVNAME_SIZE];
36
37 for (blk = 0; ; blk++, put_dev_sector(sect)) {
38 if (blk == RDB_ALLOCATION_LIMIT)
39 goto rdb_done;
40 data = read_part_sector(state, blk, &sect);
41 if (!data) {
42 if (warn_no_part)
43 printk("Dev %s: unable to read RDB block %d\n",
44 bdevname(state->bdev, b), blk);
45 res = -1;
46 goto rdb_done;
47 }
48 if (*(__be32 *)data != cpu_to_be32(IDNAME_RIGIDDISK))
49 continue;
50
51 rdb = (struct RigidDiskBlock *)data;
52 if (checksum_block((__be32 *)data, be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F) == 0)
53 break;
54 /* Try again with 0xdc..0xdf zeroed, Windows might have
55 * trashed it.
56 */
57 *(__be32 *)(data+0xdc) = 0;
58 if (checksum_block((__be32 *)data,
59 be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) {
60 printk("Warning: Trashed word at 0xdc in block %d "
61 "ignored in checksum calculation\n",blk);
62 break;
63 }
64
65 printk("Dev %s: RDB in block %d has bad checksum\n",
66 bdevname(state->bdev, b), blk);
67 }
68
69 /* blksize is blocks per 512 byte standard block */
70 blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512;
71
72 {
73 char tmp[7 + 10 + 1 + 1];
74
75 /* Be more informative */
76 snprintf(tmp, sizeof(tmp), " RDSK (%d)", blksize * 512);
77 strlcat(state->pp_buf, tmp, PAGE_SIZE);
78 }
79 blk = be32_to_cpu(rdb->rdb_PartitionList);
80 put_dev_sector(sect);
81 for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
82 blk *= blksize; /* Read in terms the partition table understands */
83 data = read_part_sector(state, blk, &sect);
84 if (!data) {
85 if (warn_no_part)
86 printk("Dev %s: unable to read partition block %d\n",
87 bdevname(state->bdev, b), blk);
88 res = -1;
89 goto rdb_done;
90 }
91 pb = (struct PartitionBlock *)data;
92 blk = be32_to_cpu(pb->pb_Next);
93 if (pb->pb_ID != cpu_to_be32(IDNAME_PARTITION))
94 continue;
95 if (checksum_block((__be32 *)pb, be32_to_cpu(pb->pb_SummedLongs) & 0x7F) != 0 )
96 continue;
97
98 /* Tell Kernel about it */
99
100 nr_sects = (be32_to_cpu(pb->pb_Environment[10]) + 1 -
101 be32_to_cpu(pb->pb_Environment[9])) *
102 be32_to_cpu(pb->pb_Environment[3]) *
103 be32_to_cpu(pb->pb_Environment[5]) *
104 blksize;
105 if (!nr_sects)
106 continue;
107 start_sect = be32_to_cpu(pb->pb_Environment[9]) *
108 be32_to_cpu(pb->pb_Environment[3]) *
109 be32_to_cpu(pb->pb_Environment[5]) *
110 blksize;
111 put_partition(state,slot++,start_sect,nr_sects);
112 {
113 /* Be even more informative to aid mounting */
114 char dostype[4];
115 char tmp[42];
116
117 __be32 *dt = (__be32 *)dostype;
118 *dt = pb->pb_Environment[16];
119 if (dostype[3] < ' ')
120 snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)",
121 dostype[0], dostype[1],
122 dostype[2], dostype[3] + '@' );
123 else
124 snprintf(tmp, sizeof(tmp), " (%c%c%c%c)",
125 dostype[0], dostype[1],
126 dostype[2], dostype[3]);
127 strlcat(state->pp_buf, tmp, PAGE_SIZE);
128 snprintf(tmp, sizeof(tmp), "(res %d spb %d)",
129 be32_to_cpu(pb->pb_Environment[6]),
130 be32_to_cpu(pb->pb_Environment[4]));
131 strlcat(state->pp_buf, tmp, PAGE_SIZE);
132 }
133 res = 1;
134 }
135 strlcat(state->pp_buf, "\n", PAGE_SIZE);
136
137rdb_done:
138 return res;
139}
diff --git a/block/partitions/amiga.h b/block/partitions/amiga.h
deleted file mode 100644
index d094585cada..00000000000
--- a/block/partitions/amiga.h
+++ /dev/null
@@ -1,6 +0,0 @@
1/*
2 * fs/partitions/amiga.h
3 */
4
5int amiga_partition(struct parsed_partitions *state);
6
diff --git a/block/partitions/atari.c b/block/partitions/atari.c
deleted file mode 100644
index 9875b05e80a..00000000000
--- a/block/partitions/atari.c
+++ /dev/null
@@ -1,149 +0,0 @@
1/*
2 * fs/partitions/atari.c
3 *
4 * Code extracted from drivers/block/genhd.c
5 *
6 * Copyright (C) 1991-1998 Linus Torvalds
7 * Re-organised Feb 1998 Russell King
8 */
9
10#include <linux/ctype.h>
11#include "check.h"
12#include "atari.h"
13
14/* ++guenther: this should be settable by the user ("make config")?
15 */
16#define ICD_PARTS
17
18/* check if a partition entry looks valid -- Atari format is assumed if at
19 least one of the primary entries is ok this way */
20#define VALID_PARTITION(pi,hdsiz) \
21 (((pi)->flg & 1) && \
22 isalnum((pi)->id[0]) && isalnum((pi)->id[1]) && isalnum((pi)->id[2]) && \
23 be32_to_cpu((pi)->st) <= (hdsiz) && \
24 be32_to_cpu((pi)->st) + be32_to_cpu((pi)->siz) <= (hdsiz))
25
26static inline int OK_id(char *s)
27{
28 return memcmp (s, "GEM", 3) == 0 || memcmp (s, "BGM", 3) == 0 ||
29 memcmp (s, "LNX", 3) == 0 || memcmp (s, "SWP", 3) == 0 ||
30 memcmp (s, "RAW", 3) == 0 ;
31}
32
33int atari_partition(struct parsed_partitions *state)
34{
35 Sector sect;
36 struct rootsector *rs;
37 struct partition_info *pi;
38 u32 extensect;
39 u32 hd_size;
40 int slot;
41#ifdef ICD_PARTS
42 int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */
43#endif
44
45 rs = read_part_sector(state, 0, &sect);
46 if (!rs)
47 return -1;
48
49 /* Verify this is an Atari rootsector: */
50 hd_size = state->bdev->bd_inode->i_size >> 9;
51 if (!VALID_PARTITION(&rs->part[0], hd_size) &&
52 !VALID_PARTITION(&rs->part[1], hd_size) &&
53 !VALID_PARTITION(&rs->part[2], hd_size) &&
54 !VALID_PARTITION(&rs->part[3], hd_size)) {
55 /*
56 * if there's no valid primary partition, assume that there is no
57 * Atari-format partition table (there's no reliable magic or the like
58 * :-()
59 */
60 put_dev_sector(sect);
61 return 0;
62 }
63
64 pi = &rs->part[0];
65 strlcat(state->pp_buf, " AHDI", PAGE_SIZE);
66 for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) {
67 struct rootsector *xrs;
68 Sector sect2;
69 ulong partsect;
70
71 if ( !(pi->flg & 1) )
72 continue;
73 /* active partition */
74 if (memcmp (pi->id, "XGM", 3) != 0) {
75 /* we don't care about other id's */
76 put_partition (state, slot, be32_to_cpu(pi->st),
77 be32_to_cpu(pi->siz));
78 continue;
79 }
80 /* extension partition */
81#ifdef ICD_PARTS
82 part_fmt = 1;
83#endif
84 strlcat(state->pp_buf, " XGM<", PAGE_SIZE);
85 partsect = extensect = be32_to_cpu(pi->st);
86 while (1) {
87 xrs = read_part_sector(state, partsect, &sect2);
88 if (!xrs) {
89 printk (" block %ld read failed\n", partsect);
90 put_dev_sector(sect);
91 return -1;
92 }
93
94 /* ++roman: sanity check: bit 0 of flg field must be set */
95 if (!(xrs->part[0].flg & 1)) {
96 printk( "\nFirst sub-partition in extended partition is not valid!\n" );
97 put_dev_sector(sect2);
98 break;
99 }
100
101 put_partition(state, slot,
102 partsect + be32_to_cpu(xrs->part[0].st),
103 be32_to_cpu(xrs->part[0].siz));
104
105 if (!(xrs->part[1].flg & 1)) {
106 /* end of linked partition list */
107 put_dev_sector(sect2);
108 break;
109 }
110 if (memcmp( xrs->part[1].id, "XGM", 3 ) != 0) {
111 printk("\nID of extended partition is not XGM!\n");
112 put_dev_sector(sect2);
113 break;
114 }
115
116 partsect = be32_to_cpu(xrs->part[1].st) + extensect;
117 put_dev_sector(sect2);
118 if (++slot == state->limit) {
119 printk( "\nMaximum number of partitions reached!\n" );
120 break;
121 }
122 }
123 strlcat(state->pp_buf, " >", PAGE_SIZE);
124 }
125#ifdef ICD_PARTS
126 if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */
127 pi = &rs->icdpart[0];
128 /* sanity check: no ICD format if first partition invalid */
129 if (OK_id(pi->id)) {
130 strlcat(state->pp_buf, " ICD<", PAGE_SIZE);
131 for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) {
132 /* accept only GEM,BGM,RAW,LNX,SWP partitions */
133 if (!((pi->flg & 1) && OK_id(pi->id)))
134 continue;
135 part_fmt = 2;
136 put_partition (state, slot,
137 be32_to_cpu(pi->st),
138 be32_to_cpu(pi->siz));
139 }
140 strlcat(state->pp_buf, " >", PAGE_SIZE);
141 }
142 }
143#endif
144 put_dev_sector(sect);
145
146 strlcat(state->pp_buf, "\n", PAGE_SIZE);
147
148 return 1;
149}
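A worked example of the XGM chain addressing implemented above (illustrative, not part of this patch): suppose the primary XGM entry starts at absolute sector 1000, so partsect = extensect = 1000. If the sub-rootsector read there has part[0].st = 2 and part[0].siz = 500, the logical partition registered covers sectors 1002..1501 (partsect + st). If part[1] carries the "XGM" id with st = 600, the next sub-rootsector is read from sector 1600, i.e. extensect + st: the link is relative to the start of the first extension partition, not to the current sub-rootsector.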
diff --git a/block/partitions/atari.h b/block/partitions/atari.h
deleted file mode 100644
index fe2d32a89f3..00000000000
--- a/block/partitions/atari.h
+++ /dev/null
@@ -1,34 +0,0 @@
1/*
2 * fs/partitions/atari.h
3 * Moved by Russell King from:
4 *
5 * linux/include/linux/atari_rootsec.h
6 * definitions for Atari Rootsector layout
7 * by Andreas Schwab (schwab@ls5.informatik.uni-dortmund.de)
8 *
9 * modified for ICD/Supra partitioning scheme restricted to at most 12
10 * partitions
11 * by Guenther Kelleter (guenther@pool.informatik.rwth-aachen.de)
12 */
13
14struct partition_info
15{
16 u8 flg; /* bit 0: active; bit 7: bootable */
17 char id[3]; /* "GEM", "BGM", "XGM", or other */
18 __be32 st; /* start of partition */
19 __be32 siz; /* length of partition */
20};
21
22struct rootsector
23{
24 char unused[0x156]; /* room for boot code */
25 struct partition_info icdpart[8]; /* info for ICD-partitions 5..12 */
26 char unused2[0xc];
27 u32 hd_siz; /* size of disk in blocks */
28 struct partition_info part[4];
29 u32 bsl_st; /* start of bad sector list */
30 u32 bsl_cnt; /* length of bad sector list */
31 u16 checksum; /* checksum for bootable disks */
32} __attribute__((__packed__));
33
34int atari_partition(struct parsed_partitions *state);
diff --git a/block/partitions/check.c b/block/partitions/check.c
deleted file mode 100644
index bc908672c97..00000000000
--- a/block/partitions/check.c
+++ /dev/null
@@ -1,166 +0,0 @@
1/*
2 * fs/partitions/check.c
3 *
4 * Code extracted from drivers/block/genhd.c
5 * Copyright (C) 1991-1998 Linus Torvalds
6 * Re-organised Feb 1998 Russell King
7 *
8 * We now have independent partition support from the
9 * block drivers, which allows all the partition code to
10 * be grouped in one location, and to be mostly self-
11 * contained.
12 *
13 * Added needed MAJORS for new pairs, {hdi,hdj}, {hdk,hdl}
14 */
15
16#include <linux/slab.h>
17#include <linux/ctype.h>
18#include <linux/genhd.h>
19
20#include "check.h"
21
22#include "acorn.h"
23#include "amiga.h"
24#include "atari.h"
25#include "ldm.h"
26#include "mac.h"
27#include "msdos.h"
28#include "osf.h"
29#include "sgi.h"
30#include "sun.h"
31#include "ibm.h"
32#include "ultrix.h"
33#include "efi.h"
34#include "karma.h"
35#include "sysv68.h"
36
37int warn_no_part = 1; /* This is ugly: should make genhd removable-media aware */
38
39static int (*check_part[])(struct parsed_partitions *) = {
40 /*
41 * Probe partition formats with tables at disk address 0
42 * that also have an ADFS boot block at 0xdc0.
43 */
44#ifdef CONFIG_ACORN_PARTITION_ICS
45 adfspart_check_ICS,
46#endif
47#ifdef CONFIG_ACORN_PARTITION_POWERTEC
48 adfspart_check_POWERTEC,
49#endif
50#ifdef CONFIG_ACORN_PARTITION_EESOX
51 adfspart_check_EESOX,
52#endif
53
54 /*
55 * Now move on to formats that only have partition info at
56 * disk address 0xdc0. Since these may also have stale
57 * PC/BIOS partition tables, they need to come before
58 * the msdos entry.
59 */
60#ifdef CONFIG_ACORN_PARTITION_CUMANA
61 adfspart_check_CUMANA,
62#endif
63#ifdef CONFIG_ACORN_PARTITION_ADFS
64 adfspart_check_ADFS,
65#endif
66
67#ifdef CONFIG_EFI_PARTITION
68 efi_partition, /* this must come before msdos */
69#endif
70#ifdef CONFIG_SGI_PARTITION
71 sgi_partition,
72#endif
73#ifdef CONFIG_LDM_PARTITION
74 ldm_partition, /* this must come before msdos */
75#endif
76#ifdef CONFIG_MSDOS_PARTITION
77 msdos_partition,
78#endif
79#ifdef CONFIG_OSF_PARTITION
80 osf_partition,
81#endif
82#ifdef CONFIG_SUN_PARTITION
83 sun_partition,
84#endif
85#ifdef CONFIG_AMIGA_PARTITION
86 amiga_partition,
87#endif
88#ifdef CONFIG_ATARI_PARTITION
89 atari_partition,
90#endif
91#ifdef CONFIG_MAC_PARTITION
92 mac_partition,
93#endif
94#ifdef CONFIG_ULTRIX_PARTITION
95 ultrix_partition,
96#endif
97#ifdef CONFIG_IBM_PARTITION
98 ibm_partition,
99#endif
100#ifdef CONFIG_KARMA_PARTITION
101 karma_partition,
102#endif
103#ifdef CONFIG_SYSV68_PARTITION
104 sysv68_partition,
105#endif
106 NULL
107};
108
109struct parsed_partitions *
110check_partition(struct gendisk *hd, struct block_device *bdev)
111{
112 struct parsed_partitions *state;
113 int i, res, err;
114
115 state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
116 if (!state)
117 return NULL;
118 state->pp_buf = (char *)__get_free_page(GFP_KERNEL);
119 if (!state->pp_buf) {
120 kfree(state);
121 return NULL;
122 }
123 state->pp_buf[0] = '\0';
124
125 state->bdev = bdev;
126 disk_name(hd, 0, state->name);
127 snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name);
128 if (isdigit(state->name[strlen(state->name)-1]))
129 sprintf(state->name, "p");
130
131 state->limit = disk_max_parts(hd);
132 i = res = err = 0;
133 while (!res && check_part[i]) {
134 memset(&state->parts, 0, sizeof(state->parts));
135 res = check_part[i++](state);
136 if (res < 0) {
137 /* We have hit an I/O error which we don't report now.
138 * But record it, and let the others do their job.
139 */
140 err = res;
141 res = 0;
142 }
143
144 }
145 if (res > 0) {
146 printk(KERN_INFO "%s", state->pp_buf);
147
148 free_page((unsigned long)state->pp_buf);
149 return state;
150 }
151 if (state->access_beyond_eod)
152 err = -ENOSPC;
153 if (err)
154 /* The partition is unrecognized. So report I/O errors if there were any */
155 res = err;
156 if (!res)
157 strlcat(state->pp_buf, " unknown partition table\n", PAGE_SIZE);
158 else if (warn_no_part)
159 strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE);
160
161 printk(KERN_INFO "%s", state->pp_buf);
162
163 free_page((unsigned long)state->pp_buf);
164 kfree(state);
165 return ERR_PTR(res);
166}
diff --git a/block/partitions/check.h b/block/partitions/check.h
deleted file mode 100644
index 52b100311ec..00000000000
--- a/block/partitions/check.h
+++ /dev/null
@@ -1,52 +0,0 @@
1#include <linux/pagemap.h>
2#include <linux/blkdev.h>
3#include <linux/genhd.h>
4
5/*
6 * add_gd_partition adds a partition's details to the device's partition
7 * description.
8 */
9struct parsed_partitions {
10 struct block_device *bdev;
11 char name[BDEVNAME_SIZE];
12 struct {
13 sector_t from;
14 sector_t size;
15 int flags;
16 bool has_info;
17 struct partition_meta_info info;
18 } parts[DISK_MAX_PARTS];
19 int next;
20 int limit;
21 bool access_beyond_eod;
22 char *pp_buf;
23};
24
25struct parsed_partitions *
26check_partition(struct gendisk *, struct block_device *);
27
28static inline void *read_part_sector(struct parsed_partitions *state,
29 sector_t n, Sector *p)
30{
31 if (n >= get_capacity(state->bdev->bd_disk)) {
32 state->access_beyond_eod = true;
33 return NULL;
34 }
35 return read_dev_sector(state->bdev, n, p);
36}
37
38static inline void
39put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
40{
41 if (n < p->limit) {
42 char tmp[1 + BDEVNAME_SIZE + 10 + 1];
43
44 p->parts[n].from = from;
45 p->parts[n].size = size;
46 snprintf(tmp, sizeof(tmp), " %s%d", p->name, n);
47 strlcat(p->pp_buf, tmp, PAGE_SIZE);
48 }
49}
50
51extern int warn_no_part;
52
diff --git a/block/partitions/efi.c b/block/partitions/efi.c
deleted file mode 100644
index b62fb88b871..00000000000
--- a/block/partitions/efi.c
+++ /dev/null
@@ -1,670 +0,0 @@
1/************************************************************
2 * EFI GUID Partition Table handling
3 *
4 * http://www.uefi.org/specs/
5 * http://www.intel.com/technology/efi/
6 *
7 * efi.[ch] by Matt Domsch <Matt_Domsch@dell.com>
8 * Copyright 2000,2001,2002,2004 Dell Inc.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 *
24 *
25 * TODO:
26 *
27 * Changelog:
28 * Mon Nov 09 2004 Matt Domsch <Matt_Domsch@dell.com>
29 * - test for valid PMBR and valid PGPT before ever reading
30 * AGPT, allow override with 'gpt' kernel command line option.
31 * - check for first/last_usable_lba outside of size of disk
32 *
33 * Tue Mar 26 2002 Matt Domsch <Matt_Domsch@dell.com>
34 * - Ported to 2.5.7-pre1 and 2.5.7-dj2
35 * - Applied patch to avoid fault in alternate header handling
36 * - cleaned up find_valid_gpt
37 * - On-disk structure and copy in memory is *always* LE now -
38 * swab fields as needed
39 * - remove print_gpt_header()
40 * - only use first max_p partition entries, to keep the kernel minor number
41 * and partition numbers tied.
42 *
43 * Mon Feb 04 2002 Matt Domsch <Matt_Domsch@dell.com>
44 * - Removed __PRIPTR_PREFIX - not being used
45 *
46 * Mon Jan 14 2002 Matt Domsch <Matt_Domsch@dell.com>
47 * - Ported to 2.5.2-pre11 + library crc32 patch Linus applied
48 *
49 * Thu Dec 6 2001 Matt Domsch <Matt_Domsch@dell.com>
50 * - Added compare_gpts().
51 * - moved le_efi_guid_to_cpus() back into this file. GPT is the only
52 * thing that keeps EFI GUIDs on disk.
53 * - Changed gpt structure names and members to be simpler and more Linux-like.
54 *
55 * Wed Oct 17 2001 Matt Domsch <Matt_Domsch@dell.com>
56 * - Removed CONFIG_DEVFS_VOLUMES_UUID code entirely per Martin Wilck
57 *
58 * Wed Oct 10 2001 Matt Domsch <Matt_Domsch@dell.com>
59 * - Changed function comments to DocBook style per Andreas Dilger suggestion.
60 *
61 * Mon Oct 08 2001 Matt Domsch <Matt_Domsch@dell.com>
62 * - Change read_lba() to use the page cache per Al Viro's work.
63 * - print u64s properly on all architectures
64 * - fixed debug_printk(), now Dprintk()
65 *
66 * Mon Oct 01 2001 Matt Domsch <Matt_Domsch@dell.com>
67 * - Style cleanups
68 * - made most functions static
69 * - Endianness addition
70 * - remove test for second alternate header, as it's not per spec,
71 * and is unnecessary. There's now a method to read/write the last
72 * sector of an odd-sized disk from user space. No tools have ever
73 * been released which used this code, so it's effectively dead.
74 * - Per Asit Mallick of Intel, added a test for a valid PMBR.
75 * - Added kernel command line option 'gpt' to override valid PMBR test.
76 *
77 * Wed Jun 6 2001 Martin Wilck <Martin.Wilck@Fujitsu-Siemens.com>
78 * - added devfs volume UUID support (/dev/volumes/uuids) for
79 * mounting file systems by the partition GUID.
80 *
81 * Tue Dec 5 2000 Matt Domsch <Matt_Domsch@dell.com>
82 * - Moved crc32() to linux/lib, added efi_crc32().
83 *
84 * Thu Nov 30 2000 Matt Domsch <Matt_Domsch@dell.com>
85 * - Replaced Intel's CRC32 function with an equivalent
86 * non-license-restricted version.
87 *
88 * Wed Oct 25 2000 Matt Domsch <Matt_Domsch@dell.com>
89 * - Fixed the last_lba() call to return the proper last block
90 *
91 * Thu Oct 12 2000 Matt Domsch <Matt_Domsch@dell.com>
92 * - Thanks to Andries Brouwer for his debugging assistance.
93 * - Code works, detects all the partitions.
94 *
95 ************************************************************/
96#include <linux/crc32.h>
97#include <linux/ctype.h>
98#include <linux/math64.h>
99#include <linux/slab.h>
100#include "check.h"
101#include "efi.h"
102
103/* This allows a kernel command line option 'gpt' to override
104 * the test for invalid PMBR. Not __initdata because reloading
105 * the partition tables happens after init too.
106 */
107static int force_gpt;
108static int __init
109force_gpt_fn(char *str)
110{
111 force_gpt = 1;
112 return 1;
113}
114__setup("gpt", force_gpt_fn);
115
116
117/**
118 * efi_crc32() - EFI version of crc32 function
119 * @buf: buffer to calculate crc32 of
120 * @len: length of buf
121 *
122 * Description: Returns EFI-style CRC32 value for @buf
123 *
124 * This function uses the little endian Ethernet polynomial
125 * but seeds the function with ~0, and xor's with ~0 at the end.
126 * Note, the EFI Specification, v1.02, has a reference to
127 * Dr. Dobbs Journal, May 1994 (actually it's in May 1992).
128 */
129static inline u32
130efi_crc32(const void *buf, unsigned long len)
131{
132 return (crc32(~0L, buf, len) ^ ~0L);
133}
134
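As an aside (not part of this patch), seeding crc32() with ~0 and XOR-ing the result with ~0 makes efi_crc32() equal to the familiar reflected CRC-32 (polynomial 0xEDB88320, init 0xFFFFFFFF, final XOR 0xFFFFFFFF). A minimal bitwise userspace sketch producing the same value; the kernel of course uses the much faster table-driven lib/crc32 instead:

#include <stdint.h>
#include <stddef.h>

/* Illustrative only: bitwise CRC-32 equal to efi_crc32() above
 * (reflected polynomial 0xEDB88320, init ~0, final XOR ~0).
 */
static uint32_t crc32_ieee(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint32_t crc = ~0u;
	int i;

	while (len--) {
		crc ^= *p++;
		for (i = 0; i < 8; i++)
			crc = (crc >> 1) ^ ((crc & 1u) ? 0xEDB88320u : 0u);
	}
	return ~crc;
}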
135/**
136 * last_lba(): return number of last logical block of device
137 * @bdev: block device
138 *
139 * Description: Returns last LBA value on success, 0 on error.
140 * This is stored (by sd and ide-geometry) in
141 * the part[0] entry for this disk, and is the number of
142 * physical sectors available on the disk.
143 */
144static u64 last_lba(struct block_device *bdev)
145{
146 if (!bdev || !bdev->bd_inode)
147 return 0;
148 return div_u64(bdev->bd_inode->i_size,
149 bdev_logical_block_size(bdev)) - 1ULL;
150}
151
152static inline int
153pmbr_part_valid(struct partition *part)
154{
155 if (part->sys_ind == EFI_PMBR_OSTYPE_EFI_GPT &&
156 le32_to_cpu(part->start_sect) == 1UL)
157 return 1;
158 return 0;
159}
160
161/**
162 * is_pmbr_valid(): test Protective MBR for validity
163 * @mbr: pointer to a legacy mbr structure
164 *
165 * Description: Returns 1 if PMBR is valid, 0 otherwise.
166 * Validity depends on two things:
167 * 1) MSDOS signature is in the last two bytes of the MBR
168 * 2) One partition of type 0xEE is found
169 */
170static int
171is_pmbr_valid(legacy_mbr *mbr)
172{
173 int i;
174 if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE)
175 return 0;
176 for (i = 0; i < 4; i++)
177 if (pmbr_part_valid(&mbr->partition_record[i]))
178 return 1;
179 return 0;
180}
181
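For illustration only (not part of this patch), the same two conditions can be checked on a raw 512-byte sector 0. The offsets follow the legacy_mbr layout in efi.h: 440 bytes of boot code, a 4-byte disk signature and 2 unknown bytes put the partition records at offset 446, and the 2-byte signature at offset 510.

#include <stdint.h>

/* Illustrative only: same checks as is_pmbr_valid() above, applied to a
 * raw 512-byte sector 0.
 */
static int pmbr_looks_valid(const uint8_t *sec)
{
	int i;

	if (sec[510] != 0x55 || sec[511] != 0xAA)	/* MSDOS_MBR_SIGNATURE */
		return 0;

	for (i = 0; i < 4; i++) {
		const uint8_t *p = sec + 446 + 16 * i;	/* partition_record[i] */
		uint32_t start = p[8] | p[9] << 8 | p[10] << 16 |
				 (uint32_t)p[11] << 24;

		if (p[4] == 0xEE && start == 1)	/* type 0xEE starting at LBA 1 */
			return 1;
	}
	return 0;
}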
182/**
183 * read_lba(): Read bytes from disk, starting at given LBA
184 * @state: disk parsed partitions (provides the block device)
185 * @lba: logical block address to start reading from
186 * @buffer: destination buffer
187 * @count: number of bytes to read
188 *
189 * Description: Reads @count bytes from @state->bdev into @buffer.
190 * Returns number of bytes read on success, 0 on error.
191 */
192static size_t read_lba(struct parsed_partitions *state,
193 u64 lba, u8 *buffer, size_t count)
194{
195 size_t totalreadcount = 0;
196 struct block_device *bdev = state->bdev;
197 sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
198
199 if (!buffer || lba > last_lba(bdev))
200 return 0;
201
202 while (count) {
203 int copied = 512;
204 Sector sect;
205 unsigned char *data = read_part_sector(state, n++, &sect);
206 if (!data)
207 break;
208 if (copied > count)
209 copied = count;
210 memcpy(buffer, data, copied);
211 put_dev_sector(sect);
212 buffer += copied;
213 totalreadcount +=copied;
214 count -= copied;
215 }
216 return totalreadcount;
217}
218
219/**
220 * alloc_read_gpt_entries(): reads partition entries from disk
221 * @state
222 * @gpt - GPT header
223 *
224 * Description: Returns ptes on success, NULL on error.
225 * Allocates space for PTEs based on information found in @gpt.
226 * Notes: remember to free pte when you're done!
227 */
228static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
229 gpt_header *gpt)
230{
231 size_t count;
232 gpt_entry *pte;
233
234 if (!gpt)
235 return NULL;
236
237 count = le32_to_cpu(gpt->num_partition_entries) *
238 le32_to_cpu(gpt->sizeof_partition_entry);
239 if (!count)
240 return NULL;
241 pte = kzalloc(count, GFP_KERNEL);
242 if (!pte)
243 return NULL;
244
245 if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba),
246 (u8 *) pte,
247 count) < count) {
248 kfree(pte);
249 pte=NULL;
250 return NULL;
251 }
252 return pte;
253}
254
255/**
256 * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk
257 * @state
258 * @lba is the Logical Block Address of the partition table
259 *
260 * Description: returns GPT header on success, NULL on error. Allocates
261 * and fills a GPT header starting at @lba from @state->bdev.
262 * Note: remember to free gpt when finished with it.
263 */
264static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
265 u64 lba)
266{
267 gpt_header *gpt;
268 unsigned ssz = bdev_logical_block_size(state->bdev);
269
270 gpt = kzalloc(ssz, GFP_KERNEL);
271 if (!gpt)
272 return NULL;
273
274 if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) {
275 kfree(gpt);
276 gpt=NULL;
277 return NULL;
278 }
279
280 return gpt;
281}
282
283/**
284 * is_gpt_valid() - tests one GPT header and PTEs for validity
285 * @state
286 * @lba is the logical block address of the GPT header to test
287 * @gpt is a GPT header ptr, filled on return.
288 * @ptes is a PTEs ptr, filled on return.
289 *
290 * Description: returns 1 if valid, 0 on error.
291 * If valid, returns pointers to newly allocated GPT header and PTEs.
292 */
293static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
294 gpt_header **gpt, gpt_entry **ptes)
295{
296 u32 crc, origcrc;
297 u64 lastlba;
298
299 if (!ptes)
300 return 0;
301 if (!(*gpt = alloc_read_gpt_header(state, lba)))
302 return 0;
303
304 /* Check the GUID Partition Table signature */
305 if (le64_to_cpu((*gpt)->signature) != GPT_HEADER_SIGNATURE) {
306 pr_debug("GUID Partition Table Header signature is wrong:"
307 "%lld != %lld\n",
308 (unsigned long long)le64_to_cpu((*gpt)->signature),
309 (unsigned long long)GPT_HEADER_SIGNATURE);
310 goto fail;
311 }
312
313 /* Check the GUID Partition Table header size */
314 if (le32_to_cpu((*gpt)->header_size) >
315 bdev_logical_block_size(state->bdev)) {
316 pr_debug("GUID Partition Table Header size is wrong: %u > %u\n",
317 le32_to_cpu((*gpt)->header_size),
318 bdev_logical_block_size(state->bdev));
319 goto fail;
320 }
321
322 /* Check the GUID Partition Table CRC */
323 origcrc = le32_to_cpu((*gpt)->header_crc32);
324 (*gpt)->header_crc32 = 0;
325 crc = efi_crc32((const unsigned char *) (*gpt), le32_to_cpu((*gpt)->header_size));
326
327 if (crc != origcrc) {
328 pr_debug("GUID Partition Table Header CRC is wrong: %x != %x\n",
329 crc, origcrc);
330 goto fail;
331 }
332 (*gpt)->header_crc32 = cpu_to_le32(origcrc);
333
334 /* Check that the my_lba entry points to the LBA that contains
335 * the GUID Partition Table */
336 if (le64_to_cpu((*gpt)->my_lba) != lba) {
337 pr_debug("GPT my_lba incorrect: %lld != %lld\n",
338 (unsigned long long)le64_to_cpu((*gpt)->my_lba),
339 (unsigned long long)lba);
340 goto fail;
341 }
342
343 /* Check the first_usable_lba and last_usable_lba are
344 * within the disk.
345 */
346 lastlba = last_lba(state->bdev);
347 if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) {
348 pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n",
349 (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba),
350 (unsigned long long)lastlba);
351 goto fail;
352 }
353 if (le64_to_cpu((*gpt)->last_usable_lba) > lastlba) {
354 pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n",
355 (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba),
356 (unsigned long long)lastlba);
357 goto fail;
358 }
359
360 /* Check that sizeof_partition_entry has the correct value */
361 if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) {
362 pr_debug("GUID Partition Entry Size check failed.\n");
363 goto fail;
364 }
365
366 if (!(*ptes = alloc_read_gpt_entries(state, *gpt)))
367 goto fail;
368
369 /* Check the GUID Partition Entry Array CRC */
370 crc = efi_crc32((const unsigned char *) (*ptes),
371 le32_to_cpu((*gpt)->num_partition_entries) *
372 le32_to_cpu((*gpt)->sizeof_partition_entry));
373
374 if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) {
375 pr_debug("GUID Partition Entry Array CRC check failed.\n");
376 goto fail_ptes;
377 }
378
379 /* We're done, all's well */
380 return 1;
381
382 fail_ptes:
383 kfree(*ptes);
384 *ptes = NULL;
385 fail:
386 kfree(*gpt);
387 *gpt = NULL;
388 return 0;
389}
390
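One detail of is_gpt_valid() worth spelling out (illustrative, not part of this patch): the header CRC covers header_size bytes with the header_crc32 field itself zeroed for the calculation. In the gpt_header layout from efi.h that field sits at byte offset 16 (8-byte signature + 4-byte revision + 4-byte header_size). A small userspace sketch, assuming some standard CRC-32 routine such as the crc32_ieee() sketch shown after efi_crc32():

#include <stdint.h>
#include <string.h>

uint32_t crc32_ieee(const void *buf, size_t len);	/* e.g. the sketch above */

/* Illustrative only: verify a GPT header CRC the way is_gpt_valid() does,
 * zeroing the stored header_crc32 (byte offset 16) for the calculation
 * and restoring it afterwards.
 */
static int gpt_header_crc_ok(uint8_t *hdr, uint32_t header_size)
{
	uint8_t saved[4];
	uint32_t stored, crc;

	memcpy(saved, hdr + 16, 4);
	stored = saved[0] | saved[1] << 8 | saved[2] << 16 |
		 (uint32_t)saved[3] << 24;		/* field is little-endian */

	memset(hdr + 16, 0, 4);
	crc = crc32_ieee(hdr, header_size);
	memcpy(hdr + 16, saved, 4);

	return crc == stored;
}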
391/**
392 * is_pte_valid() - tests one PTE for validity
393 * @pte is the pte to check
394 * @lastlba is last lba of the disk
395 *
396 * Description: returns 1 if valid, 0 on error.
397 */
398static inline int
399is_pte_valid(const gpt_entry *pte, const u64 lastlba)
400{
401 if ((!efi_guidcmp(pte->partition_type_guid, NULL_GUID)) ||
402 le64_to_cpu(pte->starting_lba) > lastlba ||
403 le64_to_cpu(pte->ending_lba) > lastlba)
404 return 0;
405 return 1;
406}
407
408/**
409 * compare_gpts() - Compare the primary and alternate GPT headers
410 * @pgpt is the primary GPT header
411 * @agpt is the alternate GPT header
412 * @lastlba is the last LBA number
413 * Description: Returns nothing. Sanity checks pgpt and agpt fields
414 * and prints warnings on discrepancies.
415 *
416 */
417static void
418compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
419{
420 int error_found = 0;
421 if (!pgpt || !agpt)
422 return;
423 if (le64_to_cpu(pgpt->my_lba) != le64_to_cpu(agpt->alternate_lba)) {
424 printk(KERN_WARNING
425 "GPT:Primary header LBA != Alt. header alternate_lba\n");
426 printk(KERN_WARNING "GPT:%lld != %lld\n",
427 (unsigned long long)le64_to_cpu(pgpt->my_lba),
428 (unsigned long long)le64_to_cpu(agpt->alternate_lba));
429 error_found++;
430 }
431 if (le64_to_cpu(pgpt->alternate_lba) != le64_to_cpu(agpt->my_lba)) {
432 printk(KERN_WARNING
433 "GPT:Primary header alternate_lba != Alt. header my_lba\n");
434 printk(KERN_WARNING "GPT:%lld != %lld\n",
435 (unsigned long long)le64_to_cpu(pgpt->alternate_lba),
436 (unsigned long long)le64_to_cpu(agpt->my_lba));
437 error_found++;
438 }
439 if (le64_to_cpu(pgpt->first_usable_lba) !=
440 le64_to_cpu(agpt->first_usable_lba)) {
441 printk(KERN_WARNING "GPT:first_usable_lbas don't match.\n");
442 printk(KERN_WARNING "GPT:%lld != %lld\n",
443 (unsigned long long)le64_to_cpu(pgpt->first_usable_lba),
444 (unsigned long long)le64_to_cpu(agpt->first_usable_lba));
445 error_found++;
446 }
447 if (le64_to_cpu(pgpt->last_usable_lba) !=
448 le64_to_cpu(agpt->last_usable_lba)) {
449 printk(KERN_WARNING "GPT:last_usable_lbas don't match.\n");
450 printk(KERN_WARNING "GPT:%lld != %lld\n",
451 (unsigned long long)le64_to_cpu(pgpt->last_usable_lba),
452 (unsigned long long)le64_to_cpu(agpt->last_usable_lba));
453 error_found++;
454 }
455 if (efi_guidcmp(pgpt->disk_guid, agpt->disk_guid)) {
456 printk(KERN_WARNING "GPT:disk_guids don't match.\n");
457 error_found++;
458 }
459 if (le32_to_cpu(pgpt->num_partition_entries) !=
460 le32_to_cpu(agpt->num_partition_entries)) {
461 printk(KERN_WARNING "GPT:num_partition_entries don't match: "
462 "0x%x != 0x%x\n",
463 le32_to_cpu(pgpt->num_partition_entries),
464 le32_to_cpu(agpt->num_partition_entries));
465 error_found++;
466 }
467 if (le32_to_cpu(pgpt->sizeof_partition_entry) !=
468 le32_to_cpu(agpt->sizeof_partition_entry)) {
469 printk(KERN_WARNING
470 "GPT:sizeof_partition_entry values don't match: "
471 "0x%x != 0x%x\n",
472 le32_to_cpu(pgpt->sizeof_partition_entry),
473 le32_to_cpu(agpt->sizeof_partition_entry));
474 error_found++;
475 }
476 if (le32_to_cpu(pgpt->partition_entry_array_crc32) !=
477 le32_to_cpu(agpt->partition_entry_array_crc32)) {
478 printk(KERN_WARNING
479 "GPT:partition_entry_array_crc32 values don't match: "
480 "0x%x != 0x%x\n",
481 le32_to_cpu(pgpt->partition_entry_array_crc32),
482 le32_to_cpu(agpt->partition_entry_array_crc32));
483 error_found++;
484 }
485 if (le64_to_cpu(pgpt->alternate_lba) != lastlba) {
486 printk(KERN_WARNING
487 "GPT:Primary header thinks Alt. header is not at the end of the disk.\n");
488 printk(KERN_WARNING "GPT:%lld != %lld\n",
489 (unsigned long long)le64_to_cpu(pgpt->alternate_lba),
490 (unsigned long long)lastlba);
491 error_found++;
492 }
493
494 if (le64_to_cpu(agpt->my_lba) != lastlba) {
495 printk(KERN_WARNING
496 "GPT:Alternate GPT header not at the end of the disk.\n");
497 printk(KERN_WARNING "GPT:%lld != %lld\n",
498 (unsigned long long)le64_to_cpu(agpt->my_lba),
499 (unsigned long long)lastlba);
500 error_found++;
501 }
502
503 if (error_found)
504 printk(KERN_WARNING
505 "GPT: Use GNU Parted to correct GPT errors.\n");
506 return;
507}
508
509/**
510 * find_valid_gpt() - Search disk for valid GPT headers and PTEs
511 * @state
512 * @gpt is a GPT header ptr, filled on return.
513 * @ptes is a PTEs ptr, filled on return.
514 * Description: Returns 1 if valid, 0 on error.
515 * If valid, returns pointers to newly allocated GPT header and PTEs.
516 * Validity depends on PMBR being valid (or being overridden by the
517 * 'gpt' kernel command line option) and finding either the Primary
518 * GPT header and PTEs valid, or the Alternate GPT header and PTEs
519 * valid. If the Primary GPT header is not valid, the Alternate GPT header
520 * is not checked unless the 'gpt' kernel command line option is passed.
521 * This protects against devices which misreport their size, and forces
522 * the user to decide to use the Alternate GPT.
523 */
524static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
525 gpt_entry **ptes)
526{
527 int good_pgpt = 0, good_agpt = 0, good_pmbr = 0;
528 gpt_header *pgpt = NULL, *agpt = NULL;
529 gpt_entry *pptes = NULL, *aptes = NULL;
530 legacy_mbr *legacymbr;
531 u64 lastlba;
532
533 if (!ptes)
534 return 0;
535
536 lastlba = last_lba(state->bdev);
537 if (!force_gpt) {
538 /* This will be added to the EFI Spec. per Intel after v1.02. */
539 legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL);
540 if (legacymbr) {
541 read_lba(state, 0, (u8 *) legacymbr,
542 sizeof (*legacymbr));
543 good_pmbr = is_pmbr_valid(legacymbr);
544 kfree(legacymbr);
545 }
546 if (!good_pmbr)
547 goto fail;
548 }
549
550 good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA,
551 &pgpt, &pptes);
552 if (good_pgpt)
553 good_agpt = is_gpt_valid(state,
554 le64_to_cpu(pgpt->alternate_lba),
555 &agpt, &aptes);
556 if (!good_agpt && force_gpt)
557 good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes);
558
559 /* The obviously unsuccessful case */
560 if (!good_pgpt && !good_agpt)
561 goto fail;
562
563 compare_gpts(pgpt, agpt, lastlba);
564
565 /* The good cases */
566 if (good_pgpt) {
567 *gpt = pgpt;
568 *ptes = pptes;
569 kfree(agpt);
570 kfree(aptes);
571 if (!good_agpt) {
572 printk(KERN_WARNING
573 "Alternate GPT is invalid, "
574 "using primary GPT.\n");
575 }
576 return 1;
577 }
578 else if (good_agpt) {
579 *gpt = agpt;
580 *ptes = aptes;
581 kfree(pgpt);
582 kfree(pptes);
583 printk(KERN_WARNING
584 "Primary GPT is invalid, using alternate GPT.\n");
585 return 1;
586 }
587
588 fail:
589 kfree(pgpt);
590 kfree(agpt);
591 kfree(pptes);
592 kfree(aptes);
593 *gpt = NULL;
594 *ptes = NULL;
595 return 0;
596}
597
598/**
599 * efi_partition(struct parsed_partitions *state)
600 * @state
601 *
602 * Description: called from check.c, if the disk contains GPT
603 * partitions, sets up partition entries in the kernel.
604 *
605 * If the first block on the disk is a legacy MBR,
606 * it will get handled by msdos_partition().
607 * If it's a Protective MBR, we'll handle it here.
608 *
609 * We do not create a Linux partition for GPT, but
610 * only for the actual data partitions.
611 * Returns:
612 * -1 if unable to read the partition table
613 * 0 if this isn't our partition table
614 * 1 if successful
615 *
616 */
617int efi_partition(struct parsed_partitions *state)
618{
619 gpt_header *gpt = NULL;
620 gpt_entry *ptes = NULL;
621 u32 i;
622 unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
623
624 if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
625 kfree(gpt);
626 kfree(ptes);
627 return 0;
628 }
629
630 pr_debug("GUID Partition Table is valid! Yea!\n");
631
632 for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
633 struct partition_meta_info *info;
634 unsigned label_count = 0;
635 unsigned label_max;
636 u64 start = le64_to_cpu(ptes[i].starting_lba);
637 u64 size = le64_to_cpu(ptes[i].ending_lba) -
638 le64_to_cpu(ptes[i].starting_lba) + 1ULL;
639
640 if (!is_pte_valid(&ptes[i], last_lba(state->bdev)))
641 continue;
642
643 put_partition(state, i+1, start * ssz, size * ssz);
644
645 /* If this is a RAID volume, tell md */
646 if (!efi_guidcmp(ptes[i].partition_type_guid,
647 PARTITION_LINUX_RAID_GUID))
648 state->parts[i + 1].flags = ADDPART_FLAG_RAID;
649
650 info = &state->parts[i + 1].info;
651 efi_guid_unparse(&ptes[i].unique_partition_guid, info->uuid);
652
653 /* Naively convert UTF16-LE to 7 bits. */
654 label_max = min(sizeof(info->volname) - 1,
655 sizeof(ptes[i].partition_name));
656 info->volname[label_max] = 0;
657 while (label_count < label_max) {
658 u8 c = ptes[i].partition_name[label_count] & 0xff;
659 if (c && !isprint(c))
660 c = '!';
661 info->volname[label_count] = c;
662 label_count++;
663 }
664 state->parts[i + 1].has_info = true;
665 }
666 kfree(ptes);
667 kfree(gpt);
668 strlcat(state->pp_buf, "\n", PAGE_SIZE);
669 return 1;
670}
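A quick worked example of the unit conversion above (illustrative, not part of this patch): GPT LBAs are in logical-block units, while put_partition() takes 512-byte sectors, hence the ssz multiplier. On a disk with 4096-byte logical blocks, ssz = 4096 / 512 = 8; a PTE with starting_lba = 0x800 and ending_lba = 0x17ff is registered with start = 0x800 * 8 = 16384 sectors and size = (0x17ff - 0x800 + 1) * 8 = 4096 * 8 = 32768 sectors, i.e. 16 MiB.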
diff --git a/block/partitions/efi.h b/block/partitions/efi.h
deleted file mode 100644
index b69ab729558..00000000000
--- a/block/partitions/efi.h
+++ /dev/null
@@ -1,134 +0,0 @@
1/************************************************************
2 * EFI GUID Partition Table
3 * Per Intel EFI Specification v1.02
4 * http://developer.intel.com/technology/efi/efi.htm
5 *
6 * By Matt Domsch <Matt_Domsch@dell.com> Fri Sep 22 22:15:56 CDT 2000
7 * Copyright 2000,2001 Dell Inc.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 *
23 ************************************************************/
24
25#ifndef FS_PART_EFI_H_INCLUDED
26#define FS_PART_EFI_H_INCLUDED
27
28#include <linux/types.h>
29#include <linux/fs.h>
30#include <linux/genhd.h>
31#include <linux/kernel.h>
32#include <linux/major.h>
33#include <linux/string.h>
34#include <linux/efi.h>
35
36#define MSDOS_MBR_SIGNATURE 0xaa55
37#define EFI_PMBR_OSTYPE_EFI 0xEF
38#define EFI_PMBR_OSTYPE_EFI_GPT 0xEE
39
40#define GPT_HEADER_SIGNATURE 0x5452415020494645ULL
41#define GPT_HEADER_REVISION_V1 0x00010000
42#define GPT_PRIMARY_PARTITION_TABLE_LBA 1
43
44#define PARTITION_SYSTEM_GUID \
45 EFI_GUID( 0xC12A7328, 0xF81F, 0x11d2, \
46 0xBA, 0x4B, 0x00, 0xA0, 0xC9, 0x3E, 0xC9, 0x3B)
47#define LEGACY_MBR_PARTITION_GUID \
48 EFI_GUID( 0x024DEE41, 0x33E7, 0x11d3, \
49 0x9D, 0x69, 0x00, 0x08, 0xC7, 0x81, 0xF3, 0x9F)
50#define PARTITION_MSFT_RESERVED_GUID \
51 EFI_GUID( 0xE3C9E316, 0x0B5C, 0x4DB8, \
52 0x81, 0x7D, 0xF9, 0x2D, 0xF0, 0x02, 0x15, 0xAE)
53#define PARTITION_BASIC_DATA_GUID \
54 EFI_GUID( 0xEBD0A0A2, 0xB9E5, 0x4433, \
55 0x87, 0xC0, 0x68, 0xB6, 0xB7, 0x26, 0x99, 0xC7)
56#define PARTITION_LINUX_RAID_GUID \
57 EFI_GUID( 0xa19d880f, 0x05fc, 0x4d3b, \
58 0xa0, 0x06, 0x74, 0x3f, 0x0f, 0x84, 0x91, 0x1e)
59#define PARTITION_LINUX_SWAP_GUID \
60 EFI_GUID( 0x0657fd6d, 0xa4ab, 0x43c4, \
61 0x84, 0xe5, 0x09, 0x33, 0xc8, 0x4b, 0x4f, 0x4f)
62#define PARTITION_LINUX_LVM_GUID \
63 EFI_GUID( 0xe6d6d379, 0xf507, 0x44c2, \
64 0xa2, 0x3c, 0x23, 0x8f, 0x2a, 0x3d, 0xf9, 0x28)
65
66typedef struct _gpt_header {
67 __le64 signature;
68 __le32 revision;
69 __le32 header_size;
70 __le32 header_crc32;
71 __le32 reserved1;
72 __le64 my_lba;
73 __le64 alternate_lba;
74 __le64 first_usable_lba;
75 __le64 last_usable_lba;
76 efi_guid_t disk_guid;
77 __le64 partition_entry_lba;
78 __le32 num_partition_entries;
79 __le32 sizeof_partition_entry;
80 __le32 partition_entry_array_crc32;
81
82 /* The rest of the logical block is reserved by UEFI and must be zero.
83 * EFI standard handles this by:
84 *
85 * uint8_t reserved2[ BlockSize - 92 ];
86 */
87} __attribute__ ((packed)) gpt_header;
88
89typedef struct _gpt_entry_attributes {
90 u64 required_to_function:1;
91 u64 reserved:47;
92 u64 type_guid_specific:16;
93} __attribute__ ((packed)) gpt_entry_attributes;
94
95typedef struct _gpt_entry {
96 efi_guid_t partition_type_guid;
97 efi_guid_t unique_partition_guid;
98 __le64 starting_lba;
99 __le64 ending_lba;
100 gpt_entry_attributes attributes;
101 efi_char16_t partition_name[72 / sizeof (efi_char16_t)];
102} __attribute__ ((packed)) gpt_entry;
103
104typedef struct _legacy_mbr {
105 u8 boot_code[440];
106 __le32 unique_mbr_signature;
107 __le16 unknown;
108 struct partition partition_record[4];
109 __le16 signature;
110} __attribute__ ((packed)) legacy_mbr;
111
112/* Functions */
113extern int efi_partition(struct parsed_partitions *state);
114
115#endif
116
117/*
118 * Overrides for Emacs so that we follow Linus's tabbing style.
119 * Emacs will notice this stuff at the end of the file and automatically
120 * adjust the settings for this buffer only. This must remain at the end
121 * of the file.
122 * --------------------------------------------------------------------------
123 * Local variables:
124 * c-indent-level: 4
125 * c-brace-imaginary-offset: 0
126 * c-brace-offset: -4
127 * c-argdecl-indent: 4
128 * c-label-offset: -4
129 * c-continued-statement-offset: 4
130 * c-continued-brace-offset: 0
131 * indent-tabs-mode: nil
132 * tab-width: 8
133 * End:
134 */
diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c
deleted file mode 100644
index 47a61474e79..00000000000
--- a/block/partitions/ibm.c
+++ /dev/null
@@ -1,364 +0,0 @@
1/*
2 * Author(s)......: Holger Smolinski <Holger.Smolinski@de.ibm.com>
3 * Volker Sameske <sameske@de.ibm.com>
4 * Bugreports.to..: <Linux390@de.ibm.com>
5 * Copyright IBM Corp. 1999, 2012
6 */
7
8#include <linux/buffer_head.h>
9#include <linux/hdreg.h>
10#include <linux/slab.h>
11#include <asm/dasd.h>
12#include <asm/ebcdic.h>
13#include <asm/uaccess.h>
14#include <asm/vtoc.h>
15
16#include "check.h"
17#include "ibm.h"
18
19
20union label_t {
21 struct vtoc_volume_label_cdl vol;
22 struct vtoc_volume_label_ldl lnx;
23 struct vtoc_cms_label cms;
24};
25
26/*
27 * compute the block number from a
28 * cyl-cyl-head-head structure
29 */
30static sector_t cchh2blk(struct vtoc_cchh *ptr, struct hd_geometry *geo)
31{
32 sector_t cyl;
33 __u16 head;
34
35 /* decode cylinder and heads for large volumes */
36 cyl = ptr->hh & 0xFFF0;
37 cyl <<= 12;
38 cyl |= ptr->cc;
39 head = ptr->hh & 0x000F;
40 return cyl * geo->heads * geo->sectors +
41 head * geo->sectors;
42}
43
44/*
45 * compute the block number from a
46 * cyl-cyl-head-head-block structure
47 */
48static sector_t cchhb2blk(struct vtoc_cchhb *ptr, struct hd_geometry *geo)
49{
50 sector_t cyl;
51 __u16 head;
52
53 /* decode cylinder and heads for large volumes */
54 cyl = ptr->hh & 0xFFF0;
55 cyl <<= 12;
56 cyl |= ptr->cc;
57 head = ptr->hh & 0x000F;
58 return cyl * geo->heads * geo->sectors +
59 head * geo->sectors +
60 ptr->b;
61}
62
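A worked example of the decoding above (illustrative, not part of this patch), for a geometry of 15 heads and 12 sectors per track:

	/* ptr->cc = 0x0001, ptr->hh = 0x0003 */
	cyl  = ((0x0003 & 0xFFF0) << 12) | 0x0001 = 1
	head =   0x0003 & 0x000F                  = 3
	blk  = 1 * 15 * 12 + 3 * 12               = 216

For volumes with more than 65535 cylinders the upper 12 bits of the cylinder number are carried in hh: cc = 0x0000, hh = 0x0010 decodes to cyl = 0x10 << 12 = 65536, head = 0.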
63static int find_label(struct parsed_partitions *state,
64 dasd_information2_t *info,
65 struct hd_geometry *geo,
66 int blocksize,
67 sector_t *labelsect,
68 char name[],
69 char type[],
70 union label_t *label)
71{
72 Sector sect;
73 unsigned char *data;
74 sector_t testsect[3];
75 unsigned char temp[5];
76 int found = 0;
77 int i, testcount;
78
79 /* There are three places where we may find a valid label:
80 * - on an ECKD disk it's block 2
81 * - on an FBA disk it's block 1
82 * - on a CMS-formatted FBA disk it is sector 1, even if the block size
83 * is larger than 512 bytes (possible if the DIAG discipline is used)
84 * If we have a valid info structure, then we know exactly which case we
85 * have, otherwise we just search through all possibilities.
86 */
87 if (info) {
88 if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) ||
89 (info->cu_type == 0x3880 && info->dev_type == 0x3370))
90 testsect[0] = info->label_block;
91 else
92 testsect[0] = info->label_block * (blocksize >> 9);
93 testcount = 1;
94 } else {
95 testsect[0] = 1;
96 testsect[1] = (blocksize >> 9);
97 testsect[2] = 2 * (blocksize >> 9);
98 testcount = 3;
99 }
100 for (i = 0; i < testcount; ++i) {
101 data = read_part_sector(state, testsect[i], &sect);
102 if (data == NULL)
103 continue;
104 memcpy(label, data, sizeof(*label));
105 memcpy(temp, data, 4);
106 temp[4] = 0;
107 EBCASC(temp, 4);
108 put_dev_sector(sect);
109 if (!strcmp(temp, "VOL1") ||
110 !strcmp(temp, "LNX1") ||
111 !strcmp(temp, "CMS1")) {
112 if (!strcmp(temp, "VOL1")) {
113 strncpy(type, label->vol.vollbl, 4);
114 strncpy(name, label->vol.volid, 6);
115 } else {
116 strncpy(type, label->lnx.vollbl, 4);
117 strncpy(name, label->lnx.volid, 6);
118 }
119 EBCASC(type, 4);
120 EBCASC(name, 6);
121 *labelsect = testsect[i];
122 found = 1;
123 break;
124 }
125 }
126 if (!found)
127 memset(label, 0, sizeof(*label));
128
129 return found;
130}
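When no DASD info structure is available, the fallback search above probes the three candidate label locations named in the comment. With an assumed 4096-byte block size the candidate sector numbers work out as below (illustrative sketch only, not part of the driver):

#include <stdio.h>

int main(void)
{
    int blocksize = 4096;                 /* assumed for illustration        */
    long testsect[3];

    testsect[0] = 1;                      /* CMS label in sector 1 (DIAG)    */
    testsect[1] = blocksize >> 9;         /* block 1 on an FBA disk  -> 8    */
    testsect[2] = 2 * (blocksize >> 9);   /* block 2 on an ECKD disk -> 16   */

    printf("%ld %ld %ld\n", testsect[0], testsect[1], testsect[2]);
    return 0;
}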
131
132static int find_vol1_partitions(struct parsed_partitions *state,
133 struct hd_geometry *geo,
134 int blocksize,
135 char name[],
136 union label_t *label)
137{
138 sector_t blk;
139 int counter;
140 char tmp[64];
141 Sector sect;
142 unsigned char *data;
143 loff_t offset, size;
144 struct vtoc_format1_label f1;
145 int secperblk;
146
147 snprintf(tmp, sizeof(tmp), "VOL1/%8s:", name);
148 strlcat(state->pp_buf, tmp, PAGE_SIZE);
149 /*
150 * get start of VTOC from the disk label and then search for format1
151 * and format8 labels
152 */
153 secperblk = blocksize >> 9;
154 blk = cchhb2blk(&label->vol.vtoc, geo) + 1;
155 counter = 0;
156 data = read_part_sector(state, blk * secperblk, &sect);
157 while (data != NULL) {
158 memcpy(&f1, data, sizeof(struct vtoc_format1_label));
159 put_dev_sector(sect);
160		/* skip FMT4 / FMT5 / FMT7 / FMT9 labels */
161 if (f1.DS1FMTID == _ascebc['4']
162 || f1.DS1FMTID == _ascebc['5']
163 || f1.DS1FMTID == _ascebc['7']
164 || f1.DS1FMTID == _ascebc['9']) {
165 blk++;
166 data = read_part_sector(state, blk * secperblk, &sect);
167 continue;
168 }
169 /* only FMT1 and 8 labels valid at this point */
170 if (f1.DS1FMTID != _ascebc['1'] &&
171 f1.DS1FMTID != _ascebc['8'])
172 break;
173 /* OK, we got valid partition data */
174 offset = cchh2blk(&f1.DS1EXT1.llimit, geo);
175 size = cchh2blk(&f1.DS1EXT1.ulimit, geo) -
176 offset + geo->sectors;
177 offset *= secperblk;
178 size *= secperblk;
179 if (counter >= state->limit)
180 break;
181 put_partition(state, counter + 1, offset, size);
182 counter++;
183 blk++;
184 data = read_part_sector(state, blk * secperblk, &sect);
185 }
186 strlcat(state->pp_buf, "\n", PAGE_SIZE);
187
188 if (!data)
189 return -1;
190
191 return 1;
192}
193
194static int find_lnx1_partitions(struct parsed_partitions *state,
195 struct hd_geometry *geo,
196 int blocksize,
197 char name[],
198 union label_t *label,
199 sector_t labelsect,
200 loff_t i_size,
201 dasd_information2_t *info)
202{
203 loff_t offset, geo_size, size;
204 char tmp[64];
205 int secperblk;
206
207 snprintf(tmp, sizeof(tmp), "LNX1/%8s:", name);
208 strlcat(state->pp_buf, tmp, PAGE_SIZE);
209 secperblk = blocksize >> 9;
210 if (label->lnx.ldl_version == 0xf2) {
211 size = label->lnx.formatted_blocks * secperblk;
212 } else {
213 /*
214		 * Formatted without large volume support. If the sanity check
215 * 'size based on geo == size based on i_size' is true, then
216 * we can safely assume that we know the formatted size of
217 * the disk, otherwise we need additional information
218 * that we can only get from a real DASD device.
219 */
220 geo_size = geo->cylinders * geo->heads
221 * geo->sectors * secperblk;
222 size = i_size >> 9;
223 if (size != geo_size) {
224 if (!info) {
225 strlcat(state->pp_buf, "\n", PAGE_SIZE);
226 return 1;
227 }
228 if (!strcmp(info->type, "ECKD"))
229 if (geo_size < size)
230 size = geo_size;
231 /* else keep size based on i_size */
232 }
233 }
234 /* first and only partition starts in the first block after the label */
235 offset = labelsect + secperblk;
236 put_partition(state, 1, offset, size - offset);
237 strlcat(state->pp_buf, "\n", PAGE_SIZE);
238 return 1;
239}
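The sanity check in the non-large-volume branch compares a size derived from the reported geometry with one derived from i_size, both expressed in 512-byte sectors. A standalone sketch with invented numbers (geometry, block size and i_size are all assumptions):

#include <stdio.h>

int main(void)
{
    long long cylinders = 1000, heads = 15, sectors = 12;  /* assumed geometry  */
    int secperblk = 4096 >> 9;                             /* 4 KiB blocks -> 8 */
    long long i_size = 737280000;                          /* bytes, assumed    */

    long long geo_size = cylinders * heads * sectors * secperblk;
    long long size = i_size >> 9;                          /* 512-byte sectors  */

    printf("geo_size=%lld size=%lld match=%d\n", geo_size, size, geo_size == size);
    return 0;
}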
240
241static int find_cms1_partitions(struct parsed_partitions *state,
242 struct hd_geometry *geo,
243 int blocksize,
244 char name[],
245 union label_t *label,
246 sector_t labelsect)
247{
248 loff_t offset, size;
249 char tmp[64];
250 int secperblk;
251
252 /*
253 * VM style CMS1 labeled disk
254 */
255 blocksize = label->cms.block_size;
256 secperblk = blocksize >> 9;
257 if (label->cms.disk_offset != 0) {
258 snprintf(tmp, sizeof(tmp), "CMS1/%8s(MDSK):", name);
259 strlcat(state->pp_buf, tmp, PAGE_SIZE);
260 /* disk is reserved minidisk */
261 offset = label->cms.disk_offset * secperblk;
262 size = (label->cms.block_count - 1) * secperblk;
263 } else {
264 snprintf(tmp, sizeof(tmp), "CMS1/%8s:", name);
265 strlcat(state->pp_buf, tmp, PAGE_SIZE);
266 /*
267 * Special case for FBA devices:
268 * If an FBA device is CMS formatted with blocksize > 512 byte
269 * and the DIAG discipline is used, then the CMS label is found
270 * in sector 1 instead of block 1. However, the partition is
271 * still supposed to start in block 2.
272 */
273 if (labelsect == 1)
274 offset = 2 * secperblk;
275 else
276 offset = labelsect + secperblk;
277 size = label->cms.block_count * secperblk;
278 }
279
280 put_partition(state, 1, offset, size-offset);
281 strlcat(state->pp_buf, "\n", PAGE_SIZE);
282 return 1;
283}
284
285
286/*
287 * This is the main function, called by check.c
288 */
289int ibm_partition(struct parsed_partitions *state)
290{
291 struct block_device *bdev = state->bdev;
292 int blocksize, res;
293 loff_t i_size, offset, size;
294 dasd_information2_t *info;
295 struct hd_geometry *geo;
296 char type[5] = {0,};
297 char name[7] = {0,};
298 sector_t labelsect;
299 union label_t *label;
300
301 res = 0;
302 blocksize = bdev_logical_block_size(bdev);
303 if (blocksize <= 0)
304 goto out_exit;
305 i_size = i_size_read(bdev->bd_inode);
306 if (i_size == 0)
307 goto out_exit;
308 info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL);
309 if (info == NULL)
310 goto out_exit;
311 geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL);
312 if (geo == NULL)
313 goto out_nogeo;
314 label = kmalloc(sizeof(union label_t), GFP_KERNEL);
315 if (label == NULL)
316 goto out_nolab;
317 if (ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0)
318 goto out_freeall;
319 if (ioctl_by_bdev(bdev, BIODASDINFO2, (unsigned long)info) != 0) {
320 kfree(info);
321 info = NULL;
322 }
323
324 if (find_label(state, info, geo, blocksize, &labelsect, name, type,
325 label)) {
326 if (!strncmp(type, "VOL1", 4)) {
327 res = find_vol1_partitions(state, geo, blocksize, name,
328 label);
329 } else if (!strncmp(type, "LNX1", 4)) {
330 res = find_lnx1_partitions(state, geo, blocksize, name,
331 label, labelsect, i_size,
332 info);
333 } else if (!strncmp(type, "CMS1", 4)) {
334 res = find_cms1_partitions(state, geo, blocksize, name,
335 label, labelsect);
336 }
337 } else if (info) {
338 /*
339 * ugly but needed for backward compatibility:
340 * If the block device is a DASD (i.e. BIODASDINFO2 works),
341 * then we claim it in any case, even though it has no valid
342 * label. If it has the LDL format, then we simply define a
343 * partition as if it had an LNX1 label.
344 */
345 res = 1;
346 if (info->format == DASD_FORMAT_LDL) {
347 strlcat(state->pp_buf, "(nonl)", PAGE_SIZE);
348 size = i_size >> 9;
349 offset = (info->label_block + 1) * (blocksize >> 9);
350 put_partition(state, 1, offset, size-offset);
351 strlcat(state->pp_buf, "\n", PAGE_SIZE);
352 }
353 } else
354 res = 0;
355
356out_freeall:
357 kfree(label);
358out_nolab:
359 kfree(geo);
360out_nogeo:
361 kfree(info);
362out_exit:
363 return res;
364}
diff --git a/block/partitions/ibm.h b/block/partitions/ibm.h
deleted file mode 100644
index 08fb0804a81..00000000000
--- a/block/partitions/ibm.h
+++ /dev/null
@@ -1 +0,0 @@
1int ibm_partition(struct parsed_partitions *);
diff --git a/block/partitions/karma.c b/block/partitions/karma.c
deleted file mode 100644
index 0ea19312706..00000000000
--- a/block/partitions/karma.c
+++ /dev/null
@@ -1,57 +0,0 @@
1/*
2 * fs/partitions/karma.c
3 * Rio Karma partition info.
4 *
5 * Copyright (C) 2006 Bob Copeland (me@bobcopeland.com)
6 * based on osf.c
7 */
8
9#include "check.h"
10#include "karma.h"
11
12int karma_partition(struct parsed_partitions *state)
13{
14 int i;
15 int slot = 1;
16 Sector sect;
17 unsigned char *data;
18 struct disklabel {
19 u8 d_reserved[270];
20 struct d_partition {
21 __le32 p_res;
22 u8 p_fstype;
23 u8 p_res2[3];
24 __le32 p_offset;
25 __le32 p_size;
26 } d_partitions[2];
27 u8 d_blank[208];
28 __le16 d_magic;
29 } __attribute__((packed)) *label;
30 struct d_partition *p;
31
32 data = read_part_sector(state, 0, &sect);
33 if (!data)
34 return -1;
35
36 label = (struct disklabel *)data;
37 if (le16_to_cpu(label->d_magic) != KARMA_LABEL_MAGIC) {
38 put_dev_sector(sect);
39 return 0;
40 }
41
42 p = label->d_partitions;
43 for (i = 0 ; i < 2; i++, p++) {
44 if (slot == state->limit)
45 break;
46
47 if (p->p_fstype == 0x4d && le32_to_cpu(p->p_size)) {
48 put_partition(state, slot, le32_to_cpu(p->p_offset),
49 le32_to_cpu(p->p_size));
50 }
51 slot++;
52 }
53 strlcat(state->pp_buf, "\n", PAGE_SIZE);
54 put_dev_sector(sect);
55 return 1;
56}
57
diff --git a/block/partitions/karma.h b/block/partitions/karma.h
deleted file mode 100644
index c764b2e9df2..00000000000
--- a/block/partitions/karma.h
+++ /dev/null
@@ -1,8 +0,0 @@
1/*
2 * fs/partitions/karma.h
3 */
4
5#define KARMA_LABEL_MAGIC 0xAB56
6
7int karma_partition(struct parsed_partitions *state);
8
diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c
deleted file mode 100644
index e507cfbd044..00000000000
--- a/block/partitions/ldm.c
+++ /dev/null
@@ -1,1567 +0,0 @@
1/**
2 * ldm - Support for Windows Logical Disk Manager (Dynamic Disks)
3 *
4 * Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org>
5 * Copyright (c) 2001-2012 Anton Altaparmakov
6 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
7 *
8 * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads
9 *
10 * This program is free software; you can redistribute it and/or modify it under
11 * the terms of the GNU General Public License as published by the Free Software
12 * Foundation; either version 2 of the License, or (at your option) any later
13 * version.
14 *
15 * This program is distributed in the hope that it will be useful, but WITHOUT
16 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License along with
21 * this program (in the main directory of the source in the file COPYING); if
22 * not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
23 * Boston, MA 02111-1307 USA
24 */
25
26#include <linux/slab.h>
27#include <linux/pagemap.h>
28#include <linux/stringify.h>
29#include <linux/kernel.h>
30#include "ldm.h"
31#include "check.h"
32#include "msdos.h"
33
34/**
35 * ldm_debug/info/error/crit - Output an error message
36 * @f: A printf format string containing the message
37 * @...: Variables to substitute into @f
38 *
39 * ldm_debug() writes a DEBUG level message to the syslog but only if the
40 * driver was compiled with debug enabled. Otherwise, the call turns into a NOP.
41 */
42#ifndef CONFIG_LDM_DEBUG
43#define ldm_debug(...) do {} while (0)
44#else
45#define ldm_debug(f, a...) _ldm_printk (KERN_DEBUG, __func__, f, ##a)
46#endif
47
48#define ldm_crit(f, a...) _ldm_printk (KERN_CRIT, __func__, f, ##a)
49#define ldm_error(f, a...) _ldm_printk (KERN_ERR, __func__, f, ##a)
50#define ldm_info(f, a...) _ldm_printk (KERN_INFO, __func__, f, ##a)
51
52static __printf(3, 4)
53void _ldm_printk(const char *level, const char *function, const char *fmt, ...)
54{
55 struct va_format vaf;
56 va_list args;
57
58 va_start (args, fmt);
59
60 vaf.fmt = fmt;
61 vaf.va = &args;
62
63 printk("%s%s(): %pV\n", level, function, &vaf);
64
65 va_end(args);
66}
67
68/**
69 * ldm_parse_hexbyte - Convert an ASCII hex number to a byte
70 * @src: Pointer to at least 2 characters to convert.
71 *
72 * Convert a two character ASCII hex string to a number.
73 *
74 * Return: 0-255 Success, the byte was parsed correctly
75 * -1 Error, an invalid character was supplied
76 */
77static int ldm_parse_hexbyte (const u8 *src)
78{
79 unsigned int x; /* For correct wrapping */
80 int h;
81
82 /* high part */
83 x = h = hex_to_bin(src[0]);
84 if (h < 0)
85 return -1;
86
87 /* low part */
88 h = hex_to_bin(src[1]);
89 if (h < 0)
90 return -1;
91
92 return (x << 4) + h;
93}
94
95/**
96 * ldm_parse_guid - Convert GUID from ASCII to binary
97 * @src: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba
98 * @dest: Memory block to hold binary GUID (16 bytes)
99 *
100 * N.B. The GUID need not be NULL terminated.
101 *
102 * Return: 'true' @dest contains binary GUID
103 * 'false' @dest contents are undefined
104 */
105static bool ldm_parse_guid (const u8 *src, u8 *dest)
106{
107 static const int size[] = { 4, 2, 2, 2, 6 };
108 int i, j, v;
109
110 if (src[8] != '-' || src[13] != '-' ||
111 src[18] != '-' || src[23] != '-')
112 return false;
113
114 for (j = 0; j < 5; j++, src++)
115 for (i = 0; i < size[j]; i++, src+=2, *dest++ = v)
116 if ((v = ldm_parse_hexbyte (src)) < 0)
117 return false;
118
119 return true;
120}
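As an aside, the textual GUID parsed above is five dash-separated groups of 4, 2, 2, 2 and 6 bytes. A hypothetical userspace snippet, using plain sscanf() rather than the kernel's hex_to_bin(), that flattens the example GUID from the kerneldoc into its 16 binary bytes:

#include <stdio.h>

int main(void)
{
    const char *guid = "fa50ff2b-f2e8-45de-83fa-65417f2f49ba"; /* example from the comment */
    unsigned char out[16];
    unsigned int byte;
    int i, n = 0;

    for (i = 0; guid[i] && n < 16; ) {
        if (guid[i] == '-') {          /* group separators carry no data */
            i++;
            continue;
        }
        if (sscanf(guid + i, "%2x", &byte) != 1)
            return 1;
        out[n++] = (unsigned char)byte;
        i += 2;
    }
    for (i = 0; i < n; i++)
        printf("%02x", out[i]);        /* the 16 bytes ldm_parse_guid() would produce */
    printf("\n");
    return 0;
}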
121
122/**
123 * ldm_parse_privhead - Read the LDM Database PRIVHEAD structure
124 * @data: Raw database PRIVHEAD structure loaded from the device
125 * @ph: In-memory privhead structure in which to return parsed information
126 *
127 * This parses the LDM database PRIVHEAD structure supplied in @data and
128 * sets up the in-memory privhead structure @ph with the obtained information.
129 *
130 * Return: 'true' @ph contains the PRIVHEAD data
131 * 'false' @ph contents are undefined
132 */
133static bool ldm_parse_privhead(const u8 *data, struct privhead *ph)
134{
135 bool is_vista = false;
136
137 BUG_ON(!data || !ph);
138 if (MAGIC_PRIVHEAD != get_unaligned_be64(data)) {
139 ldm_error("Cannot find PRIVHEAD structure. LDM database is"
140 " corrupt. Aborting.");
141 return false;
142 }
143 ph->ver_major = get_unaligned_be16(data + 0x000C);
144 ph->ver_minor = get_unaligned_be16(data + 0x000E);
145 ph->logical_disk_start = get_unaligned_be64(data + 0x011B);
146 ph->logical_disk_size = get_unaligned_be64(data + 0x0123);
147 ph->config_start = get_unaligned_be64(data + 0x012B);
148 ph->config_size = get_unaligned_be64(data + 0x0133);
149 /* Version 2.11 is Win2k/XP and version 2.12 is Vista. */
150 if (ph->ver_major == 2 && ph->ver_minor == 12)
151 is_vista = true;
152 if (!is_vista && (ph->ver_major != 2 || ph->ver_minor != 11)) {
153 ldm_error("Expected PRIVHEAD version 2.11 or 2.12, got %d.%d."
154 " Aborting.", ph->ver_major, ph->ver_minor);
155 return false;
156 }
157 ldm_debug("PRIVHEAD version %d.%d (Windows %s).", ph->ver_major,
158 ph->ver_minor, is_vista ? "Vista" : "2000/XP");
159 if (ph->config_size != LDM_DB_SIZE) { /* 1 MiB in sectors. */
160 /* Warn the user and continue, carefully. */
161 ldm_info("Database is normally %u bytes, it claims to "
162 "be %llu bytes.", LDM_DB_SIZE,
163 (unsigned long long)ph->config_size);
164 }
165 if ((ph->logical_disk_size == 0) || (ph->logical_disk_start +
166 ph->logical_disk_size > ph->config_start)) {
167 ldm_error("PRIVHEAD disk size doesn't match real disk size");
168 return false;
169 }
170 if (!ldm_parse_guid(data + 0x0030, ph->disk_id)) {
171 ldm_error("PRIVHEAD contains an invalid GUID.");
172 return false;
173 }
174 ldm_debug("Parsed PRIVHEAD successfully.");
175 return true;
176}
177
178/**
179 * ldm_parse_tocblock - Read the LDM Database TOCBLOCK structure
180 * @data: Raw database TOCBLOCK structure loaded from the device
181 * @toc: In-memory toc structure in which to return parsed information
182 *
183 * This parses the LDM Database TOCBLOCK (table of contents) structure supplied
184 * in @data and sets up the in-memory tocblock structure @toc with the obtained
185 * information.
186 *
187 * N.B. The *_start and *_size values returned in @toc are not range-checked.
188 *
189 * Return: 'true' @toc contains the TOCBLOCK data
190 * 'false' @toc contents are undefined
191 */
192static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc)
193{
194 BUG_ON (!data || !toc);
195
196 if (MAGIC_TOCBLOCK != get_unaligned_be64(data)) {
197 ldm_crit ("Cannot find TOCBLOCK, database may be corrupt.");
198 return false;
199 }
200 strncpy (toc->bitmap1_name, data + 0x24, sizeof (toc->bitmap1_name));
201 toc->bitmap1_name[sizeof (toc->bitmap1_name) - 1] = 0;
202 toc->bitmap1_start = get_unaligned_be64(data + 0x2E);
203 toc->bitmap1_size = get_unaligned_be64(data + 0x36);
204
205 if (strncmp (toc->bitmap1_name, TOC_BITMAP1,
206 sizeof (toc->bitmap1_name)) != 0) {
207 ldm_crit ("TOCBLOCK's first bitmap is '%s', should be '%s'.",
208 TOC_BITMAP1, toc->bitmap1_name);
209 return false;
210 }
211 strncpy (toc->bitmap2_name, data + 0x46, sizeof (toc->bitmap2_name));
212 toc->bitmap2_name[sizeof (toc->bitmap2_name) - 1] = 0;
213 toc->bitmap2_start = get_unaligned_be64(data + 0x50);
214 toc->bitmap2_size = get_unaligned_be64(data + 0x58);
215 if (strncmp (toc->bitmap2_name, TOC_BITMAP2,
216 sizeof (toc->bitmap2_name)) != 0) {
217 ldm_crit ("TOCBLOCK's second bitmap is '%s', should be '%s'.",
218 TOC_BITMAP2, toc->bitmap2_name);
219 return false;
220 }
221 ldm_debug ("Parsed TOCBLOCK successfully.");
222 return true;
223}
224
225/**
226 * ldm_parse_vmdb - Read the LDM Database VMDB structure
227 * @data: Raw database VMDB structure loaded from the device
228 * @vm: In-memory vmdb structure in which to return parsed information
229 *
230 * This parses the LDM Database VMDB structure supplied in @data and sets up
231 * the in-memory vmdb structure @vm with the obtained information.
232 *
233 * N.B. The *_start, *_size and *_seq values will be range-checked later.
234 *
235 * Return: 'true' @vm contains VMDB info
236 * 'false' @vm contents are undefined
237 */
238static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm)
239{
240 BUG_ON (!data || !vm);
241
242 if (MAGIC_VMDB != get_unaligned_be32(data)) {
243 ldm_crit ("Cannot find the VMDB, database may be corrupt.");
244 return false;
245 }
246
247 vm->ver_major = get_unaligned_be16(data + 0x12);
248 vm->ver_minor = get_unaligned_be16(data + 0x14);
249 if ((vm->ver_major != 4) || (vm->ver_minor != 10)) {
250 ldm_error ("Expected VMDB version %d.%d, got %d.%d. "
251 "Aborting.", 4, 10, vm->ver_major, vm->ver_minor);
252 return false;
253 }
254
255 vm->vblk_size = get_unaligned_be32(data + 0x08);
256 if (vm->vblk_size == 0) {
257 ldm_error ("Illegal VBLK size");
258 return false;
259 }
260
261 vm->vblk_offset = get_unaligned_be32(data + 0x0C);
262 vm->last_vblk_seq = get_unaligned_be32(data + 0x04);
263
264 ldm_debug ("Parsed VMDB successfully.");
265 return true;
266}
267
268/**
269 * ldm_compare_privheads - Compare two privhead objects
270 * @ph1: First privhead
271 * @ph2: Second privhead
272 *
273 * This compares the two privhead structures @ph1 and @ph2.
274 *
275 * Return: 'true' Identical
276 * 'false' Different
277 */
278static bool ldm_compare_privheads (const struct privhead *ph1,
279 const struct privhead *ph2)
280{
281 BUG_ON (!ph1 || !ph2);
282
283 return ((ph1->ver_major == ph2->ver_major) &&
284 (ph1->ver_minor == ph2->ver_minor) &&
285 (ph1->logical_disk_start == ph2->logical_disk_start) &&
286 (ph1->logical_disk_size == ph2->logical_disk_size) &&
287 (ph1->config_start == ph2->config_start) &&
288 (ph1->config_size == ph2->config_size) &&
289 !memcmp (ph1->disk_id, ph2->disk_id, GUID_SIZE));
290}
291
292/**
293 * ldm_compare_tocblocks - Compare two tocblock objects
294 * @toc1: First toc
295 * @toc2: Second toc
296 *
297 * This compares the two tocblock structures @toc1 and @toc2.
298 *
299 * Return: 'true' Identical
300 * 'false' Different
301 */
302static bool ldm_compare_tocblocks (const struct tocblock *toc1,
303 const struct tocblock *toc2)
304{
305 BUG_ON (!toc1 || !toc2);
306
307 return ((toc1->bitmap1_start == toc2->bitmap1_start) &&
308 (toc1->bitmap1_size == toc2->bitmap1_size) &&
309 (toc1->bitmap2_start == toc2->bitmap2_start) &&
310 (toc1->bitmap2_size == toc2->bitmap2_size) &&
311 !strncmp (toc1->bitmap1_name, toc2->bitmap1_name,
312 sizeof (toc1->bitmap1_name)) &&
313 !strncmp (toc1->bitmap2_name, toc2->bitmap2_name,
314 sizeof (toc1->bitmap2_name)));
315}
316
317/**
318 * ldm_validate_privheads - Compare the primary privhead with its backups
319 * @state: Partition check state including device holding the LDM Database
320 * @ph1: Memory struct to fill with ph contents
321 *
322 * Read and compare all three privheads from disk.
323 *
324 * The privheads on disk show the size and location of the main disk area and
325 * the configuration area (the database). The values are range-checked against
326 * the real size of the disk reported by @state->bdev.
327 *
328 * Return: 'true' Success
329 * 'false' Error
330 */
331static bool ldm_validate_privheads(struct parsed_partitions *state,
332 struct privhead *ph1)
333{
334 static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 };
335 struct privhead *ph[3] = { ph1 };
336 Sector sect;
337 u8 *data;
338 bool result = false;
339 long num_sects;
340 int i;
341
342 BUG_ON (!state || !ph1);
343
344 ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL);
345 ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL);
346 if (!ph[1] || !ph[2]) {
347 ldm_crit ("Out of memory.");
348 goto out;
349 }
350
351 /* off[1 & 2] are relative to ph[0]->config_start */
352 ph[0]->config_start = 0;
353
354 /* Read and parse privheads */
355 for (i = 0; i < 3; i++) {
356 data = read_part_sector(state, ph[0]->config_start + off[i],
357 &sect);
358 if (!data) {
359 ldm_crit ("Disk read failed.");
360 goto out;
361 }
362 result = ldm_parse_privhead (data, ph[i]);
363 put_dev_sector (sect);
364 if (!result) {
365 ldm_error ("Cannot find PRIVHEAD %d.", i+1); /* Log again */
366 if (i < 2)
367 goto out; /* Already logged */
368 else
369 break; /* FIXME ignore for now, 3rd PH can fail on odd-sized disks */
370 }
371 }
372
373 num_sects = state->bdev->bd_inode->i_size >> 9;
374
375 if ((ph[0]->config_start > num_sects) ||
376 ((ph[0]->config_start + ph[0]->config_size) > num_sects)) {
377 ldm_crit ("Database extends beyond the end of the disk.");
378 goto out;
379 }
380
381 if ((ph[0]->logical_disk_start > ph[0]->config_start) ||
382 ((ph[0]->logical_disk_start + ph[0]->logical_disk_size)
383 > ph[0]->config_start)) {
384 ldm_crit ("Disk and database overlap.");
385 goto out;
386 }
387
388 if (!ldm_compare_privheads (ph[0], ph[1])) {
389 ldm_crit ("Primary and backup PRIVHEADs don't match.");
390 goto out;
391 }
392 /* FIXME ignore this for now
393 if (!ldm_compare_privheads (ph[0], ph[2])) {
394 ldm_crit ("Primary and backup PRIVHEADs don't match.");
395 goto out;
396 }*/
397 ldm_debug ("Validated PRIVHEADs successfully.");
398 result = true;
399out:
400 kfree (ph[1]);
401 kfree (ph[2]);
402 return result;
403}
404
405/**
406 * ldm_validate_tocblocks - Validate the table of contents and its backups
407 * @state: Partition check state including device holding the LDM Database
408 * @base: Offset, into @state->bdev, of the database
409 * @ldb: Cache of the database structures
410 *
411 * Find and compare the four tables of contents of the LDM Database stored on
412 * @state->bdev and return the parsed information into @toc1.
413 *
414 * The offsets and sizes of the configs are range-checked against a privhead.
415 *
416 * Return: 'true' @toc1 contains validated TOCBLOCK info
417 * 'false' @toc1 contents are undefined
418 */
419static bool ldm_validate_tocblocks(struct parsed_partitions *state,
420 unsigned long base, struct ldmdb *ldb)
421{
422 static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4};
423 struct tocblock *tb[4];
424 struct privhead *ph;
425 Sector sect;
426 u8 *data;
427 int i, nr_tbs;
428 bool result = false;
429
430 BUG_ON(!state || !ldb);
431 ph = &ldb->ph;
432 tb[0] = &ldb->toc;
433 tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL);
434 if (!tb[1]) {
435 ldm_crit("Out of memory.");
436 goto err;
437 }
438 tb[2] = (struct tocblock*)((u8*)tb[1] + sizeof(*tb[1]));
439 tb[3] = (struct tocblock*)((u8*)tb[2] + sizeof(*tb[2]));
440 /*
441 * Try to read and parse all four TOCBLOCKs.
442 *
443 * Windows Vista LDM v2.12 does not always have all four TOCBLOCKs so
444 * skip any that fail as long as we get at least one valid TOCBLOCK.
445 */
446 for (nr_tbs = i = 0; i < 4; i++) {
447 data = read_part_sector(state, base + off[i], &sect);
448 if (!data) {
449 ldm_error("Disk read failed for TOCBLOCK %d.", i);
450 continue;
451 }
452 if (ldm_parse_tocblock(data, tb[nr_tbs]))
453 nr_tbs++;
454 put_dev_sector(sect);
455 }
456 if (!nr_tbs) {
457 ldm_crit("Failed to find a valid TOCBLOCK.");
458 goto err;
459 }
460 /* Range check the TOCBLOCK against a privhead. */
461 if (((tb[0]->bitmap1_start + tb[0]->bitmap1_size) > ph->config_size) ||
462 ((tb[0]->bitmap2_start + tb[0]->bitmap2_size) >
463 ph->config_size)) {
464 ldm_crit("The bitmaps are out of range. Giving up.");
465 goto err;
466 }
467 /* Compare all loaded TOCBLOCKs. */
468 for (i = 1; i < nr_tbs; i++) {
469 if (!ldm_compare_tocblocks(tb[0], tb[i])) {
470 ldm_crit("TOCBLOCKs 0 and %d do not match.", i);
471 goto err;
472 }
473 }
474 ldm_debug("Validated %d TOCBLOCKs successfully.", nr_tbs);
475 result = true;
476err:
477 kfree(tb[1]);
478 return result;
479}
480
481/**
482 * ldm_validate_vmdb - Read the VMDB and validate it
483 * @state: Partition check state including device holding the LDM Database
484 * @base: Offset, into @bdev, of the database
485 * @ldb: Cache of the database structures
486 *
487 * Find the vmdb of the LDM Database stored on @bdev and return the parsed
488 * information in @ldb.
489 *
490 * Return: 'true' @ldb contains validated VMDB info
491 * 'false' @ldb contents are undefined
492 */
493static bool ldm_validate_vmdb(struct parsed_partitions *state,
494 unsigned long base, struct ldmdb *ldb)
495{
496 Sector sect;
497 u8 *data;
498 bool result = false;
499 struct vmdb *vm;
500 struct tocblock *toc;
501
502 BUG_ON (!state || !ldb);
503
504 vm = &ldb->vm;
505 toc = &ldb->toc;
506
507 data = read_part_sector(state, base + OFF_VMDB, &sect);
508 if (!data) {
509 ldm_crit ("Disk read failed.");
510 return false;
511 }
512
513 if (!ldm_parse_vmdb (data, vm))
514 goto out; /* Already logged */
515
516 /* Are there uncommitted transactions? */
517 if (get_unaligned_be16(data + 0x10) != 0x01) {
518 ldm_crit ("Database is not in a consistent state. Aborting.");
519 goto out;
520 }
521
522 if (vm->vblk_offset != 512)
523 ldm_info ("VBLKs start at offset 0x%04x.", vm->vblk_offset);
524
525 /*
526	 * The last_vblk_seq can be before the end of the vmdb, just make sure
527 * it is not out of bounds.
528 */
529 if ((vm->vblk_size * vm->last_vblk_seq) > (toc->bitmap1_size << 9)) {
530 ldm_crit ("VMDB exceeds allowed size specified by TOCBLOCK. "
531 "Database is corrupt. Aborting.");
532 goto out;
533 }
534
535 result = true;
536out:
537 put_dev_sector (sect);
538 return result;
539}
540
541
542/**
543 * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk
544 * @state: Partition check state including device holding the LDM Database
545 *
546 * This function provides a weak test to decide whether the device is a dynamic
547 * disk or not. It looks for an MS-DOS-style partition table containing at
548 * least one partition of type 0x42 (formerly SFS, now used by Windows for
549 * dynamic disks).
550 *
551 * N.B. The only possible error can come from read_part_sector, and that is
552 * only likely to happen if the underlying device is strange. If that IS
553 * the case we should return zero to let someone else try.
554 *
555 * Return: 'true' @state->bdev is a dynamic disk
556 * 'false' @state->bdev is not a dynamic disk, or an error occurred
557 */
558static bool ldm_validate_partition_table(struct parsed_partitions *state)
559{
560 Sector sect;
561 u8 *data;
562 struct partition *p;
563 int i;
564 bool result = false;
565
566 BUG_ON(!state);
567
568 data = read_part_sector(state, 0, &sect);
569 if (!data) {
570 ldm_info ("Disk read failed.");
571 return false;
572 }
573
574 if (*(__le16*) (data + 0x01FE) != cpu_to_le16 (MSDOS_LABEL_MAGIC))
575 goto out;
576
577 p = (struct partition*)(data + 0x01BE);
578 for (i = 0; i < 4; i++, p++)
579 if (SYS_IND (p) == LDM_PARTITION) {
580 result = true;
581 break;
582 }
583
584 if (result)
585 ldm_debug ("Found W2K dynamic disk partition type.");
586
587out:
588 put_dev_sector (sect);
589 return result;
590}
591
592/**
593 * ldm_get_disk_objid - Search a linked list of vblk's for a given Disk Id
594 * @ldb: Cache of the database structures
595 *
596 * The LDM Database contains a list of all partitions on all dynamic disks.
597 * The primary PRIVHEAD, at the beginning of the physical disk, tells us
598 * the GUID of this disk. This function searches for the GUID in a linked
599 * list of vblk's.
600 *
601 * Return: Pointer, A matching vblk was found
602 * NULL, No match, or an error
603 */
604static struct vblk * ldm_get_disk_objid (const struct ldmdb *ldb)
605{
606 struct list_head *item;
607
608 BUG_ON (!ldb);
609
610 list_for_each (item, &ldb->v_disk) {
611 struct vblk *v = list_entry (item, struct vblk, list);
612 if (!memcmp (v->vblk.disk.disk_id, ldb->ph.disk_id, GUID_SIZE))
613 return v;
614 }
615
616 return NULL;
617}
618
619/**
620 * ldm_create_data_partitions - Create data partitions for this device
621 * @pp: List of the partitions parsed so far
622 * @ldb: Cache of the database structures
623 *
624 * The database contains ALL the partitions for ALL disk groups, so we need to
625 * filter out this specific disk. Using the disk's object id, we can find all
626 * the partitions in the database that belong to this disk.
627 *
628 * Add each partition in our database, to the parsed_partitions structure.
629 *
630 * N.B. This function creates the partitions in the order it finds partition
631 * objects in the linked list.
632 *
633 * Return: 'true' Partition created
634 * 'false' Error, probably a range checking problem
635 */
636static bool ldm_create_data_partitions (struct parsed_partitions *pp,
637 const struct ldmdb *ldb)
638{
639 struct list_head *item;
640 struct vblk *vb;
641 struct vblk *disk;
642 struct vblk_part *part;
643 int part_num = 1;
644
645 BUG_ON (!pp || !ldb);
646
647 disk = ldm_get_disk_objid (ldb);
648 if (!disk) {
649 ldm_crit ("Can't find the ID of this disk in the database.");
650 return false;
651 }
652
653 strlcat(pp->pp_buf, " [LDM]", PAGE_SIZE);
654
655 /* Create the data partitions */
656 list_for_each (item, &ldb->v_part) {
657 vb = list_entry (item, struct vblk, list);
658 part = &vb->vblk.part;
659
660 if (part->disk_id != disk->obj_id)
661 continue;
662
663 put_partition (pp, part_num, ldb->ph.logical_disk_start +
664 part->start, part->size);
665 part_num++;
666 }
667
668 strlcat(pp->pp_buf, "\n", PAGE_SIZE);
669 return true;
670}
671
672
673/**
674 * ldm_relative - Calculate the next relative offset
675 * @buffer: Block of data being worked on
676 * @buflen: Size of the block of data
677 * @base: Size of the previous fixed width fields
678 * @offset: Cumulative size of the previous variable-width fields
679 *
680 * Because many of the VBLK fields are variable-width, it's necessary
681 * to calculate each offset based on the previous one and the length
682 * of the field it pointed to.
683 *
684 * Return: -1 Error, the calculated offset exceeded the size of the buffer
685 * n OK, a range-checked offset into buffer
686 */
687static int ldm_relative(const u8 *buffer, int buflen, int base, int offset)
688{
689
690 base += offset;
691 if (!buffer || offset < 0 || base > buflen) {
692 if (!buffer)
693 ldm_error("!buffer");
694 if (offset < 0)
695 ldm_error("offset (%d) < 0", offset);
696 if (base > buflen)
697 ldm_error("base (%d) > buflen (%d)", base, buflen);
698 return -1;
699 }
700 if (base + buffer[base] >= buflen) {
701 ldm_error("base (%d) + buffer[base] (%d) >= buflen (%d)", base,
702 buffer[base], buflen);
703 return -1;
704 }
705 return buffer[base] + offset + 1;
706}
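To make the offset chaining concrete: every variable-width VBLK field is a one-byte length followed by that many payload bytes, and each call returns the position just past the field so it can feed the next call. A toy userspace rendition over an invented buffer (the real callers pass fixed header sizes such as 0x18 as base):

#include <stdio.h>

static int relative(const unsigned char *buf, int buflen, int base, int offset)
{
    base += offset;
    if (!buf || offset < 0 || base > buflen || base + buf[base] >= buflen)
        return -1;                       /* out of range */
    return buf[base] + offset + 1;       /* offset just past this field */
}

int main(void)
{
    /* two fields at base 0: a 3-byte payload, then a 2-byte payload */
    unsigned char buf[] = { 3, 'a', 'b', 'c', 2, 'x', 'y', 0 };
    int r1 = relative(buf, sizeof(buf), 0, 0);    /* -> 4, the second length byte */
    int r2 = relative(buf, sizeof(buf), 0, r1);   /* -> 7, just past second field */

    printf("%d %d\n", r1, r2);
    return 0;
}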
707
708/**
709 * ldm_get_vnum - Convert a variable-width, big endian number, into cpu order
710 * @block: Pointer to the variable-width number to convert
711 *
712 * Large numbers in the LDM Database are often stored in a packed format. Each
713 * number is prefixed by a one byte width marker. All numbers in the database
714 * are stored in big-endian byte order. This function reads one of these
715 * numbers and returns the result.
716 *
717 * N.B. This function DOES NOT perform any range checking, though the most
718 * it will read is eight bytes.
719 *
720 * Return: n A number
721 * 0 Zero, or an error occurred
722 */
723static u64 ldm_get_vnum (const u8 *block)
724{
725 u64 tmp = 0;
726 u8 length;
727
728 BUG_ON (!block);
729
730 length = *block++;
731
732 if (length && length <= 8)
733 while (length--)
734 tmp = (tmp << 8) | *block++;
735 else
736 ldm_error ("Illegal length %d.", length);
737
738 return tmp;
739}
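A toy decoder for the same width-prefixed, big-endian packing, fed an invented three-byte input (width marker 2, payload 0x01 0x00):

#include <stdio.h>

static unsigned long long get_vnum(const unsigned char *block)
{
    unsigned long long tmp = 0;
    unsigned char length = *block++;     /* one-byte width marker */

    if (length == 0 || length > 8)
        return 0;                        /* the driver logs an error here */
    while (length--)
        tmp = (tmp << 8) | *block++;     /* accumulate big-endian bytes */
    return tmp;
}

int main(void)
{
    unsigned char num[] = { 0x02, 0x01, 0x00 };
    printf("%llu\n", get_vnum(num));     /* prints 256 */
    return 0;
}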
740
741/**
742 * ldm_get_vstr - Read a length-prefixed string into a buffer
743 * @block: Pointer to the length marker
744 * @buffer: Location to copy string to
745 * @buflen: Size of the output buffer
746 *
747 * Many of the strings in the LDM Database are not NULL terminated. Instead
748 * they are prefixed by a one byte length marker. This function copies one of
749 * these strings into a buffer.
750 *
751 * N.B. This function DOES NOT perform any range checking on the input.
752 * If the buffer is too small, the output will be truncated.
753 *
754 * Return: 0, Error and @buffer contents are undefined
755 * n, String length in characters (excluding NULL)
756 * buflen-1, String was truncated.
757 */
758static int ldm_get_vstr (const u8 *block, u8 *buffer, int buflen)
759{
760 int length;
761
762 BUG_ON (!block || !buffer);
763
764 length = block[0];
765 if (length >= buflen) {
766 ldm_error ("Truncating string %d -> %d.", length, buflen);
767 length = buflen - 1;
768 }
769 memcpy (buffer, block + 1, length);
770 buffer[length] = 0;
771 return length;
772}
773
774
775/**
776 * ldm_parse_cmp3 - Read a raw VBLK Component object into a vblk structure
777 * @buffer: Block of data being worked on
778 * @buflen: Size of the block of data
779 * @vb: In-memory vblk in which to return information
780 *
781 * Read a raw VBLK Component object (version 3) into a vblk structure.
782 *
783 * Return: 'true' @vb contains a Component VBLK
784 * 'false' @vb contents are not defined
785 */
786static bool ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb)
787{
788 int r_objid, r_name, r_vstate, r_child, r_parent, r_stripe, r_cols, len;
789 struct vblk_comp *comp;
790
791 BUG_ON (!buffer || !vb);
792
793 r_objid = ldm_relative (buffer, buflen, 0x18, 0);
794 r_name = ldm_relative (buffer, buflen, 0x18, r_objid);
795 r_vstate = ldm_relative (buffer, buflen, 0x18, r_name);
796 r_child = ldm_relative (buffer, buflen, 0x1D, r_vstate);
797 r_parent = ldm_relative (buffer, buflen, 0x2D, r_child);
798
799 if (buffer[0x12] & VBLK_FLAG_COMP_STRIPE) {
800 r_stripe = ldm_relative (buffer, buflen, 0x2E, r_parent);
801 r_cols = ldm_relative (buffer, buflen, 0x2E, r_stripe);
802 len = r_cols;
803 } else {
804 r_stripe = 0;
805 r_cols = 0;
806 len = r_parent;
807 }
808 if (len < 0)
809 return false;
810
811 len += VBLK_SIZE_CMP3;
812 if (len != get_unaligned_be32(buffer + 0x14))
813 return false;
814
815 comp = &vb->vblk.comp;
816 ldm_get_vstr (buffer + 0x18 + r_name, comp->state,
817 sizeof (comp->state));
818 comp->type = buffer[0x18 + r_vstate];
819 comp->children = ldm_get_vnum (buffer + 0x1D + r_vstate);
820 comp->parent_id = ldm_get_vnum (buffer + 0x2D + r_child);
821 comp->chunksize = r_stripe ? ldm_get_vnum (buffer+r_parent+0x2E) : 0;
822
823 return true;
824}
825
826/**
827 * ldm_parse_dgr3 - Read a raw VBLK Disk Group object into a vblk structure
828 * @buffer: Block of data being worked on
829 * @buflen: Size of the block of data
830 * @vb: In-memory vblk in which to return information
831 *
832 * Read a raw VBLK Disk Group object (version 3) into a vblk structure.
833 *
834 * Return: 'true' @vb contains a Disk Group VBLK
835 * 'false' @vb contents are not defined
836 */
837static int ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb)
838{
839 int r_objid, r_name, r_diskid, r_id1, r_id2, len;
840 struct vblk_dgrp *dgrp;
841
842 BUG_ON (!buffer || !vb);
843
844 r_objid = ldm_relative (buffer, buflen, 0x18, 0);
845 r_name = ldm_relative (buffer, buflen, 0x18, r_objid);
846 r_diskid = ldm_relative (buffer, buflen, 0x18, r_name);
847
848 if (buffer[0x12] & VBLK_FLAG_DGR3_IDS) {
849 r_id1 = ldm_relative (buffer, buflen, 0x24, r_diskid);
850 r_id2 = ldm_relative (buffer, buflen, 0x24, r_id1);
851 len = r_id2;
852 } else {
853 r_id1 = 0;
854 r_id2 = 0;
855 len = r_diskid;
856 }
857 if (len < 0)
858 return false;
859
860 len += VBLK_SIZE_DGR3;
861 if (len != get_unaligned_be32(buffer + 0x14))
862 return false;
863
864 dgrp = &vb->vblk.dgrp;
865 ldm_get_vstr (buffer + 0x18 + r_name, dgrp->disk_id,
866 sizeof (dgrp->disk_id));
867 return true;
868}
869
870/**
871 * ldm_parse_dgr4 - Read a raw VBLK Disk Group object into a vblk structure
872 * @buffer: Block of data being worked on
873 * @buflen: Size of the block of data
874 * @vb: In-memory vblk in which to return information
875 *
876 * Read a raw VBLK Disk Group object (version 4) into a vblk structure.
877 *
878 * Return: 'true' @vb contains a Disk Group VBLK
879 * 'false' @vb contents are not defined
880 */
881static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb)
882{
883 char buf[64];
884 int r_objid, r_name, r_id1, r_id2, len;
885 struct vblk_dgrp *dgrp;
886
887 BUG_ON (!buffer || !vb);
888
889 r_objid = ldm_relative (buffer, buflen, 0x18, 0);
890 r_name = ldm_relative (buffer, buflen, 0x18, r_objid);
891
892 if (buffer[0x12] & VBLK_FLAG_DGR4_IDS) {
893 r_id1 = ldm_relative (buffer, buflen, 0x44, r_name);
894 r_id2 = ldm_relative (buffer, buflen, 0x44, r_id1);
895 len = r_id2;
896 } else {
897 r_id1 = 0;
898 r_id2 = 0;
899 len = r_name;
900 }
901 if (len < 0)
902 return false;
903
904 len += VBLK_SIZE_DGR4;
905 if (len != get_unaligned_be32(buffer + 0x14))
906 return false;
907
908 dgrp = &vb->vblk.dgrp;
909
910 ldm_get_vstr (buffer + 0x18 + r_objid, buf, sizeof (buf));
911 return true;
912}
913
914/**
915 * ldm_parse_dsk3 - Read a raw VBLK Disk object into a vblk structure
916 * @buffer: Block of data being worked on
917 * @buflen: Size of the block of data
918 * @vb: In-memory vblk in which to return information
919 *
920 * Read a raw VBLK Disk object (version 3) into a vblk structure.
921 *
922 * Return: 'true' @vb contains a Disk VBLK
923 * 'false' @vb contents are not defined
924 */
925static bool ldm_parse_dsk3 (const u8 *buffer, int buflen, struct vblk *vb)
926{
927 int r_objid, r_name, r_diskid, r_altname, len;
928 struct vblk_disk *disk;
929
930 BUG_ON (!buffer || !vb);
931
932 r_objid = ldm_relative (buffer, buflen, 0x18, 0);
933 r_name = ldm_relative (buffer, buflen, 0x18, r_objid);
934 r_diskid = ldm_relative (buffer, buflen, 0x18, r_name);
935 r_altname = ldm_relative (buffer, buflen, 0x18, r_diskid);
936 len = r_altname;
937 if (len < 0)
938 return false;
939
940 len += VBLK_SIZE_DSK3;
941 if (len != get_unaligned_be32(buffer + 0x14))
942 return false;
943
944 disk = &vb->vblk.disk;
945 ldm_get_vstr (buffer + 0x18 + r_diskid, disk->alt_name,
946 sizeof (disk->alt_name));
947 if (!ldm_parse_guid (buffer + 0x19 + r_name, disk->disk_id))
948 return false;
949
950 return true;
951}
952
953/**
954 * ldm_parse_dsk4 - Read a raw VBLK Disk object into a vblk structure
955 * @buffer: Block of data being worked on
956 * @buflen: Size of the block of data
957 * @vb: In-memory vblk in which to return information
958 *
959 * Read a raw VBLK Disk object (version 4) into a vblk structure.
960 *
961 * Return: 'true' @vb contains a Disk VBLK
962 * 'false' @vb contents are not defined
963 */
964static bool ldm_parse_dsk4 (const u8 *buffer, int buflen, struct vblk *vb)
965{
966 int r_objid, r_name, len;
967 struct vblk_disk *disk;
968
969 BUG_ON (!buffer || !vb);
970
971 r_objid = ldm_relative (buffer, buflen, 0x18, 0);
972 r_name = ldm_relative (buffer, buflen, 0x18, r_objid);
973 len = r_name;
974 if (len < 0)
975 return false;
976
977 len += VBLK_SIZE_DSK4;
978 if (len != get_unaligned_be32(buffer + 0x14))
979 return false;
980
981 disk = &vb->vblk.disk;
982 memcpy (disk->disk_id, buffer + 0x18 + r_name, GUID_SIZE);
983 return true;
984}
985
986/**
987 * ldm_parse_prt3 - Read a raw VBLK Partition object into a vblk structure
988 * @buffer: Block of data being worked on
989 * @buflen: Size of the block of data
990 * @vb: In-memory vblk in which to return information
991 *
992 * Read a raw VBLK Partition object (version 3) into a vblk structure.
993 *
994 * Return: 'true' @vb contains a Partition VBLK
995 * 'false' @vb contents are not defined
996 */
997static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb)
998{
999 int r_objid, r_name, r_size, r_parent, r_diskid, r_index, len;
1000 struct vblk_part *part;
1001
1002 BUG_ON(!buffer || !vb);
1003 r_objid = ldm_relative(buffer, buflen, 0x18, 0);
1004 if (r_objid < 0) {
1005 ldm_error("r_objid %d < 0", r_objid);
1006 return false;
1007 }
1008 r_name = ldm_relative(buffer, buflen, 0x18, r_objid);
1009 if (r_name < 0) {
1010 ldm_error("r_name %d < 0", r_name);
1011 return false;
1012 }
1013 r_size = ldm_relative(buffer, buflen, 0x34, r_name);
1014 if (r_size < 0) {
1015 ldm_error("r_size %d < 0", r_size);
1016 return false;
1017 }
1018 r_parent = ldm_relative(buffer, buflen, 0x34, r_size);
1019 if (r_parent < 0) {
1020 ldm_error("r_parent %d < 0", r_parent);
1021 return false;
1022 }
1023 r_diskid = ldm_relative(buffer, buflen, 0x34, r_parent);
1024 if (r_diskid < 0) {
1025 ldm_error("r_diskid %d < 0", r_diskid);
1026 return false;
1027 }
1028 if (buffer[0x12] & VBLK_FLAG_PART_INDEX) {
1029 r_index = ldm_relative(buffer, buflen, 0x34, r_diskid);
1030 if (r_index < 0) {
1031 ldm_error("r_index %d < 0", r_index);
1032 return false;
1033 }
1034 len = r_index;
1035 } else {
1036 r_index = 0;
1037 len = r_diskid;
1038 }
1039 if (len < 0) {
1040 ldm_error("len %d < 0", len);
1041 return false;
1042 }
1043 len += VBLK_SIZE_PRT3;
1044 if (len > get_unaligned_be32(buffer + 0x14)) {
1045 ldm_error("len %d > BE32(buffer + 0x14) %d", len,
1046 get_unaligned_be32(buffer + 0x14));
1047 return false;
1048 }
1049 part = &vb->vblk.part;
1050 part->start = get_unaligned_be64(buffer + 0x24 + r_name);
1051 part->volume_offset = get_unaligned_be64(buffer + 0x2C + r_name);
1052 part->size = ldm_get_vnum(buffer + 0x34 + r_name);
1053 part->parent_id = ldm_get_vnum(buffer + 0x34 + r_size);
1054 part->disk_id = ldm_get_vnum(buffer + 0x34 + r_parent);
1055 if (vb->flags & VBLK_FLAG_PART_INDEX)
1056 part->partnum = buffer[0x35 + r_diskid];
1057 else
1058 part->partnum = 0;
1059 return true;
1060}
1061
1062/**
1063 * ldm_parse_vol5 - Read a raw VBLK Volume object into a vblk structure
1064 * @buffer: Block of data being worked on
1065 * @buflen: Size of the block of data
1066 * @vb: In-memory vblk in which to return information
1067 *
1068 * Read a raw VBLK Volume object (version 5) into a vblk structure.
1069 *
1070 * Return: 'true' @vb contains a Volume VBLK
1071 * 'false' @vb contents are not defined
1072 */
1073static bool ldm_parse_vol5(const u8 *buffer, int buflen, struct vblk *vb)
1074{
1075 int r_objid, r_name, r_vtype, r_disable_drive_letter, r_child, r_size;
1076 int r_id1, r_id2, r_size2, r_drive, len;
1077 struct vblk_volu *volu;
1078
1079 BUG_ON(!buffer || !vb);
1080 r_objid = ldm_relative(buffer, buflen, 0x18, 0);
1081 if (r_objid < 0) {
1082 ldm_error("r_objid %d < 0", r_objid);
1083 return false;
1084 }
1085 r_name = ldm_relative(buffer, buflen, 0x18, r_objid);
1086 if (r_name < 0) {
1087 ldm_error("r_name %d < 0", r_name);
1088 return false;
1089 }
1090 r_vtype = ldm_relative(buffer, buflen, 0x18, r_name);
1091 if (r_vtype < 0) {
1092 ldm_error("r_vtype %d < 0", r_vtype);
1093 return false;
1094 }
1095 r_disable_drive_letter = ldm_relative(buffer, buflen, 0x18, r_vtype);
1096 if (r_disable_drive_letter < 0) {
1097 ldm_error("r_disable_drive_letter %d < 0",
1098 r_disable_drive_letter);
1099 return false;
1100 }
1101 r_child = ldm_relative(buffer, buflen, 0x2D, r_disable_drive_letter);
1102 if (r_child < 0) {
1103 ldm_error("r_child %d < 0", r_child);
1104 return false;
1105 }
1106 r_size = ldm_relative(buffer, buflen, 0x3D, r_child);
1107 if (r_size < 0) {
1108 ldm_error("r_size %d < 0", r_size);
1109 return false;
1110 }
1111 if (buffer[0x12] & VBLK_FLAG_VOLU_ID1) {
1112 r_id1 = ldm_relative(buffer, buflen, 0x52, r_size);
1113 if (r_id1 < 0) {
1114 ldm_error("r_id1 %d < 0", r_id1);
1115 return false;
1116 }
1117 } else
1118 r_id1 = r_size;
1119 if (buffer[0x12] & VBLK_FLAG_VOLU_ID2) {
1120 r_id2 = ldm_relative(buffer, buflen, 0x52, r_id1);
1121 if (r_id2 < 0) {
1122 ldm_error("r_id2 %d < 0", r_id2);
1123 return false;
1124 }
1125 } else
1126 r_id2 = r_id1;
1127 if (buffer[0x12] & VBLK_FLAG_VOLU_SIZE) {
1128 r_size2 = ldm_relative(buffer, buflen, 0x52, r_id2);
1129 if (r_size2 < 0) {
1130 ldm_error("r_size2 %d < 0", r_size2);
1131 return false;
1132 }
1133 } else
1134 r_size2 = r_id2;
1135 if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) {
1136 r_drive = ldm_relative(buffer, buflen, 0x52, r_size2);
1137 if (r_drive < 0) {
1138 ldm_error("r_drive %d < 0", r_drive);
1139 return false;
1140 }
1141 } else
1142 r_drive = r_size2;
1143 len = r_drive;
1144 if (len < 0) {
1145 ldm_error("len %d < 0", len);
1146 return false;
1147 }
1148 len += VBLK_SIZE_VOL5;
1149 if (len > get_unaligned_be32(buffer + 0x14)) {
1150 ldm_error("len %d > BE32(buffer + 0x14) %d", len,
1151 get_unaligned_be32(buffer + 0x14));
1152 return false;
1153 }
1154 volu = &vb->vblk.volu;
1155 ldm_get_vstr(buffer + 0x18 + r_name, volu->volume_type,
1156 sizeof(volu->volume_type));
1157 memcpy(volu->volume_state, buffer + 0x18 + r_disable_drive_letter,
1158 sizeof(volu->volume_state));
1159 volu->size = ldm_get_vnum(buffer + 0x3D + r_child);
1160 volu->partition_type = buffer[0x41 + r_size];
1161 memcpy(volu->guid, buffer + 0x42 + r_size, sizeof(volu->guid));
1162 if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) {
1163 ldm_get_vstr(buffer + 0x52 + r_size, volu->drive_hint,
1164 sizeof(volu->drive_hint));
1165 }
1166 return true;
1167}
1168
1169/**
1170 * ldm_parse_vblk - Read a raw VBLK object into a vblk structure
1171 * @buf: Block of data being worked on
1172 * @len: Size of the block of data
1173 * @vb: In-memory vblk in which to return information
1174 *
1175 * Read a raw VBLK object into a vblk structure. This function just reads the
1176 * information common to all VBLK types, then delegates the rest of the work to
1177 * helper functions: ldm_parse_*.
1178 *
1179 * Return: 'true' @vb contains a VBLK
1180 * 'false' @vb contents are not defined
1181 */
1182static bool ldm_parse_vblk (const u8 *buf, int len, struct vblk *vb)
1183{
1184 bool result = false;
1185 int r_objid;
1186
1187 BUG_ON (!buf || !vb);
1188
1189 r_objid = ldm_relative (buf, len, 0x18, 0);
1190 if (r_objid < 0) {
1191 ldm_error ("VBLK header is corrupt.");
1192 return false;
1193 }
1194
1195 vb->flags = buf[0x12];
1196 vb->type = buf[0x13];
1197 vb->obj_id = ldm_get_vnum (buf + 0x18);
1198 ldm_get_vstr (buf+0x18+r_objid, vb->name, sizeof (vb->name));
1199
1200 switch (vb->type) {
1201 case VBLK_CMP3: result = ldm_parse_cmp3 (buf, len, vb); break;
1202 case VBLK_DSK3: result = ldm_parse_dsk3 (buf, len, vb); break;
1203 case VBLK_DSK4: result = ldm_parse_dsk4 (buf, len, vb); break;
1204 case VBLK_DGR3: result = ldm_parse_dgr3 (buf, len, vb); break;
1205 case VBLK_DGR4: result = ldm_parse_dgr4 (buf, len, vb); break;
1206 case VBLK_PRT3: result = ldm_parse_prt3 (buf, len, vb); break;
1207 case VBLK_VOL5: result = ldm_parse_vol5 (buf, len, vb); break;
1208 }
1209
1210 if (result)
1211 ldm_debug ("Parsed VBLK 0x%llx (type: 0x%02x) ok.",
1212 (unsigned long long) vb->obj_id, vb->type);
1213 else
1214 ldm_error ("Failed to parse VBLK 0x%llx (type: 0x%02x).",
1215 (unsigned long long) vb->obj_id, vb->type);
1216
1217 return result;
1218}
1219
1220
1221/**
1222 * ldm_ldmdb_add - Adds a raw VBLK entry to the ldmdb database
1223 * @data: Raw VBLK to add to the database
1224 * @len: Size of the raw VBLK
1225 * @ldb: Cache of the database structures
1226 *
1227 * The VBLKs are sorted into categories. Partitions are also sorted by offset.
1228 *
1229 * N.B. This function does not check the validity of the VBLKs.
1230 *
1231 * Return: 'true' The VBLK was added
1232 * 'false' An error occurred
1233 */
1234static bool ldm_ldmdb_add (u8 *data, int len, struct ldmdb *ldb)
1235{
1236 struct vblk *vb;
1237 struct list_head *item;
1238
1239 BUG_ON (!data || !ldb);
1240
1241 vb = kmalloc (sizeof (*vb), GFP_KERNEL);
1242 if (!vb) {
1243 ldm_crit ("Out of memory.");
1244 return false;
1245 }
1246
1247 if (!ldm_parse_vblk (data, len, vb)) {
1248 kfree(vb);
1249 return false; /* Already logged */
1250 }
1251
1252 /* Put vblk into the correct list. */
1253 switch (vb->type) {
1254 case VBLK_DGR3:
1255 case VBLK_DGR4:
1256 list_add (&vb->list, &ldb->v_dgrp);
1257 break;
1258 case VBLK_DSK3:
1259 case VBLK_DSK4:
1260 list_add (&vb->list, &ldb->v_disk);
1261 break;
1262 case VBLK_VOL5:
1263 list_add (&vb->list, &ldb->v_volu);
1264 break;
1265 case VBLK_CMP3:
1266 list_add (&vb->list, &ldb->v_comp);
1267 break;
1268 case VBLK_PRT3:
1269 /* Sort by the partition's start sector. */
1270 list_for_each (item, &ldb->v_part) {
1271 struct vblk *v = list_entry (item, struct vblk, list);
1272 if ((v->vblk.part.disk_id == vb->vblk.part.disk_id) &&
1273 (v->vblk.part.start > vb->vblk.part.start)) {
1274 list_add_tail (&vb->list, &v->list);
1275 return true;
1276 }
1277 }
1278 list_add_tail (&vb->list, &ldb->v_part);
1279 break;
1280 }
1281 return true;
1282}
1283
1284/**
1285 * ldm_frag_add - Add a VBLK fragment to a list
1286 * @data: Raw fragment to be added to the list
1287 * @size: Size of the raw fragment
1288 * @frags: Linked list of VBLK fragments
1289 *
1290 * Fragmented VBLKs may not be consecutive in the database, so they are placed
1291 * in a list so they can be pieced together later.
1292 *
1293 * Return: 'true' Success, the VBLK was added to the list
1294 * 'false' Error, a problem occurred
1295 */
1296static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags)
1297{
1298 struct frag *f;
1299 struct list_head *item;
1300 int rec, num, group;
1301
1302 BUG_ON (!data || !frags);
1303
1304 if (size < 2 * VBLK_SIZE_HEAD) {
1305		ldm_error("Value of size is too small.");
1306 return false;
1307 }
1308
1309 group = get_unaligned_be32(data + 0x08);
1310 rec = get_unaligned_be16(data + 0x0C);
1311 num = get_unaligned_be16(data + 0x0E);
1312 if ((num < 1) || (num > 4)) {
1313 ldm_error ("A VBLK claims to have %d parts.", num);
1314 return false;
1315 }
1316 if (rec >= num) {
1317 ldm_error("REC value (%d) exceeds NUM value (%d)", rec, num);
1318 return false;
1319 }
1320
1321 list_for_each (item, frags) {
1322 f = list_entry (item, struct frag, list);
1323 if (f->group == group)
1324 goto found;
1325 }
1326
1327 f = kmalloc (sizeof (*f) + size*num, GFP_KERNEL);
1328 if (!f) {
1329 ldm_crit ("Out of memory.");
1330 return false;
1331 }
1332
1333 f->group = group;
1334 f->num = num;
1335 f->rec = rec;
1336 f->map = 0xFF << num;
1337
1338 list_add_tail (&f->list, frags);
1339found:
1340 if (rec >= f->num) {
1341 ldm_error("REC value (%d) exceeds NUM value (%d)", rec, f->num);
1342 return false;
1343 }
1344 if (f->map & (1 << rec)) {
1345 ldm_error ("Duplicate VBLK, part %d.", rec);
1346 f->map &= 0x7F; /* Mark the group as broken */
1347 return false;
1348 }
1349 f->map |= (1 << rec);
1350 if (!rec)
1351 memcpy(f->data, data, VBLK_SIZE_HEAD);
1352 data += VBLK_SIZE_HEAD;
1353 size -= VBLK_SIZE_HEAD;
1354 memcpy(f->data + VBLK_SIZE_HEAD + rec * size, data, size);
1355 return true;
1356}
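The f->map bookkeeping above starts with the low num bits clear and sets one bit per fragment received, so a group is complete exactly when map reaches 0xFF (and clearing bit 7 on a duplicate keeps a broken group from ever completing). A small illustration for an assumed four-fragment group:

#include <stdio.h>

int main(void)
{
    int num = 4;
    unsigned char map = 0xFF << num;   /* 0xF0: one clear bit per expected fragment */
    int rec;

    for (rec = 0; rec < num; rec++)
        map |= 1 << rec;               /* mark fragment 'rec' as received */

    printf("map=0x%02X complete=%s\n", map, map == 0xFF ? "yes" : "no");
    return 0;
}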
1357
1358/**
1359 * ldm_frag_free - Free a linked list of VBLK fragments
1360 * @list: Linked list of fragments
1361 *
1362 * Free a linked list of VBLK fragments
1363 *
1364 * Return: none
1365 */
1366static void ldm_frag_free (struct list_head *list)
1367{
1368 struct list_head *item, *tmp;
1369
1370 BUG_ON (!list);
1371
1372 list_for_each_safe (item, tmp, list)
1373 kfree (list_entry (item, struct frag, list));
1374}
1375
1376/**
1377 * ldm_frag_commit - Validate fragmented VBLKs and add them to the database
1378 * @frags: Linked list of VBLK fragments
1379 * @ldb: Cache of the database structures
1380 *
1381 * Now that all the fragmented VBLKs have been collected, they must be added to
1382 * the database for later use.
1383 *
1384 * Return:  'true'   All the fragments were added successfully
1385 *          'false'  One or more of the fragments were invalid
1386 */
1387static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
1388{
1389 struct frag *f;
1390 struct list_head *item;
1391
1392 BUG_ON (!frags || !ldb);
1393
1394 list_for_each (item, frags) {
1395 f = list_entry (item, struct frag, list);
1396
1397 if (f->map != 0xFF) {
1398 ldm_error ("VBLK group %d is incomplete (0x%02x).",
1399 f->group, f->map);
1400 return false;
1401 }
1402
1403 if (!ldm_ldmdb_add (f->data, f->num*ldb->vm.vblk_size, ldb))
1404 return false; /* Already logged */
1405 }
1406 return true;
1407}
1408
1409/**
1410 * ldm_get_vblks - Read the on-disk database of VBLKs into memory
1411 * @state: Partition check state including device holding the LDM Database
1412 * @base: Offset, into @state->bdev, of the database
1413 * @ldb: Cache of the database structures
1414 *
1415 * To use the information from the VBLKs, they need to be read from the disk,
1416 * unpacked and validated. We cache them in @ldb according to their type.
1417 *
1418 * Return: 'true' All the VBLKs were read successfully
1419 * 'false' An error occurred
1420 */
1421static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base,
1422 struct ldmdb *ldb)
1423{
1424 int size, perbuf, skip, finish, s, v, recs;
1425 u8 *data = NULL;
1426 Sector sect;
1427 bool result = false;
1428 LIST_HEAD (frags);
1429
1430 BUG_ON(!state || !ldb);
1431
1432 size = ldb->vm.vblk_size;
1433 perbuf = 512 / size;
1434 skip = ldb->vm.vblk_offset >> 9; /* Bytes to sectors */
1435 finish = (size * ldb->vm.last_vblk_seq) >> 9;
1436
1437 for (s = skip; s < finish; s++) { /* For each sector */
1438 data = read_part_sector(state, base + OFF_VMDB + s, &sect);
1439 if (!data) {
1440 ldm_crit ("Disk read failed.");
1441 goto out;
1442 }
1443
1444 for (v = 0; v < perbuf; v++, data+=size) { /* For each vblk */
1445 if (MAGIC_VBLK != get_unaligned_be32(data)) {
1446 ldm_error ("Expected to find a VBLK.");
1447 goto out;
1448 }
1449
1450 recs = get_unaligned_be16(data + 0x0E); /* Number of records */
1451 if (recs == 1) {
1452 if (!ldm_ldmdb_add (data, size, ldb))
1453 goto out; /* Already logged */
1454 } else if (recs > 1) {
1455 if (!ldm_frag_add (data, size, &frags))
1456 goto out; /* Already logged */
1457 }
1458 /* else Record is not in use, ignore it. */
1459 }
1460 put_dev_sector (sect);
1461 data = NULL;
1462 }
1463
1464 result = ldm_frag_commit (&frags, ldb); /* Failures, already logged */
1465out:
1466 if (data)
1467 put_dev_sector (sect);
1468 ldm_frag_free (&frags);
1469
1470 return result;
1471}
1472
1473/**
1474 * ldm_free_vblks - Free a linked list of vblk's
1475 * @lh: Head of a linked list of struct vblk
1476 *
1477 * Free a list of vblk's and free the memory used to maintain the list.
1478 *
1479 * Return: none
1480 */
1481static void ldm_free_vblks (struct list_head *lh)
1482{
1483 struct list_head *item, *tmp;
1484
1485 BUG_ON (!lh);
1486
1487 list_for_each_safe (item, tmp, lh)
1488 kfree (list_entry (item, struct vblk, list));
1489}
1490
1491
1492/**
1493 * ldm_partition - Find out whether a device is a dynamic disk and handle it
1494 * @state: Partition check state including device holding the LDM Database
1495 *
1496 * This determines whether the device @bdev is a dynamic disk and if so creates
1497 * the partitions necessary in the gendisk structure pointed to by @hd.
1498 *
1499 * We create a dummy device 1, which contains the LDM database, and then create
1500 * each partition described by the LDM database in sequence as devices 2+. For
1501 * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3,
1502 * and so on: the actual data containing partitions.
1503 *
1504 * Return: 1 Success, @state->bdev is a dynamic disk and we handled it
1505 * 0 Success, @state->bdev is not a dynamic disk
1506 * -1 An error occurred before enough information had been read
1507 * Or @state->bdev is a dynamic disk, but it may be corrupted
1508 */
1509int ldm_partition(struct parsed_partitions *state)
1510{
1511 struct ldmdb *ldb;
1512 unsigned long base;
1513 int result = -1;
1514
1515 BUG_ON(!state);
1516
1517 /* Look for signs of a Dynamic Disk */
1518 if (!ldm_validate_partition_table(state))
1519 return 0;
1520
1521 ldb = kmalloc (sizeof (*ldb), GFP_KERNEL);
1522 if (!ldb) {
1523 ldm_crit ("Out of memory.");
1524 goto out;
1525 }
1526
1527 /* Parse and check privheads. */
1528 if (!ldm_validate_privheads(state, &ldb->ph))
1529 goto out; /* Already logged */
1530
1531 /* All further references are relative to base (database start). */
1532 base = ldb->ph.config_start;
1533
1534 /* Parse and check tocs and vmdb. */
1535 if (!ldm_validate_tocblocks(state, base, ldb) ||
1536 !ldm_validate_vmdb(state, base, ldb))
1537 goto out; /* Already logged */
1538
1539 /* Initialize vblk lists in ldmdb struct */
1540 INIT_LIST_HEAD (&ldb->v_dgrp);
1541 INIT_LIST_HEAD (&ldb->v_disk);
1542 INIT_LIST_HEAD (&ldb->v_volu);
1543 INIT_LIST_HEAD (&ldb->v_comp);
1544 INIT_LIST_HEAD (&ldb->v_part);
1545
1546 if (!ldm_get_vblks(state, base, ldb)) {
1547 ldm_crit ("Failed to read the VBLKs from the database.");
1548 goto cleanup;
1549 }
1550
1551 /* Finally, create the data partition devices. */
1552 if (ldm_create_data_partitions(state, ldb)) {
1553 ldm_debug ("Parsed LDM database successfully.");
1554 result = 1;
1555 }
1556 /* else Already logged */
1557
1558cleanup:
1559 ldm_free_vblks (&ldb->v_dgrp);
1560 ldm_free_vblks (&ldb->v_disk);
1561 ldm_free_vblks (&ldb->v_volu);
1562 ldm_free_vblks (&ldb->v_comp);
1563 ldm_free_vblks (&ldb->v_part);
1564out:
1565 kfree (ldb);
1566 return result;
1567}
diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h
deleted file mode 100644
index 374242c0971..00000000000
--- a/block/partitions/ldm.h
+++ /dev/null
@@ -1,215 +0,0 @@
1/**
2 * ldm - Part of the Linux-NTFS project.
3 *
4 * Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org>
5 * Copyright (c) 2001-2007 Anton Altaparmakov
6 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
7 *
8 * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads
9 *
10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU General Public License as published by the Free
12 * Software Foundation; either version 2 of the License, or (at your option)
13 * any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program (in the main directory of the Linux-NTFS source
22 * in the file COPYING); if not, write to the Free Software Foundation,
23 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */
25
26#ifndef _FS_PT_LDM_H_
27#define _FS_PT_LDM_H_
28
29#include <linux/types.h>
30#include <linux/list.h>
31#include <linux/genhd.h>
32#include <linux/fs.h>
33#include <asm/unaligned.h>
34#include <asm/byteorder.h>
35
36struct parsed_partitions;
37
38/* Magic numbers in CPU format. */
39#define MAGIC_VMDB 0x564D4442 /* VMDB */
40#define MAGIC_VBLK 0x56424C4B /* VBLK */
41#define MAGIC_PRIVHEAD 0x5052495648454144ULL /* PRIVHEAD */
42#define MAGIC_TOCBLOCK 0x544F43424C4F434BULL /* TOCBLOCK */
43
44/* The defined vblk types. */
45#define VBLK_VOL5 0x51 /* Volume, version 5 */
46#define VBLK_CMP3 0x32 /* Component, version 3 */
47#define VBLK_PRT3 0x33 /* Partition, version 3 */
48#define VBLK_DSK3 0x34 /* Disk, version 3 */
49#define VBLK_DSK4 0x44 /* Disk, version 4 */
50#define VBLK_DGR3 0x35 /* Disk Group, version 3 */
51#define VBLK_DGR4 0x45 /* Disk Group, version 4 */
52
53/* vblk flags indicating extra information will be present */
54#define VBLK_FLAG_COMP_STRIPE 0x10
55#define VBLK_FLAG_PART_INDEX 0x08
56#define VBLK_FLAG_DGR3_IDS 0x08
57#define VBLK_FLAG_DGR4_IDS 0x08
58#define VBLK_FLAG_VOLU_ID1 0x08
59#define VBLK_FLAG_VOLU_ID2 0x20
60#define VBLK_FLAG_VOLU_SIZE 0x80
61#define VBLK_FLAG_VOLU_DRIVE 0x02
62
63/* size of a vblk's static parts */
64#define VBLK_SIZE_HEAD 16
65#define VBLK_SIZE_CMP3 22 /* Name and version */
66#define VBLK_SIZE_DGR3 12
67#define VBLK_SIZE_DGR4 44
68#define VBLK_SIZE_DSK3 12
69#define VBLK_SIZE_DSK4 45
70#define VBLK_SIZE_PRT3 28
71#define VBLK_SIZE_VOL5 58
72
73/* component types */
74#define COMP_STRIPE 0x01 /* Stripe-set */
75#define COMP_BASIC 0x02 /* Basic disk */
76#define COMP_RAID 0x03 /* Raid-set */
77
78/* Other constants. */
79#define LDM_DB_SIZE 2048 /* Size in sectors (= 1MiB). */
80
81#define OFF_PRIV1 6 /* Offset of the first privhead
82 relative to the start of the
83 device in sectors */
84
85/* Offsets to structures within the LDM Database in sectors. */
86#define OFF_PRIV2 1856 /* Backup private headers. */
87#define OFF_PRIV3 2047
88
89#define OFF_TOCB1 1 /* Tables of contents. */
90#define OFF_TOCB2 2
91#define OFF_TOCB3 2045
92#define OFF_TOCB4 2046
93
94#define OFF_VMDB 17 /* List of partitions. */
95
96#define LDM_PARTITION 0x42 /* Formerly SFS (Landis). */
97
98#define TOC_BITMAP1 "config" /* Names of the two defined */
99#define TOC_BITMAP2 "log" /* bitmaps in the TOCBLOCK. */
100
101/* Borrowed from msdos.c */
102#define SYS_IND(p) (get_unaligned(&(p)->sys_ind))
103
104struct frag { /* VBLK Fragment handling */
105 struct list_head list;
106 u32 group;
107 u8 num; /* Total number of records */
108 u8 rec; /* This is record number n */
109 u8 map; /* Which portions are in use */
110 u8 data[0];
111};
112
113/* In memory LDM database structures. */
114
115#define GUID_SIZE 16
116
117struct privhead { /* Offsets and sizes are in sectors. */
118 u16 ver_major;
119 u16 ver_minor;
120 u64 logical_disk_start;
121 u64 logical_disk_size;
122 u64 config_start;
123 u64 config_size;
124 u8 disk_id[GUID_SIZE];
125};
126
127struct tocblock { /* We have exactly two bitmaps. */
128 u8 bitmap1_name[16];
129 u64 bitmap1_start;
130 u64 bitmap1_size;
131 u8 bitmap2_name[16];
132 u64 bitmap2_start;
133 u64 bitmap2_size;
134};
135
136struct vmdb { /* VMDB: The database header */
137 u16 ver_major;
138 u16 ver_minor;
139 u32 vblk_size;
140 u32 vblk_offset;
141 u32 last_vblk_seq;
142};
143
144struct vblk_comp { /* VBLK Component */
145 u8 state[16];
146 u64 parent_id;
147 u8 type;
148 u8 children;
149 u16 chunksize;
150};
151
152struct vblk_dgrp { /* VBLK Disk Group */
153 u8 disk_id[64];
154};
155
156struct vblk_disk { /* VBLK Disk */
157 u8 disk_id[GUID_SIZE];
158 u8 alt_name[128];
159};
160
161struct vblk_part { /* VBLK Partition */
162 u64 start;
163 u64 size; /* start, size and vol_off in sectors */
164 u64 volume_offset;
165 u64 parent_id;
166 u64 disk_id;
167 u8 partnum;
168};
169
170struct vblk_volu { /* VBLK Volume */
171 u8 volume_type[16];
172 u8 volume_state[16];
173 u8 guid[16];
174 u8 drive_hint[4];
175 u64 size;
176 u8 partition_type;
177};
178
179struct vblk_head { /* VBLK standard header */
180 u32 group;
181 u16 rec;
182 u16 nrec;
183};
184
185struct vblk { /* Generalised VBLK */
186 u8 name[64];
187 u64 obj_id;
188 u32 sequence;
189 u8 flags;
190 u8 type;
191 union {
192 struct vblk_comp comp;
193 struct vblk_dgrp dgrp;
194 struct vblk_disk disk;
195 struct vblk_part part;
196 struct vblk_volu volu;
197 } vblk;
198 struct list_head list;
199};
200
201struct ldmdb { /* Cache of the database */
202 struct privhead ph;
203 struct tocblock toc;
204 struct vmdb vm;
205 struct list_head v_dgrp;
206 struct list_head v_disk;
207 struct list_head v_volu;
208 struct list_head v_comp;
209 struct list_head v_part;
210};
211
212int ldm_partition(struct parsed_partitions *state);
213
214#endif /* _FS_PT_LDM_H_ */
215
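Editor's note: the magic numbers defined above are simply the structure names stored as ASCII and compared in big-endian byte order, which is why ldm_get_vblks() checks get_unaligned_be32() of the raw sector data against MAGIC_VBLK. A small stand-alone check, illustrative only:

#include <stdint.h>
#include <stdio.h>

#define MAGIC_VMDB 0x564D4442	/* "VMDB" */
#define MAGIC_VBLK 0x56424C4B	/* "VBLK" */

/* Read a 32-bit big-endian value, as the on-disk LDM structures store it. */
static uint32_t be32(const uint8_t *p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
}

int main(void)
{
	const uint8_t on_disk[4] = { 'V', 'B', 'L', 'K' };	/* start of a VBLK */

	printf("MAGIC_VBLK matches: %s\n",
	       be32(on_disk) == MAGIC_VBLK ? "yes" : "no");
	printf("MAGIC_VMDB as chars: %c%c%c%c\n",
	       (MAGIC_VMDB >> 24) & 0xFF, (MAGIC_VMDB >> 16) & 0xFF,
	       (MAGIC_VMDB >> 8) & 0xFF, MAGIC_VMDB & 0xFF);
	return 0;
}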
diff --git a/block/partitions/mac.c b/block/partitions/mac.c
deleted file mode 100644
index 11f688bd76c..00000000000
--- a/block/partitions/mac.c
+++ /dev/null
@@ -1,134 +0,0 @@
1/*
2 * fs/partitions/mac.c
3 *
4 * Code extracted from drivers/block/genhd.c
5 * Copyright (C) 1991-1998 Linus Torvalds
6 * Re-organised Feb 1998 Russell King
7 */
8
9#include <linux/ctype.h>
10#include "check.h"
11#include "mac.h"
12
13#ifdef CONFIG_PPC_PMAC
14#include <asm/machdep.h>
15extern void note_bootable_part(dev_t dev, int part, int goodness);
16#endif
17
18/*
19 * Code to understand MacOS partition tables.
20 */
21
22static inline void mac_fix_string(char *stg, int len)
23{
24 int i;
25
26 for (i = len - 1; i >= 0 && stg[i] == ' '; i--)
27 stg[i] = 0;
28}
29
30int mac_partition(struct parsed_partitions *state)
31{
32 Sector sect;
33 unsigned char *data;
34 int slot, blocks_in_map;
35 unsigned secsize;
36#ifdef CONFIG_PPC_PMAC
37 int found_root = 0;
38 int found_root_goodness = 0;
39#endif
40 struct mac_partition *part;
41 struct mac_driver_desc *md;
42
43 /* Get 0th block and look at the first partition map entry. */
44 md = read_part_sector(state, 0, &sect);
45 if (!md)
46 return -1;
47 if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) {
48 put_dev_sector(sect);
49 return 0;
50 }
51 secsize = be16_to_cpu(md->block_size);
52 put_dev_sector(sect);
53 data = read_part_sector(state, secsize/512, &sect);
54 if (!data)
55 return -1;
56 part = (struct mac_partition *) (data + secsize%512);
57 if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) {
58 put_dev_sector(sect);
59 return 0; /* not a MacOS disk */
60 }
61 blocks_in_map = be32_to_cpu(part->map_count);
62 if (blocks_in_map < 0 || blocks_in_map >= DISK_MAX_PARTS) {
63 put_dev_sector(sect);
64 return 0;
65 }
66 strlcat(state->pp_buf, " [mac]", PAGE_SIZE);
67 for (slot = 1; slot <= blocks_in_map; ++slot) {
68 int pos = slot * secsize;
69 put_dev_sector(sect);
70 data = read_part_sector(state, pos/512, &sect);
71 if (!data)
72 return -1;
73 part = (struct mac_partition *) (data + pos%512);
74 if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC)
75 break;
76 put_partition(state, slot,
77 be32_to_cpu(part->start_block) * (secsize/512),
78 be32_to_cpu(part->block_count) * (secsize/512));
79
80 if (!strnicmp(part->type, "Linux_RAID", 10))
81 state->parts[slot].flags = ADDPART_FLAG_RAID;
82#ifdef CONFIG_PPC_PMAC
83 /*
84 * If this is the first bootable partition, tell the
85 * setup code, in case it wants to make this the root.
86 */
87 if (machine_is(powermac)) {
88 int goodness = 0;
89
90 mac_fix_string(part->processor, 16);
91 mac_fix_string(part->name, 32);
92 mac_fix_string(part->type, 32);
93
94 if ((be32_to_cpu(part->status) & MAC_STATUS_BOOTABLE)
95 && strcasecmp(part->processor, "powerpc") == 0)
96 goodness++;
97
98 if (strcasecmp(part->type, "Apple_UNIX_SVR2") == 0
99 || (strnicmp(part->type, "Linux", 5) == 0
100 && strcasecmp(part->type, "Linux_swap") != 0)) {
101 int i, l;
102
103 goodness++;
104 l = strlen(part->name);
105 if (strcmp(part->name, "/") == 0)
106 goodness++;
107 for (i = 0; i <= l - 4; ++i) {
108 if (strnicmp(part->name + i, "root",
109 4) == 0) {
110 goodness += 2;
111 break;
112 }
113 }
114 if (strnicmp(part->name, "swap", 4) == 0)
115 goodness--;
116 }
117
118 if (goodness > found_root_goodness) {
119 found_root = slot;
120 found_root_goodness = goodness;
121 }
122 }
123#endif /* CONFIG_PPC_PMAC */
124 }
125#ifdef CONFIG_PPC_PMAC
126 if (found_root_goodness)
127 note_bootable_part(state->bdev->bd_dev, found_root,
128 found_root_goodness);
129#endif
130
131 put_dev_sector(sect);
132 strlcat(state->pp_buf, "\n", PAGE_SIZE);
133 return 1;
134}
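Editor's note: a stand-alone sketch of the offset arithmetic mac_partition() above uses to locate partition-map entry number 'slot' -- byte offset slot * secsize, split into a 512-byte sector number and an offset within that sector. The block sizes below are made-up examples.

#include <stdio.h>

/*
 * Illustrative only: mapping a Mac partition-map slot and the
 * driver-descriptor block size to a 512-byte sector plus byte offset,
 * as done by mac_partition() above.
 */
static void map_entry_location(unsigned int secsize, unsigned int slot)
{
	unsigned int pos = slot * secsize;	/* byte offset of the map entry */

	printf("slot %u: 512-byte sector %u, offset %u\n",
	       slot, pos / 512, pos % 512);
}

int main(void)
{
	/* Hypothetical 2048-byte blocks, e.g. a CD-style device. */
	map_entry_location(2048, 1);
	map_entry_location(2048, 2);
	/* Classic 512-byte blocks: the sector number equals the slot. */
	map_entry_location(512, 3);
	return 0;
}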
diff --git a/block/partitions/mac.h b/block/partitions/mac.h
deleted file mode 100644
index 3c7d9843638..00000000000
--- a/block/partitions/mac.h
+++ /dev/null
@@ -1,44 +0,0 @@
1/*
2 * fs/partitions/mac.h
3 */
4
5#define MAC_PARTITION_MAGIC 0x504d
6
7/* type field value for A/UX or other Unix partitions */
8#define APPLE_AUX_TYPE "Apple_UNIX_SVR2"
9
10struct mac_partition {
11 __be16 signature; /* expected to be MAC_PARTITION_MAGIC */
12 __be16 res1;
13 __be32 map_count; /* # blocks in partition map */
14 __be32 start_block; /* absolute starting block # of partition */
15 __be32 block_count; /* number of blocks in partition */
16 char name[32]; /* partition name */
17 char type[32]; /* string type description */
18 __be32 data_start; /* rel block # of first data block */
19 __be32 data_count; /* number of data blocks */
20 __be32 status; /* partition status bits */
21 __be32 boot_start;
22 __be32 boot_size;
23 __be32 boot_load;
24 __be32 boot_load2;
25 __be32 boot_entry;
26 __be32 boot_entry2;
27 __be32 boot_cksum;
28 char processor[16]; /* identifies ISA of boot */
29 /* there is more stuff after this that we don't need */
30};
31
32#define MAC_STATUS_BOOTABLE 8 /* partition is bootable */
33
34#define MAC_DRIVER_MAGIC 0x4552
35
36/* Driver descriptor structure, in block 0 */
37struct mac_driver_desc {
38 __be16 signature; /* expected to be MAC_DRIVER_MAGIC */
39 __be16 block_size;
40 __be32 block_count;
41 /* ... more stuff */
42};
43
44int mac_partition(struct parsed_partitions *state);
diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c
deleted file mode 100644
index 8752a5d2656..00000000000
--- a/block/partitions/msdos.c
+++ /dev/null
@@ -1,569 +0,0 @@
1/*
2 * fs/partitions/msdos.c
3 *
4 * Code extracted from drivers/block/genhd.c
5 * Copyright (C) 1991-1998 Linus Torvalds
6 *
7 * Thanks to Branko Lankester, lankeste@fwi.uva.nl, who found a bug
8 * in the early extended-partition checks and added DM partitions
9 *
10 * Support for DiskManager v6.0x added by Mark Lord,
11 * with information provided by OnTrack. This now works for linux fdisk
12 * and LILO, as well as loadlin and bootln. Note that disks other than
13 * /dev/hda *must* have a "DOS" type 0x51 partition in the first slot (hda1).
14 *
15 * More flexible handling of extended partitions - aeb, 950831
16 *
17 * Check partition table on IDE disks for common CHS translations
18 *
19 * Re-organised Feb 1998 Russell King
20 */
21#include <linux/msdos_fs.h>
22
23#include "check.h"
24#include "msdos.h"
25#include "efi.h"
26
27/*
28 * Many architectures don't like unaligned accesses, while
29 * the nr_sects and start_sect partition table entries are
30 * at a 2 (mod 4) address.
31 */
32#include <asm/unaligned.h>
33
34#define SYS_IND(p) get_unaligned(&p->sys_ind)
35
36static inline sector_t nr_sects(struct partition *p)
37{
38 return (sector_t)get_unaligned_le32(&p->nr_sects);
39}
40
41static inline sector_t start_sect(struct partition *p)
42{
43 return (sector_t)get_unaligned_le32(&p->start_sect);
44}
45
46static inline int is_extended_partition(struct partition *p)
47{
48 return (SYS_IND(p) == DOS_EXTENDED_PARTITION ||
49 SYS_IND(p) == WIN98_EXTENDED_PARTITION ||
50 SYS_IND(p) == LINUX_EXTENDED_PARTITION);
51}
52
53#define MSDOS_LABEL_MAGIC1 0x55
54#define MSDOS_LABEL_MAGIC2 0xAA
55
56static inline int
57msdos_magic_present(unsigned char *p)
58{
59 return (p[0] == MSDOS_LABEL_MAGIC1 && p[1] == MSDOS_LABEL_MAGIC2);
60}
61
62/* Value is EBCDIC 'IBMA' */
63#define AIX_LABEL_MAGIC1 0xC9
64#define AIX_LABEL_MAGIC2 0xC2
65#define AIX_LABEL_MAGIC3 0xD4
66#define AIX_LABEL_MAGIC4 0xC1
67static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
68{
69 struct partition *pt = (struct partition *) (p + 0x1be);
70 Sector sect;
71 unsigned char *d;
72 int slot, ret = 0;
73
74 if (!(p[0] == AIX_LABEL_MAGIC1 &&
75 p[1] == AIX_LABEL_MAGIC2 &&
76 p[2] == AIX_LABEL_MAGIC3 &&
77 p[3] == AIX_LABEL_MAGIC4))
78 return 0;
79	/* Assume the partition table is valid if Linux partitions exist */

80 for (slot = 1; slot <= 4; slot++, pt++) {
81 if (pt->sys_ind == LINUX_SWAP_PARTITION ||
82 pt->sys_ind == LINUX_RAID_PARTITION ||
83 pt->sys_ind == LINUX_DATA_PARTITION ||
84 pt->sys_ind == LINUX_LVM_PARTITION ||
85 is_extended_partition(pt))
86 return 0;
87 }
88 d = read_part_sector(state, 7, &sect);
89 if (d) {
90 if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M')
91 ret = 1;
92 put_dev_sector(sect);
93 };
94 return ret;
95}
96
97static void set_info(struct parsed_partitions *state, int slot,
98 u32 disksig)
99{
100 struct partition_meta_info *info = &state->parts[slot].info;
101
102 snprintf(info->uuid, sizeof(info->uuid), "%08x-%02x", disksig,
103 slot);
104 info->volname[0] = 0;
105 state->parts[slot].has_info = true;
106}
107
108/*
109 * Create devices for each logical partition in an extended partition.
110 * The logical partitions form a linked list, with each entry being
111 * a partition table with two entries. The first entry
112 * is the real data partition (with a start relative to the partition
113 * table start). The second is a pointer to the next logical partition
114 * (with a start relative to the entire extended partition).
115 * We do not create a Linux partition for the partition tables, but
116 * only for the actual data partitions.
117 */
118
119static void parse_extended(struct parsed_partitions *state,
120 sector_t first_sector, sector_t first_size,
121 u32 disksig)
122{
123 struct partition *p;
124 Sector sect;
125 unsigned char *data;
126 sector_t this_sector, this_size;
127 sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
128 int loopct = 0; /* number of links followed
129 without finding a data partition */
130 int i;
131
132 this_sector = first_sector;
133 this_size = first_size;
134
135 while (1) {
136 if (++loopct > 100)
137 return;
138 if (state->next == state->limit)
139 return;
140 data = read_part_sector(state, this_sector, &sect);
141 if (!data)
142 return;
143
144 if (!msdos_magic_present(data + 510))
145 goto done;
146
147 p = (struct partition *) (data + 0x1be);
148
149 /*
150 * Usually, the first entry is the real data partition,
151 * the 2nd entry is the next extended partition, or empty,
152 * and the 3rd and 4th entries are unused.
153 * However, DRDOS sometimes has the extended partition as
154 * the first entry (when the data partition is empty),
155 * and OS/2 seems to use all four entries.
156 */
157
158 /*
159 * First process the data partition(s)
160 */
161 for (i=0; i<4; i++, p++) {
162 sector_t offs, size, next;
163 if (!nr_sects(p) || is_extended_partition(p))
164 continue;
165
166 /* Check the 3rd and 4th entries -
167 these sometimes contain random garbage */
168 offs = start_sect(p)*sector_size;
169 size = nr_sects(p)*sector_size;
170 next = this_sector + offs;
171 if (i >= 2) {
172 if (offs + size > this_size)
173 continue;
174 if (next < first_sector)
175 continue;
176 if (next + size > first_sector + first_size)
177 continue;
178 }
179
180 put_partition(state, state->next, next, size);
181 set_info(state, state->next, disksig);
182 if (SYS_IND(p) == LINUX_RAID_PARTITION)
183 state->parts[state->next].flags = ADDPART_FLAG_RAID;
184 loopct = 0;
185 if (++state->next == state->limit)
186 goto done;
187 }
188 /*
189 * Next, process the (first) extended partition, if present.
190 * (So far, there seems to be no reason to make
191 * parse_extended() recursive and allow a tree
192 * of extended partitions.)
193 * It should be a link to the next logical partition.
194 */
195 p -= 4;
196 for (i=0; i<4; i++, p++)
197 if (nr_sects(p) && is_extended_partition(p))
198 break;
199 if (i == 4)
200 goto done; /* nothing left to do */
201
202 this_sector = first_sector + start_sect(p) * sector_size;
203 this_size = nr_sects(p) * sector_size;
204 put_dev_sector(sect);
205 }
206done:
207 put_dev_sector(sect);
208}
209
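Editor's note: the comment and code above describe how logical partitions chain together -- each EBR's first entry is the data partition (start relative to that EBR) and its second entry links to the next EBR (start relative to the whole extended partition). The sketch below walks a made-up chain to show the address arithmetic; it assumes 512-byte sectors and omits the logical-block-size scaling that parse_extended() applies.

#include <stdio.h>

/*
 * Illustrative only: the chain walk performed by parse_extended() above.
 * The sample chain, sizes and sector numbers are invented.
 */
struct ebr_entry {
	unsigned long start;	/* relative start, in sectors  */
	unsigned long size;	/* size in sectors, 0 = unused */
};

struct ebr {
	struct ebr_entry data;	/* first entry: the logical partition */
	struct ebr_entry next;	/* second entry: link to the next EBR */
};

int main(void)
{
	/* Hypothetical extended partition starting at sector 2048. */
	const unsigned long first_sector = 2048;
	const struct ebr chain[] = {
		{ { 63, 10000 }, { 10063, 20063 } },
		{ { 63, 20000 }, { 30126, 40063 } },
		{ { 63, 40000 }, {     0,     0 } },	/* end of chain */
	};
	unsigned long this_sector = first_sector;
	unsigned int i;

	for (i = 0; i < 3; i++) {
		/* Data entry: start is relative to the current EBR. */
		printf("logical %u: absolute start %lu, size %lu\n", i + 5,
		       this_sector + chain[i].data.start, chain[i].data.size);
		if (!chain[i].next.size)
			break;
		/* Link entry: start is relative to the extended partition. */
		this_sector = first_sector + chain[i].next.start;
	}
	return 0;
}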
210/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also
211 indicates linux swap. Be careful before believing this is Solaris. */
212
213static void parse_solaris_x86(struct parsed_partitions *state,
214 sector_t offset, sector_t size, int origin)
215{
216#ifdef CONFIG_SOLARIS_X86_PARTITION
217 Sector sect;
218 struct solaris_x86_vtoc *v;
219 int i;
220 short max_nparts;
221
222 v = read_part_sector(state, offset + 1, &sect);
223 if (!v)
224 return;
225 if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) {
226 put_dev_sector(sect);
227 return;
228 }
229 {
230 char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1];
231
232 snprintf(tmp, sizeof(tmp), " %s%d: <solaris:", state->name, origin);
233 strlcat(state->pp_buf, tmp, PAGE_SIZE);
234 }
235 if (le32_to_cpu(v->v_version) != 1) {
236 char tmp[64];
237
238 snprintf(tmp, sizeof(tmp), " cannot handle version %d vtoc>\n",
239 le32_to_cpu(v->v_version));
240 strlcat(state->pp_buf, tmp, PAGE_SIZE);
241 put_dev_sector(sect);
242 return;
243 }
244 /* Ensure we can handle previous case of VTOC with 8 entries gracefully */
245 max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8;
246 for (i=0; i<max_nparts && state->next<state->limit; i++) {
247 struct solaris_x86_slice *s = &v->v_slice[i];
248 char tmp[3 + 10 + 1 + 1];
249
250 if (s->s_size == 0)
251 continue;
252 snprintf(tmp, sizeof(tmp), " [s%d]", i);
253 strlcat(state->pp_buf, tmp, PAGE_SIZE);
254 /* solaris partitions are relative to current MS-DOS
255 * one; must add the offset of the current partition */
256 put_partition(state, state->next++,
257 le32_to_cpu(s->s_start)+offset,
258 le32_to_cpu(s->s_size));
259 }
260 put_dev_sector(sect);
261 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
262#endif
263}
264
265#if defined(CONFIG_BSD_DISKLABEL)
266/*
267 * Create devices for BSD partitions listed in a disklabel, under a
268 * dos-like partition. See parse_extended() for more information.
269 */
270static void parse_bsd(struct parsed_partitions *state,
271 sector_t offset, sector_t size, int origin, char *flavour,
272 int max_partitions)
273{
274 Sector sect;
275 struct bsd_disklabel *l;
276 struct bsd_partition *p;
277 char tmp[64];
278
279 l = read_part_sector(state, offset + 1, &sect);
280 if (!l)
281 return;
282 if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) {
283 put_dev_sector(sect);
284 return;
285 }
286
287 snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour);
288 strlcat(state->pp_buf, tmp, PAGE_SIZE);
289
290 if (le16_to_cpu(l->d_npartitions) < max_partitions)
291 max_partitions = le16_to_cpu(l->d_npartitions);
292 for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) {
293 sector_t bsd_start, bsd_size;
294
295 if (state->next == state->limit)
296 break;
297 if (p->p_fstype == BSD_FS_UNUSED)
298 continue;
299 bsd_start = le32_to_cpu(p->p_offset);
300 bsd_size = le32_to_cpu(p->p_size);
301 if (offset == bsd_start && size == bsd_size)
302 /* full parent partition, we have it already */
303 continue;
304 if (offset > bsd_start || offset+size < bsd_start+bsd_size) {
305 strlcat(state->pp_buf, "bad subpartition - ignored\n", PAGE_SIZE);
306 continue;
307 }
308 put_partition(state, state->next++, bsd_start, bsd_size);
309 }
310 put_dev_sector(sect);
311 if (le16_to_cpu(l->d_npartitions) > max_partitions) {
312 snprintf(tmp, sizeof(tmp), " (ignored %d more)",
313 le16_to_cpu(l->d_npartitions) - max_partitions);
314 strlcat(state->pp_buf, tmp, PAGE_SIZE);
315 }
316 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
317}
318#endif
319
320static void parse_freebsd(struct parsed_partitions *state,
321 sector_t offset, sector_t size, int origin)
322{
323#ifdef CONFIG_BSD_DISKLABEL
324 parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS);
325#endif
326}
327
328static void parse_netbsd(struct parsed_partitions *state,
329 sector_t offset, sector_t size, int origin)
330{
331#ifdef CONFIG_BSD_DISKLABEL
332 parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS);
333#endif
334}
335
336static void parse_openbsd(struct parsed_partitions *state,
337 sector_t offset, sector_t size, int origin)
338{
339#ifdef CONFIG_BSD_DISKLABEL
340 parse_bsd(state, offset, size, origin, "openbsd",
341 OPENBSD_MAXPARTITIONS);
342#endif
343}
344
345/*
346 * Create devices for Unixware partitions listed in a disklabel, under a
347 * dos-like partition. See parse_extended() for more information.
348 */
349static void parse_unixware(struct parsed_partitions *state,
350 sector_t offset, sector_t size, int origin)
351{
352#ifdef CONFIG_UNIXWARE_DISKLABEL
353 Sector sect;
354 struct unixware_disklabel *l;
355 struct unixware_slice *p;
356
357 l = read_part_sector(state, offset + 29, &sect);
358 if (!l)
359 return;
360 if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC ||
361 le32_to_cpu(l->vtoc.v_magic) != UNIXWARE_DISKMAGIC2) {
362 put_dev_sector(sect);
363 return;
364 }
365 {
366 char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1];
367
368 snprintf(tmp, sizeof(tmp), " %s%d: <unixware:", state->name, origin);
369 strlcat(state->pp_buf, tmp, PAGE_SIZE);
370 }
371 p = &l->vtoc.v_slice[1];
372	/* I omit the 0th slice as it is the same as the whole disk. */
373 while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) {
374 if (state->next == state->limit)
375 break;
376
377 if (p->s_label != UNIXWARE_FS_UNUSED)
378 put_partition(state, state->next++,
379 le32_to_cpu(p->start_sect),
380 le32_to_cpu(p->nr_sects));
381 p++;
382 }
383 put_dev_sector(sect);
384 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
385#endif
386}
387
388/*
389 * Minix 2.0.0/2.0.2 subpartition support.
390 * Anand Krishnamurthy <anandk@wiproge.med.ge.com>
391 * Rajeev V. Pillai <rajeevvp@yahoo.com>
392 */
393static void parse_minix(struct parsed_partitions *state,
394 sector_t offset, sector_t size, int origin)
395{
396#ifdef CONFIG_MINIX_SUBPARTITION
397 Sector sect;
398 unsigned char *data;
399 struct partition *p;
400 int i;
401
402 data = read_part_sector(state, offset, &sect);
403 if (!data)
404 return;
405
406 p = (struct partition *)(data + 0x1be);
407
408 /* The first sector of a Minix partition can have either
409 * a secondary MBR describing its subpartitions, or
410 * the normal boot sector. */
411 if (msdos_magic_present (data + 510) &&
412 SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */
413 char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1];
414
415 snprintf(tmp, sizeof(tmp), " %s%d: <minix:", state->name, origin);
416 strlcat(state->pp_buf, tmp, PAGE_SIZE);
417 for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) {
418 if (state->next == state->limit)
419 break;
420 /* add each partition in use */
421 if (SYS_IND(p) == MINIX_PARTITION)
422 put_partition(state, state->next++,
423 start_sect(p), nr_sects(p));
424 }
425 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
426 }
427 put_dev_sector(sect);
428#endif /* CONFIG_MINIX_SUBPARTITION */
429}
430
431static struct {
432 unsigned char id;
433 void (*parse)(struct parsed_partitions *, sector_t, sector_t, int);
434} subtypes[] = {
435 {FREEBSD_PARTITION, parse_freebsd},
436 {NETBSD_PARTITION, parse_netbsd},
437 {OPENBSD_PARTITION, parse_openbsd},
438 {MINIX_PARTITION, parse_minix},
439 {UNIXWARE_PARTITION, parse_unixware},
440 {SOLARIS_X86_PARTITION, parse_solaris_x86},
441 {NEW_SOLARIS_X86_PARTITION, parse_solaris_x86},
442 {0, NULL},
443};
444
445int msdos_partition(struct parsed_partitions *state)
446{
447 sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
448 Sector sect;
449 unsigned char *data;
450 struct partition *p;
451 struct fat_boot_sector *fb;
452 int slot;
453 u32 disksig;
454
455 data = read_part_sector(state, 0, &sect);
456 if (!data)
457 return -1;
458 if (!msdos_magic_present(data + 510)) {
459 put_dev_sector(sect);
460 return 0;
461 }
462
463 if (aix_magic_present(state, data)) {
464 put_dev_sector(sect);
465 strlcat(state->pp_buf, " [AIX]", PAGE_SIZE);
466 return 0;
467 }
468
469 /*
470 * Now that the 55aa signature is present, this is probably
471 * either the boot sector of a FAT filesystem or a DOS-type
472 * partition table. Reject this in case the boot indicator
473 * is not 0 or 0x80.
474 */
475 p = (struct partition *) (data + 0x1be);
476 for (slot = 1; slot <= 4; slot++, p++) {
477 if (p->boot_ind != 0 && p->boot_ind != 0x80) {
478 /*
479			 * Even without a valid boot indicator value
480			 * it's still possible this is a valid FAT filesystem
481 * without a partition table.
482 */
483 fb = (struct fat_boot_sector *) data;
484 if (slot == 1 && fb->reserved && fb->fats
485 && fat_valid_media(fb->media)) {
486 strlcat(state->pp_buf, "\n", PAGE_SIZE);
487 put_dev_sector(sect);
488 return 1;
489 } else {
490 put_dev_sector(sect);
491 return 0;
492 }
493 }
494 }
495
496#ifdef CONFIG_EFI_PARTITION
497 p = (struct partition *) (data + 0x1be);
498 for (slot = 1 ; slot <= 4 ; slot++, p++) {
499 /* If this is an EFI GPT disk, msdos should ignore it. */
500 if (SYS_IND(p) == EFI_PMBR_OSTYPE_EFI_GPT) {
501 put_dev_sector(sect);
502 return 0;
503 }
504 }
505#endif
506 p = (struct partition *) (data + 0x1be);
507
508 disksig = le32_to_cpup((__le32 *)(data + 0x1b8));
509
510 /*
511 * Look for partitions in two passes:
512 * First find the primary and DOS-type extended partitions.
513 * On the second pass look inside *BSD, Unixware and Solaris partitions.
514 */
515
516 state->next = 5;
517 for (slot = 1 ; slot <= 4 ; slot++, p++) {
518 sector_t start = start_sect(p)*sector_size;
519 sector_t size = nr_sects(p)*sector_size;
520 if (!size)
521 continue;
522 if (is_extended_partition(p)) {
523 /*
524 * prevent someone doing mkfs or mkswap on an
525 * extended partition, but leave room for LILO
526 * FIXME: this uses one logical sector for > 512b
527 * sector, although it may not be enough/proper.
528 */
529 sector_t n = 2;
530 n = min(size, max(sector_size, n));
531 put_partition(state, slot, start, n);
532
533 strlcat(state->pp_buf, " <", PAGE_SIZE);
534 parse_extended(state, start, size, disksig);
535 strlcat(state->pp_buf, " >", PAGE_SIZE);
536 continue;
537 }
538 put_partition(state, slot, start, size);
539 set_info(state, slot, disksig);
540 if (SYS_IND(p) == LINUX_RAID_PARTITION)
541 state->parts[slot].flags = ADDPART_FLAG_RAID;
542 if (SYS_IND(p) == DM6_PARTITION)
543 strlcat(state->pp_buf, "[DM]", PAGE_SIZE);
544 if (SYS_IND(p) == EZD_PARTITION)
545 strlcat(state->pp_buf, "[EZD]", PAGE_SIZE);
546 }
547
548 strlcat(state->pp_buf, "\n", PAGE_SIZE);
549
550 /* second pass - output for each on a separate line */
551 p = (struct partition *) (0x1be + data);
552 for (slot = 1 ; slot <= 4 ; slot++, p++) {
553 unsigned char id = SYS_IND(p);
554 int n;
555
556 if (!nr_sects(p))
557 continue;
558
559 for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++)
560 ;
561
562 if (!subtypes[n].parse)
563 continue;
564 subtypes[n].parse(state, start_sect(p) * sector_size,
565 nr_sects(p) * sector_size, slot);
566 }
567 put_dev_sector(sect);
568 return 1;
569}
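Editor's note: msdos_partition() above reads the 32-bit NT disk signature little-endian from byte offset 0x1b8 of the MBR, and set_info() turns it into a "%08x-%02x" identifier per slot. A stand-alone sketch of that derivation; the sample signature bytes are invented.

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only: deriving the pseudo-uuid that set_info() above
 * builds from the MBR disk signature and the partition slot number.
 */
static uint32_t le32(const uint8_t *p)
{
	return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
	       ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

int main(void)
{
	uint8_t mbr[512] = { 0 };
	char uuid[16];
	int slot = 1;

	/* Hypothetical disk signature 0x12345678, stored little-endian. */
	mbr[0x1b8] = 0x78;
	mbr[0x1b9] = 0x56;
	mbr[0x1ba] = 0x34;
	mbr[0x1bb] = 0x12;

	snprintf(uuid, sizeof(uuid), "%08x-%02x",
		 (unsigned int)le32(&mbr[0x1b8]), (unsigned int)slot);
	printf("partition %d pseudo-uuid: %s\n", slot, uuid);
	return 0;
}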
diff --git a/block/partitions/msdos.h b/block/partitions/msdos.h
deleted file mode 100644
index 38c781c490b..00000000000
--- a/block/partitions/msdos.h
+++ /dev/null
@@ -1,8 +0,0 @@
1/*
2 * fs/partitions/msdos.h
3 */
4
5#define MSDOS_LABEL_MAGIC 0xAA55
6
7int msdos_partition(struct parsed_partitions *state);
8
diff --git a/block/partitions/osf.c b/block/partitions/osf.c
deleted file mode 100644
index 764b86a0196..00000000000
--- a/block/partitions/osf.c
+++ /dev/null
@@ -1,86 +0,0 @@
1/*
2 * fs/partitions/osf.c
3 *
4 * Code extracted from drivers/block/genhd.c
5 *
6 * Copyright (C) 1991-1998 Linus Torvalds
7 * Re-organised Feb 1998 Russell King
8 */
9
10#include "check.h"
11#include "osf.h"
12
13#define MAX_OSF_PARTITIONS 18
14
15int osf_partition(struct parsed_partitions *state)
16{
17 int i;
18 int slot = 1;
19 unsigned int npartitions;
20 Sector sect;
21 unsigned char *data;
22 struct disklabel {
23 __le32 d_magic;
24 __le16 d_type,d_subtype;
25 u8 d_typename[16];
26 u8 d_packname[16];
27 __le32 d_secsize;
28 __le32 d_nsectors;
29 __le32 d_ntracks;
30 __le32 d_ncylinders;
31 __le32 d_secpercyl;
32 __le32 d_secprtunit;
33 __le16 d_sparespertrack;
34 __le16 d_sparespercyl;
35 __le32 d_acylinders;
36 __le16 d_rpm, d_interleave, d_trackskew, d_cylskew;
37 __le32 d_headswitch, d_trkseek, d_flags;
38 __le32 d_drivedata[5];
39 __le32 d_spare[5];
40 __le32 d_magic2;
41 __le16 d_checksum;
42 __le16 d_npartitions;
43 __le32 d_bbsize, d_sbsize;
44 struct d_partition {
45 __le32 p_size;
46 __le32 p_offset;
47 __le32 p_fsize;
48 u8 p_fstype;
49 u8 p_frag;
50 __le16 p_cpg;
51 } d_partitions[MAX_OSF_PARTITIONS];
52 } * label;
53 struct d_partition * partition;
54
55 data = read_part_sector(state, 0, &sect);
56 if (!data)
57 return -1;
58
59 label = (struct disklabel *) (data+64);
60 partition = label->d_partitions;
61 if (le32_to_cpu(label->d_magic) != DISKLABELMAGIC) {
62 put_dev_sector(sect);
63 return 0;
64 }
65 if (le32_to_cpu(label->d_magic2) != DISKLABELMAGIC) {
66 put_dev_sector(sect);
67 return 0;
68 }
69 npartitions = le16_to_cpu(label->d_npartitions);
70 if (npartitions > MAX_OSF_PARTITIONS) {
71 put_dev_sector(sect);
72 return 0;
73 }
74 for (i = 0 ; i < npartitions; i++, partition++) {
75 if (slot == state->limit)
76 break;
77 if (le32_to_cpu(partition->p_size))
78 put_partition(state, slot,
79 le32_to_cpu(partition->p_offset),
80 le32_to_cpu(partition->p_size));
81 slot++;
82 }
83 strlcat(state->pp_buf, "\n", PAGE_SIZE);
84 put_dev_sector(sect);
85 return 1;
86}
diff --git a/block/partitions/osf.h b/block/partitions/osf.h
deleted file mode 100644
index 20ed2315ec1..00000000000
--- a/block/partitions/osf.h
+++ /dev/null
@@ -1,7 +0,0 @@
1/*
2 * fs/partitions/osf.h
3 */
4
5#define DISKLABELMAGIC (0x82564557UL)
6
7int osf_partition(struct parsed_partitions *state);
diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c
deleted file mode 100644
index ea8a86dceaf..00000000000
--- a/block/partitions/sgi.c
+++ /dev/null
@@ -1,82 +0,0 @@
1/*
2 * fs/partitions/sgi.c
3 *
4 * Code extracted from drivers/block/genhd.c
5 */
6
7#include "check.h"
8#include "sgi.h"
9
10struct sgi_disklabel {
11 __be32 magic_mushroom; /* Big fat spliff... */
12 __be16 root_part_num; /* Root partition number */
13 __be16 swap_part_num; /* Swap partition number */
14 s8 boot_file[16]; /* Name of boot file for ARCS */
15 u8 _unused0[48]; /* Device parameter useless crapola.. */
16 struct sgi_volume {
17 s8 name[8]; /* Name of volume */
18 __be32 block_num; /* Logical block number */
19 __be32 num_bytes; /* How big, in bytes */
20 } volume[15];
21 struct sgi_partition {
22 __be32 num_blocks; /* Size in logical blocks */
23 __be32 first_block; /* First logical block */
24 __be32 type; /* Type of this partition */
25 } partitions[16];
26 __be32 csum; /* Disk label checksum */
27 __be32 _unused1; /* Padding */
28};
29
30int sgi_partition(struct parsed_partitions *state)
31{
32 int i, csum;
33 __be32 magic;
34 int slot = 1;
35 unsigned int start, blocks;
36 __be32 *ui, cs;
37 Sector sect;
38 struct sgi_disklabel *label;
39 struct sgi_partition *p;
40 char b[BDEVNAME_SIZE];
41
42 label = read_part_sector(state, 0, &sect);
43 if (!label)
44 return -1;
45 p = &label->partitions[0];
46 magic = label->magic_mushroom;
47 if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) {
48 /*printk("Dev %s SGI disklabel: bad magic %08x\n",
49 bdevname(bdev, b), be32_to_cpu(magic));*/
50 put_dev_sector(sect);
51 return 0;
52 }
53 ui = ((__be32 *) (label + 1)) - 1;
54 for(csum = 0; ui >= ((__be32 *) label);) {
55 cs = *ui--;
56 csum += be32_to_cpu(cs);
57 }
58 if(csum) {
59 printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n",
60 bdevname(state->bdev, b));
61 put_dev_sector(sect);
62 return 0;
63 }
64 /* All SGI disk labels have 16 partitions, disks under Linux only
65	 * have 15 minors. Luckily there are always a few zero length
66 * partitions which we don't care about so we never overflow the
67 * current_minor.
68 */
69 for(i = 0; i < 16; i++, p++) {
70 blocks = be32_to_cpu(p->num_blocks);
71 start = be32_to_cpu(p->first_block);
72 if (blocks) {
73 put_partition(state, slot, start, blocks);
74 if (be32_to_cpu(p->type) == LINUX_RAID_PARTITION)
75 state->parts[slot].flags = ADDPART_FLAG_RAID;
76 }
77 slot++;
78 }
79 strlcat(state->pp_buf, "\n", PAGE_SIZE);
80 put_dev_sector(sect);
81 return 1;
82}
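Editor's note: the integrity rule enforced above is that the 32-bit big-endian words of the whole SGI label, including the stored csum field, must sum to zero modulo 2^32. The sketch below shrinks the label to four words and keeps everything in host byte order for brevity; the word values are invented.

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only: the zero-sum checksum rule checked by
 * sgi_partition() above, on a tiny stand-in "label".  The kernel
 * converts each word from big-endian first; that step is omitted here.
 */
#define WORDS 4

int main(void)
{
	uint32_t label[WORDS] = { 0x0be5a941, 0x00010002, 0x12345678, 0 };
	uint32_t sum = 0;
	int i;

	/* Fill the csum slot so the total wraps to zero. */
	for (i = 0; i < WORDS - 1; i++)
		sum += label[i];
	label[WORDS - 1] = (uint32_t)(0u - sum);

	/* Verification pass over every word, including the csum itself. */
	sum = 0;
	for (i = 0; i < WORDS; i++)
		sum += label[i];
	printf("label checksum %s\n", sum == 0 ? "ok" : "bad");
	return 0;
}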
diff --git a/block/partitions/sgi.h b/block/partitions/sgi.h
deleted file mode 100644
index b9553ebdd5a..00000000000
--- a/block/partitions/sgi.h
+++ /dev/null
@@ -1,8 +0,0 @@
1/*
2 * fs/partitions/sgi.h
3 */
4
5extern int sgi_partition(struct parsed_partitions *state);
6
7#define SGI_LABEL_MAGIC 0x0be5a941
8
diff --git a/block/partitions/sun.c b/block/partitions/sun.c
deleted file mode 100644
index b5b6fcfb3d3..00000000000
--- a/block/partitions/sun.c
+++ /dev/null
@@ -1,122 +0,0 @@
1/*
2 * fs/partitions/sun.c
3 *
4 * Code extracted from drivers/block/genhd.c
5 *
6 * Copyright (C) 1991-1998 Linus Torvalds
7 * Re-organised Feb 1998 Russell King
8 */
9
10#include "check.h"
11#include "sun.h"
12
13int sun_partition(struct parsed_partitions *state)
14{
15 int i;
16 __be16 csum;
17 int slot = 1;
18 __be16 *ush;
19 Sector sect;
20 struct sun_disklabel {
21 unsigned char info[128]; /* Informative text string */
22 struct sun_vtoc {
23 __be32 version; /* Layout version */
24 char volume[8]; /* Volume name */
25 __be16 nparts; /* Number of partitions */
26 struct sun_info { /* Partition hdrs, sec 2 */
27 __be16 id;
28 __be16 flags;
29 } infos[8];
30 __be16 padding; /* Alignment padding */
31 __be32 bootinfo[3]; /* Info needed by mboot */
32 __be32 sanity; /* To verify vtoc sanity */
33 __be32 reserved[10]; /* Free space */
34 __be32 timestamp[8]; /* Partition timestamp */
35 } vtoc;
36 __be32 write_reinstruct; /* sectors to skip, writes */
37 __be32 read_reinstruct; /* sectors to skip, reads */
38 unsigned char spare[148]; /* Padding */
39 __be16 rspeed; /* Disk rotational speed */
40 __be16 pcylcount; /* Physical cylinder count */
41 __be16 sparecyl; /* extra sects per cylinder */
42 __be16 obs1; /* gap1 */
43 __be16 obs2; /* gap2 */
44 __be16 ilfact; /* Interleave factor */
45 __be16 ncyl; /* Data cylinder count */
46 __be16 nacyl; /* Alt. cylinder count */
47 __be16 ntrks; /* Tracks per cylinder */
48 __be16 nsect; /* Sectors per track */
49 __be16 obs3; /* bhead - Label head offset */
50 __be16 obs4; /* ppart - Physical Partition */
51 struct sun_partition {
52 __be32 start_cylinder;
53 __be32 num_sectors;
54 } partitions[8];
55 __be16 magic; /* Magic number */
56 __be16 csum; /* Label xor'd checksum */
57 } * label;
58 struct sun_partition *p;
59 unsigned long spc;
60 char b[BDEVNAME_SIZE];
61 int use_vtoc;
62 int nparts;
63
64 label = read_part_sector(state, 0, &sect);
65 if (!label)
66 return -1;
67
68 p = label->partitions;
69 if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) {
70/* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n",
71 bdevname(bdev, b), be16_to_cpu(label->magic)); */
72 put_dev_sector(sect);
73 return 0;
74 }
75 /* Look at the checksum */
76 ush = ((__be16 *) (label+1)) - 1;
77 for (csum = 0; ush >= ((__be16 *) label);)
78 csum ^= *ush--;
79 if (csum) {
80 printk("Dev %s Sun disklabel: Csum bad, label corrupted\n",
81 bdevname(state->bdev, b));
82 put_dev_sector(sect);
83 return 0;
84 }
85
86 /* Check to see if we can use the VTOC table */
87 use_vtoc = ((be32_to_cpu(label->vtoc.sanity) == SUN_VTOC_SANITY) &&
88 (be32_to_cpu(label->vtoc.version) == 1) &&
89 (be16_to_cpu(label->vtoc.nparts) <= 8));
90
91 /* Use 8 partition entries if not specified in validated VTOC */
92 nparts = (use_vtoc) ? be16_to_cpu(label->vtoc.nparts) : 8;
93
94 /*
95 * So that old Linux-Sun partitions continue to work,
96	 * allow the VTOC to be used under the additional condition ...
97 */
98 use_vtoc = use_vtoc || !(label->vtoc.sanity ||
99 label->vtoc.version || label->vtoc.nparts);
100 spc = be16_to_cpu(label->ntrks) * be16_to_cpu(label->nsect);
101 for (i = 0; i < nparts; i++, p++) {
102 unsigned long st_sector;
103 unsigned int num_sectors;
104
105 st_sector = be32_to_cpu(p->start_cylinder) * spc;
106 num_sectors = be32_to_cpu(p->num_sectors);
107 if (num_sectors) {
108 put_partition(state, slot, st_sector, num_sectors);
109 state->parts[slot].flags = 0;
110 if (use_vtoc) {
111 if (be16_to_cpu(label->vtoc.infos[i].id) == LINUX_RAID_PARTITION)
112 state->parts[slot].flags |= ADDPART_FLAG_RAID;
113 else if (be16_to_cpu(label->vtoc.infos[i].id) == SUN_WHOLE_DISK)
114 state->parts[slot].flags |= ADDPART_FLAG_WHOLEDISK;
115 }
116 }
117 slot++;
118 }
119 strlcat(state->pp_buf, "\n", PAGE_SIZE);
120 put_dev_sector(sect);
121 return 1;
122}
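Editor's note: two pieces of arithmetic from sun_partition() above, in a stand-alone sketch. First, the label's 16-bit words, including the stored csum, must XOR to zero; second, partition starts are recorded in cylinders and converted to sectors with spc = ntrks * nsect. Endianness conversion is omitted and all values are hypothetical.

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only: the XOR checksum and the cylinder-to-sector
 * conversion used by sun_partition() above, on made-up values.
 */
#define WORDS 8

int main(void)
{
	uint16_t label[WORDS] = { 0xDABE, 0x0001, 0x1234, 0x5678,
				  0x0008, 0x00FF, 0x003F, 0 };
	uint16_t csum = 0;
	int i;

	/* Fill the csum slot so the whole label XORs to zero, then re-check. */
	for (i = 0; i < WORDS - 1; i++)
		csum ^= label[i];
	label[WORDS - 1] = csum;
	for (csum = 0, i = 0; i < WORDS; i++)
		csum ^= label[i];
	printf("label checksum %s\n", csum == 0 ? "ok" : "bad");

	/* Cylinder -> sector conversion: spc = ntrks * nsect. */
	{
		unsigned long ntrks = 255, nsect = 63, start_cyl = 4;
		unsigned long spc = ntrks * nsect;

		printf("partition starts at sector %lu\n", start_cyl * spc);
	}
	return 0;
}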
diff --git a/block/partitions/sun.h b/block/partitions/sun.h
deleted file mode 100644
index 2424baa8319..00000000000
--- a/block/partitions/sun.h
+++ /dev/null
@@ -1,8 +0,0 @@
1/*
2 * fs/partitions/sun.h
3 */
4
5#define SUN_LABEL_MAGIC 0xDABE
6#define SUN_VTOC_SANITY 0x600DDEEE
7
8int sun_partition(struct parsed_partitions *state);
diff --git a/block/partitions/sysv68.c b/block/partitions/sysv68.c
deleted file mode 100644
index 9627ccffc1c..00000000000
--- a/block/partitions/sysv68.c
+++ /dev/null
@@ -1,95 +0,0 @@
1/*
2 * fs/partitions/sysv68.c
3 *
4 * Copyright (C) 2007 Philippe De Muyter <phdm@macqel.be>
5 */
6
7#include "check.h"
8#include "sysv68.h"
9
10/*
11 * Volume ID structure: on first 256-bytes sector of disk
12 */
13
14struct volumeid {
15 u8 vid_unused[248];
16 u8 vid_mac[8]; /* ASCII string "MOTOROLA" */
17};
18
19/*
20 * config block: second 256-bytes sector on disk
21 */
22
23struct dkconfig {
24 u8 ios_unused0[128];
25 __be32 ios_slcblk; /* Slice table block number */
26 __be16 ios_slccnt; /* Number of entries in slice table */
27 u8 ios_unused1[122];
28};
29
30/*
31 * combined volumeid and dkconfig block
32 */
33
34struct dkblk0 {
35 struct volumeid dk_vid;
36 struct dkconfig dk_ios;
37};
38
39/*
40 * Slice Table Structure
41 */
42
43struct slice {
44 __be32 nblocks; /* slice size (in blocks) */
45 __be32 blkoff; /* block offset of slice */
46};
47
48
49int sysv68_partition(struct parsed_partitions *state)
50{
51 int i, slices;
52 int slot = 1;
53 Sector sect;
54 unsigned char *data;
55 struct dkblk0 *b;
56 struct slice *slice;
57 char tmp[64];
58
59 data = read_part_sector(state, 0, &sect);
60 if (!data)
61 return -1;
62
63 b = (struct dkblk0 *)data;
64 if (memcmp(b->dk_vid.vid_mac, "MOTOROLA", sizeof(b->dk_vid.vid_mac))) {
65 put_dev_sector(sect);
66 return 0;
67 }
68 slices = be16_to_cpu(b->dk_ios.ios_slccnt);
69 i = be32_to_cpu(b->dk_ios.ios_slcblk);
70 put_dev_sector(sect);
71
72 data = read_part_sector(state, i, &sect);
73 if (!data)
74 return -1;
75
76 slices -= 1; /* last slice is the whole disk */
77 snprintf(tmp, sizeof(tmp), "sysV68: %s(s%u)", state->name, slices);
78 strlcat(state->pp_buf, tmp, PAGE_SIZE);
79 slice = (struct slice *)data;
80 for (i = 0; i < slices; i++, slice++) {
81 if (slot == state->limit)
82 break;
83 if (be32_to_cpu(slice->nblocks)) {
84 put_partition(state, slot,
85 be32_to_cpu(slice->blkoff),
86 be32_to_cpu(slice->nblocks));
87 snprintf(tmp, sizeof(tmp), "(s%u)", i);
88 strlcat(state->pp_buf, tmp, PAGE_SIZE);
89 }
90 slot++;
91 }
92 strlcat(state->pp_buf, "\n", PAGE_SIZE);
93 put_dev_sector(sect);
94 return 1;
95}
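Editor's note: sysv68_partition() above finds the "MOTOROLA" volume-id string in the first 512-byte sector, then takes the big-endian slice-table block number and entry count from the config half of that sector, skipping the final slice because it covers the whole disk. The sketch below hard-codes byte offsets 248, 384 and 388, which follow from the structure layout above assuming no padding; the sample sector contents are invented.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Illustrative only: the fields pulled out of sector 0 by
 * sysv68_partition() above, on a synthetic buffer.
 */
static uint32_t be32(const uint8_t *p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
}

static uint16_t be16(const uint8_t *p)
{
	return (uint16_t)((p[0] << 8) | p[1]);
}

int main(void)
{
	uint8_t sector0[512] = { 0 };

	memcpy(&sector0[248], "MOTOROLA", 8);	/* vid_mac          */
	sector0[384 + 3] = 2;			/* ios_slcblk = 2   */
	sector0[388 + 1] = 5;			/* ios_slccnt = 5   */

	if (memcmp(&sector0[248], "MOTOROLA", 8) != 0) {
		printf("not a sysV68 disk\n");
		return 0;
	}
	/* The last slice is the whole disk, so only slccnt - 1 are reported. */
	printf("slice table in block %u, %u data slices\n",
	       (unsigned int)be32(&sector0[384]),
	       (unsigned int)(be16(&sector0[388]) - 1));
	return 0;
}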
diff --git a/block/partitions/sysv68.h b/block/partitions/sysv68.h
deleted file mode 100644
index bf2f5ffa97a..00000000000
--- a/block/partitions/sysv68.h
+++ /dev/null
@@ -1 +0,0 @@
1extern int sysv68_partition(struct parsed_partitions *state);
diff --git a/block/partitions/ultrix.c b/block/partitions/ultrix.c
deleted file mode 100644
index 8dbaf9f77a9..00000000000
--- a/block/partitions/ultrix.c
+++ /dev/null
@@ -1,48 +0,0 @@
1/*
2 * fs/partitions/ultrix.c
3 *
4 * Code extracted from drivers/block/genhd.c
5 *
6 * Re-organised Jul 1999 Russell King
7 */
8
9#include "check.h"
10#include "ultrix.h"
11
12int ultrix_partition(struct parsed_partitions *state)
13{
14 int i;
15 Sector sect;
16 unsigned char *data;
17 struct ultrix_disklabel {
18		s32	pt_magic;	/* magic no. indicating part. info exists */
19 s32 pt_valid; /* set by driver if pt is current */
20 struct pt_info {
21 s32 pi_nblocks; /* no. of sectors */
22 u32 pi_blkoff; /* block offset for start */
23 } pt_part[8];
24 } *label;
25
26#define PT_MAGIC 0x032957 /* Partition magic number */
27#define PT_VALID 1 /* Indicates if struct is valid */
28
29 data = read_part_sector(state, (16384 - sizeof(*label))/512, &sect);
30 if (!data)
31 return -1;
32
33 label = (struct ultrix_disklabel *)(data + 512 - sizeof(*label));
34
35 if (label->pt_magic == PT_MAGIC && label->pt_valid == PT_VALID) {
36 for (i=0; i<8; i++)
37 if (label->pt_part[i].pi_nblocks)
38 put_partition(state, i+1,
39 label->pt_part[i].pi_blkoff,
40 label->pt_part[i].pi_nblocks);
41 put_dev_sector(sect);
42 strlcat(state->pp_buf, "\n", PAGE_SIZE);
43 return 1;
44 } else {
45 put_dev_sector(sect);
46 return 0;
47 }
48}
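Editor's note: ultrix_partition() above relies on the disklabel occupying the last sizeof(label) bytes of the first 16384 bytes of the disk, so it reads 512-byte sector (16384 - sizeof(label)) / 512 and finds the label at offset 512 - sizeof(label) within it. A stand-alone check of that arithmetic, assuming the unpadded 72-byte label size implied by the structure above:

#include <stdio.h>

/*
 * Illustrative only: where the Ultrix disklabel sits, per the placement
 * rule used by ultrix_partition() above.
 */
int main(void)
{
	/* 2 * 4 bytes of header + 8 partition entries of 2 * 4 bytes each. */
	const unsigned long label_size = 2 * 4 + 8 * (2 * 4);
	unsigned long sector = (16384 - label_size) / 512;
	unsigned long offset = 512 - label_size;

	printf("label: sector %lu, offset %lu, ends at byte %lu\n",
	       sector, offset, sector * 512 + offset + label_size);
	return 0;
}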
diff --git a/block/partitions/ultrix.h b/block/partitions/ultrix.h
deleted file mode 100644
index a3cc00b2bde..00000000000
--- a/block/partitions/ultrix.h
+++ /dev/null
@@ -1,5 +0,0 @@
1/*
2 * fs/partitions/ultrix.h
3 */
4
5int ultrix_partition(struct parsed_partitions *state);
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 9a87daa6f4f..4f4230b79bb 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -24,7 +24,6 @@
24#include <linux/capability.h> 24#include <linux/capability.h>
25#include <linux/completion.h> 25#include <linux/completion.h>
26#include <linux/cdrom.h> 26#include <linux/cdrom.h>
27#include <linux/ratelimit.h>
28#include <linux/slab.h> 27#include <linux/slab.h>
29#include <linux/times.h> 28#include <linux/times.h>
30#include <asm/uaccess.h> 29#include <asm/uaccess.h>
@@ -566,7 +565,7 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod
566{ 565{
567 int err; 566 int err;
568 567
569 if (!q) 568 if (!q || blk_get_queue(q))
570 return -ENXIO; 569 return -ENXIO;
571 570
572 switch (cmd) { 571 switch (cmd) {
@@ -687,64 +686,11 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod
687 err = -ENOTTY; 686 err = -ENOTTY;
688 } 687 }
689 688
689 blk_put_queue(q);
690 return err; 690 return err;
691} 691}
692EXPORT_SYMBOL(scsi_cmd_ioctl); 692EXPORT_SYMBOL(scsi_cmd_ioctl);
693 693
694int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd)
695{
696 if (bd && bd == bd->bd_contains)
697 return 0;
698
699 /* Actually none of these is particularly useful on a partition,
700 * but they are safe.
701 */
702 switch (cmd) {
703 case SCSI_IOCTL_GET_IDLUN:
704 case SCSI_IOCTL_GET_BUS_NUMBER:
705 case SCSI_IOCTL_GET_PCI:
706 case SCSI_IOCTL_PROBE_HOST:
707 case SG_GET_VERSION_NUM:
708 case SG_SET_TIMEOUT:
709 case SG_GET_TIMEOUT:
710 case SG_GET_RESERVED_SIZE:
711 case SG_SET_RESERVED_SIZE:
712 case SG_EMULATED_HOST:
713 return 0;
714 case CDROM_GET_CAPABILITY:
715 /* Keep this until we remove the printk below. udev sends it
716 * and we do not want to spam dmesg about it. CD-ROMs do
717 * not have partitions, so we get here only for disks.
718 */
719 return -ENOIOCTLCMD;
720 default:
721 break;
722 }
723
724 if (capable(CAP_SYS_RAWIO))
725 return 0;
726
727 /* In particular, rule out all resets and host-specific ioctls. */
728 printk_ratelimited(KERN_WARNING
729 "%s: sending ioctl %x to a partition!\n", current->comm, cmd);
730
731 return -ENOIOCTLCMD;
732}
733EXPORT_SYMBOL(scsi_verify_blk_ioctl);
734
735int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode,
736 unsigned int cmd, void __user *arg)
737{
738 int ret;
739
740 ret = scsi_verify_blk_ioctl(bd, cmd);
741 if (ret < 0)
742 return ret;
743
744 return scsi_cmd_ioctl(bd->bd_disk->queue, bd->bd_disk, mode, cmd, arg);
745}
746EXPORT_SYMBOL(scsi_cmd_blk_ioctl);
747
748static int __init blk_scsi_ioctl_init(void) 694static int __init blk_scsi_ioctl_init(void)
749{ 695{
750 blk_set_cmd_filter_defaults(&blk_default_cmd_filter); 696 blk_set_cmd_filter_defaults(&blk_default_cmd_filter);