-rw-r--r--  Documentation/cgroups/blkio-controller.txt | 151
-rw-r--r--  block/Kconfig | 23
-rw-r--r--  block/Kconfig.iosched | 16
-rw-r--r--  block/Makefile | 2
-rw-r--r--  block/blk-barrier.c | 147
-rw-r--r--  block/blk-cgroup.c | 727
-rw-r--r--  block/blk-cgroup.h | 178
-rw-r--r--  block/blk-core.c | 13
-rw-r--r--  block/blk-lib.c | 233
-rw-r--r--  block/cfq-iosched.c | 80
-rw-r--r--  block/elevator.c | 9
-rw-r--r--  block/genhd.c | 1
-rw-r--r--  block/ioctl.c | 2
-rw-r--r--  drivers/block/drbd/drbd_int.h | 3
-rw-r--r--  drivers/block/drbd/drbd_receiver.c | 3
-rw-r--r--  fs/block_dev.c | 257
-rw-r--r--  fs/btrfs/extent-tree.c | 2
-rw-r--r--  fs/ext3/fsync.c | 3
-rw-r--r--  fs/ext4/fsync.c | 6
-rw-r--r--  fs/gfs2/rgrp.c | 5
-rw-r--r--  fs/jbd2/checkpoint.c | 3
-rw-r--r--  fs/jbd2/commit.c | 6
-rw-r--r--  fs/nilfs2/the_nilfs.c | 4
-rw-r--r--  fs/reiserfs/file.c | 3
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 3
-rw-r--r--  include/linux/backing-dev.h | 3
-rw-r--r--  include/linux/blkdev.h | 62
-rw-r--r--  include/linux/elevator.h | 6
-rw-r--r--  include/linux/fs.h | 1
-rw-r--r--  include/linux/writeback.h | 4
-rw-r--r--  init/Kconfig | 27
-rw-r--r--  kernel/sched_clock.c | 1
-rw-r--r--  mm/page-writeback.c | 39
-rw-r--r--  mm/swapfile.c | 9
34 files changed, 1699 insertions(+), 333 deletions(-)
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 630879cd9a42..48e0b21b0059 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -17,6 +17,9 @@ HOWTO
 You can do a very simple testing of running two dd threads in two different
 cgroups. Here is what you can do.
 
+- Enable Block IO controller
+	CONFIG_BLK_CGROUP=y
+
 - Enable group scheduling in CFQ
 	CONFIG_CFQ_GROUP_IOSCHED=y
 
@@ -54,32 +57,52 @@ cgroups. Here is what you can do.
 
 Various user visible config options
 ===================================
-CONFIG_CFQ_GROUP_IOSCHED
-	- Enables group scheduling in CFQ. Currently only 1 level of group
-	  creation is allowed.
-
-CONFIG_DEBUG_CFQ_IOSCHED
-	- Enables some debugging messages in blktrace. Also creates extra
-	  cgroup file blkio.dequeue.
-
-Config options selected automatically
-=====================================
-These config options are not user visible and are selected/deselected
-automatically based on IO scheduler configuration.
-
 CONFIG_BLK_CGROUP
-	- Block IO controller. Selected by CONFIG_CFQ_GROUP_IOSCHED.
+	- Block IO controller.
 
 CONFIG_DEBUG_BLK_CGROUP
-	- Debug help. Selected by CONFIG_DEBUG_CFQ_IOSCHED.
+	- Debug help. Right now some additional stats files show up in the
+	  cgroup if this option is enabled.
+
+CONFIG_CFQ_GROUP_IOSCHED
+	- Enables group scheduling in CFQ. Currently only 1 level of group
+	  creation is allowed.
 
 Details of cgroup files
 =======================
 - blkio.weight
-	- Specifies per cgroup weight.
-
+	- Specifies per cgroup weight. This is the default weight of the group
+	  on all the devices until and unless overridden by a per device rule.
+	  (See blkio.weight_device).
 	  Currently allowed range of weights is from 100 to 1000.
 
+- blkio.weight_device
+	- One can specify per cgroup per device rules using this interface.
+	  These rules override the default value of group weight as specified
+	  by blkio.weight.
+
+	  Following is the format.
+
+	  # echo dev_maj:dev_minor weight > /path/to/cgroup/blkio.weight_device
+	  Configure weight=300 on /dev/sdb (8:16) in this cgroup
+	  # echo 8:16 300 > blkio.weight_device
+	  # cat blkio.weight_device
+	  dev	weight
+	  8:16	300
+
+	  Configure weight=500 on /dev/sda (8:0) in this cgroup
+	  # echo 8:0 500 > blkio.weight_device
+	  # cat blkio.weight_device
+	  dev	weight
+	  8:0	500
+	  8:16	300
+
+	  Remove specific weight for /dev/sda in this cgroup
+	  # echo 8:0 0 > blkio.weight_device
+	  # cat blkio.weight_device
+	  dev	weight
+	  8:16	300
+
 - blkio.time
 	- disk time allocated to cgroup per device in milliseconds. First
 	  two fields specify the major and minor number of the device and
@@ -92,13 +115,105 @@ Details of cgroup files
 	  third field specifies the number of sectors transferred by the
 	  group to/from the device.
 
+- blkio.io_service_bytes
+	- Number of bytes transferred to/from the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. First two fields specify the major and minor number of the
+	  device, third field specifies the operation type and the fourth field
+	  specifies the number of bytes.
+
+- blkio.io_serviced
+	- Number of IOs completed to/from the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. First two fields specify the major and minor number of the
+	  device, third field specifies the operation type and the fourth field
+	  specifies the number of IOs.
+
+- blkio.io_service_time
+	- Total amount of time between request dispatch and request completion
+	  for the IOs done by this cgroup. This is in nanoseconds to make it
+	  meaningful for flash devices too. For devices with queue depth of 1,
+	  this time represents the actual service time. When queue_depth > 1,
+	  that is no longer true as requests may be served out of order. This
+	  may cause the service time for a given IO to include the service time
+	  of multiple IOs when served out of order which may result in total
+	  io_service_time > actual time elapsed. This time is further divided by
+	  the type of operation - read or write, sync or async. First two fields
+	  specify the major and minor number of the device, third field
+	  specifies the operation type and the fourth field specifies the
+	  io_service_time in ns.
+
+- blkio.io_wait_time
+	- Total amount of time the IOs for this cgroup spent waiting in the
+	  scheduler queues for service. This can be greater than the total time
+	  elapsed since it is cumulative io_wait_time for all IOs. It is not a
+	  measure of total time the cgroup spent waiting but rather a measure of
+	  the wait_time for its individual IOs. For devices with queue_depth > 1
+	  this metric does not include the time spent waiting for service once
+	  the IO is dispatched to the device but till it actually gets serviced
+	  (there might be a time lag here due to re-ordering of requests by the
+	  device). This is in nanoseconds to make it meaningful for flash
+	  devices too. This time is further divided by the type of operation -
+	  read or write, sync or async. First two fields specify the major and
+	  minor number of the device, third field specifies the operation type
+	  and the fourth field specifies the io_wait_time in ns.
+
+- blkio.io_merged
+	- Total number of bios/requests merged into requests belonging to this
+	  cgroup. This is further divided by the type of operation - read or
+	  write, sync or async.
+
+- blkio.io_queued
+	- Total number of requests queued up at any given instant for this
+	  cgroup. This is further divided by the type of operation - read or
+	  write, sync or async.
+
+- blkio.avg_queue_size
+	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+	  The average queue size for this cgroup over the entire time of this
+	  cgroup's existence. Queue size samples are taken each time one of the
+	  queues of this cgroup gets a timeslice.
+
+- blkio.group_wait_time
+	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+	  This is the amount of time the cgroup had to wait since it became busy
+	  (i.e., went from 0 to 1 request queued) to get a timeslice for one of
+	  its queues. This is different from the io_wait_time which is the
+	  cumulative total of the amount of time spent by each IO in that cgroup
+	  waiting in the scheduler queue. This is in nanoseconds. If this is
+	  read when the cgroup is in a waiting (for timeslice) state, the stat
+	  will only report the group_wait_time accumulated till the last time it
+	  got a timeslice and will not include the current delta.
+
+- blkio.empty_time
+	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+	  This is the amount of time a cgroup spends without any pending
+	  requests when not being served, i.e., it does not include any time
+	  spent idling for one of the queues of the cgroup. This is in
+	  nanoseconds. If this is read when the cgroup is in an empty state,
+	  the stat will only report the empty_time accumulated till the last
+	  time it had a pending request and will not include the current delta.
+
+- blkio.idle_time
+	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+	  This is the amount of time spent by the IO scheduler idling for a
+	  given cgroup in anticipation of a better request than the existing ones
+	  from other queues/cgroups. This is in nanoseconds. If this is read
+	  when the cgroup is in an idling state, the stat will only report the
+	  idle_time accumulated till the last idle period and will not include
+	  the current delta.
+
 - blkio.dequeue
-	- Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This
+	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This
 	  gives the statistics about how many times a group was dequeued
 	  from service tree of the device. First two fields specify the major
 	  and minor number of the device and third field specifies the number
 	  of times a group was dequeued from a particular device.
 
+- blkio.reset_stats
+	- Writing an int to this file will result in resetting all the stats
+	  for that cgroup.
+
 CFQ sysfs tunable
 =================
 /sys/block/<disk>/queue/iosched/group_isolation
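
The per-type stat files documented above (blkio.io_service_bytes, io_serviced, io_service_time, io_wait_time, io_merged, io_queued) all use the key format produced by blkio_get_key_name()/blkio_get_stat() in the block/blk-cgroup.c hunk further below: "major:minor <Read|Write|Sync|Async|Total> value", plus a cgroup-wide "Total" line. A rough illustration in the same style as the weight_device examples above; the device 8:16 and the counts are made up:

	# cat blkio.io_serviced
	8:16 Read 180
	8:16 Write 35
	8:16 Sync 190
	8:16 Async 25
	8:16 Total 215
	Total 215

	# echo 1 > blkio.reset_stats	(writing any integer clears the stats for this cgroup)
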
diff --git a/block/Kconfig b/block/Kconfig
index f9e89f4d94bb..9be0b56eaee1 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -77,29 +77,6 @@ config BLK_DEV_INTEGRITY
 	T10/SCSI Data Integrity Field or the T13/ATA External Path
 	Protection.  If in doubt, say N.
 
-config BLK_CGROUP
-	tristate "Block cgroup support"
-	depends on CGROUPS
-	depends on CFQ_GROUP_IOSCHED
-	default n
-	---help---
-	Generic block IO controller cgroup interface. This is the common
-	cgroup interface which should be used by various IO controlling
-	policies.
-
-	Currently, CFQ IO scheduler uses it to recognize task groups and
-	control disk bandwidth allocation (proportional time slice allocation)
-	to such task groups.
-
-config DEBUG_BLK_CGROUP
-	bool
-	depends on BLK_CGROUP
-	default n
-	---help---
-	Enable some debugging help. Currently it stores the cgroup path
-	in the blk group which can be used by cfq for tracing various
-	group related activity.
-
 endif # BLOCK
 
 config BLOCK_COMPAT
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index fc71cf071fb2..3199b76f795d 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -23,7 +23,8 @@ config IOSCHED_DEADLINE
 
 config IOSCHED_CFQ
 	tristate "CFQ I/O scheduler"
-	select BLK_CGROUP if CFQ_GROUP_IOSCHED
+	# If BLK_CGROUP is a module, CFQ has to be built as module.
+	depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y
 	default y
 	---help---
 	  The CFQ I/O scheduler tries to distribute bandwidth equally
@@ -33,22 +34,15 @@ config IOSCHED_CFQ
 
 	  This is the default I/O scheduler.
 
+	  Note: If BLK_CGROUP=m, then CFQ can be built only as module.
+
 config CFQ_GROUP_IOSCHED
 	bool "CFQ Group Scheduling support"
-	depends on IOSCHED_CFQ && CGROUPS
+	depends on IOSCHED_CFQ && BLK_CGROUP
 	default n
 	---help---
 	  Enable group IO scheduling in CFQ.
 
-config DEBUG_CFQ_IOSCHED
-	bool "Debug CFQ Scheduling"
-	depends on CFQ_GROUP_IOSCHED
-	select DEBUG_BLK_CGROUP
-	default n
-	---help---
-	  Enable CFQ IO scheduling debugging in CFQ. Currently it makes
-	  blktrace output more verbose.
-
 choice
 	prompt "Default I/O scheduler"
 	default DEFAULT_CFQ
diff --git a/block/Makefile b/block/Makefile
index cb2d515ebd6e..0bb499a739cd 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
-			blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o
+			blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
 
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
 obj-$(CONFIG_BLK_CGROUP)	+= blk-cgroup.o
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 6d88544b677f..0d710c9d403b 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -286,26 +286,31 @@ static void bio_end_empty_barrier(struct bio *bio, int err)
 			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
 	}
-
+	if (bio->bi_private)
 		complete(bio->bi_private);
+	bio_put(bio);
 }
 
 /**
  * blkdev_issue_flush - queue a flush
  * @bdev:	blockdev to issue flush for
+ * @gfp_mask:	memory allocation flags (for bio_alloc)
  * @error_sector:	error sector
+ * @flags:	BLKDEV_IFL_* flags to control behaviour
  *
  * Description:
  *    Issue a flush for the block device in question. Caller can supply
  *    room for storing the error offset in case of a flush error, if they
- *    wish to.
+ *    wish to. If WAIT flag is not passed then caller may check only what
+ *    request was pushed in some internal queue for later handling.
  */
-int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
+int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
+		sector_t *error_sector, unsigned long flags)
 {
 	DECLARE_COMPLETION_ONSTACK(wait);
 	struct request_queue *q;
 	struct bio *bio;
-	int ret;
+	int ret = 0;
 
 	if (bdev->bd_disk == NULL)
 		return -ENXIO;
@@ -314,23 +319,25 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
 	if (!q)
 		return -ENXIO;
 
-	bio = bio_alloc(GFP_KERNEL, 0);
+	bio = bio_alloc(gfp_mask, 0);
 	bio->bi_end_io = bio_end_empty_barrier;
-	bio->bi_private = &wait;
 	bio->bi_bdev = bdev;
-	submit_bio(WRITE_BARRIER, bio);
-
-	wait_for_completion(&wait);
+	if (test_bit(BLKDEV_WAIT, &flags))
+		bio->bi_private = &wait;
 
-	/*
-	 * The driver must store the error location in ->bi_sector, if
-	 * it supports it. For non-stacked drivers, this should be copied
-	 * from blk_rq_pos(rq).
-	 */
-	if (error_sector)
-		*error_sector = bio->bi_sector;
+	bio_get(bio);
+	submit_bio(WRITE_BARRIER, bio);
+	if (test_bit(BLKDEV_WAIT, &flags)) {
+		wait_for_completion(&wait);
+		/*
+		 * The driver must store the error location in ->bi_sector, if
+		 * it supports it. For non-stacked drivers, this should be
+		 * copied from blk_rq_pos(rq).
+		 */
+		if (error_sector)
+			*error_sector = bio->bi_sector;
+	}
 
-	ret = 0;
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
 		ret = -EOPNOTSUPP;
 	else if (!bio_flagged(bio, BIO_UPTODATE))
@@ -340,107 +347,3 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
 	return ret;
 }
 EXPORT_SYMBOL(blkdev_issue_flush);
-
-static void blkdev_discard_end_io(struct bio *bio, int err)
-{
-	if (err) {
-		if (err == -EOPNOTSUPP)
-			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
-		clear_bit(BIO_UPTODATE, &bio->bi_flags);
-	}
-
-	if (bio->bi_private)
-		complete(bio->bi_private);
-	__free_page(bio_page(bio));
-
-	bio_put(bio);
-}
-
-/**
- * blkdev_issue_discard - queue a discard
- * @bdev:	blockdev to issue discard for
- * @sector:	start sector
- * @nr_sects:	number of sectors to discard
- * @gfp_mask:	memory allocation flags (for bio_alloc)
- * @flags:	DISCARD_FL_* flags to control behaviour
- *
- * Description:
- *    Issue a discard request for the sectors in question.
- */
-int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
-			 sector_t nr_sects, gfp_t gfp_mask, int flags)
-{
-	DECLARE_COMPLETION_ONSTACK(wait);
-	struct request_queue *q = bdev_get_queue(bdev);
-	int type = flags & DISCARD_FL_BARRIER ?
-		DISCARD_BARRIER : DISCARD_NOBARRIER;
-	struct bio *bio;
-	struct page *page;
-	int ret = 0;
-
-	if (!q)
-		return -ENXIO;
-
-	if (!blk_queue_discard(q))
-		return -EOPNOTSUPP;
-
-	while (nr_sects && !ret) {
-		unsigned int sector_size = q->limits.logical_block_size;
-		unsigned int max_discard_sectors =
-			min(q->limits.max_discard_sectors, UINT_MAX >> 9);
-
-		bio = bio_alloc(gfp_mask, 1);
-		if (!bio)
-			goto out;
-		bio->bi_sector = sector;
-		bio->bi_end_io = blkdev_discard_end_io;
-		bio->bi_bdev = bdev;
-		if (flags & DISCARD_FL_WAIT)
-			bio->bi_private = &wait;
-
-		/*
-		 * Add a zeroed one-sector payload as that's what
-		 * our current implementations need.  If we'll ever need
-		 * more the interface will need revisiting.
-		 */
-		page = alloc_page(gfp_mask | __GFP_ZERO);
-		if (!page)
-			goto out_free_bio;
-		if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size)
-			goto out_free_page;
-
-		/*
-		 * And override the bio size - the way discard works we
-		 * touch many more blocks on disk than the actual payload
-		 * length.
-		 */
-		if (nr_sects > max_discard_sectors) {
-			bio->bi_size = max_discard_sectors << 9;
-			nr_sects -= max_discard_sectors;
-			sector += max_discard_sectors;
-		} else {
-			bio->bi_size = nr_sects << 9;
-			nr_sects = 0;
-		}
-
-		bio_get(bio);
-		submit_bio(type, bio);
-
-		if (flags & DISCARD_FL_WAIT)
-			wait_for_completion(&wait);
-
-		if (bio_flagged(bio, BIO_EOPNOTSUPP))
-			ret = -EOPNOTSUPP;
-		else if (!bio_flagged(bio, BIO_UPTODATE))
-			ret = -EIO;
-		bio_put(bio);
-	}
-	return ret;
-out_free_page:
-	__free_page(page);
-out_free_bio:
-	bio_put(bio);
-out:
-	return -ENOMEM;
-}
-EXPORT_SYMBOL(blkdev_issue_discard);
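
blkdev_issue_discard() is not deleted outright here: per the Makefile hunk and the diffstat, it moves in generalized form to the new block/blk-lib.c, whose hunk is not reproduced in this excerpt. On the flush side, the filesystem callers listed in the diffstat (fs/ext3/fsync.c, fs/ext4/fsync.c, fs/jbd2/commit.c, etc.) have to adopt the new four-argument signature. A minimal sketch of such a caller, assuming the BLKDEV_IFL_WAIT flag referred to by the new @flags kernel-doc is defined in include/linux/blkdev.h as part of this series (that hunk is not shown):

	/* Sketch only: flush the device's write cache and wait for it,
	 * tolerating devices without barrier/flush support. */
	static int example_sync_flush(struct block_device *bdev)
	{
		int err;

		/* GFP_KERNEL: process context; NULL: the error sector is
		 * not of interest to this caller. */
		err = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
		if (err == -EOPNOTSUPP)
			err = 0;	/* no cache flush support, nothing to do */
		return err;
	}
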
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 5fe03def34b2..d02bbf88de13 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -15,8 +15,12 @@
15#include <linux/kdev_t.h> 15#include <linux/kdev_t.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/err.h> 17#include <linux/err.h>
18#include <linux/blkdev.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
19#include "blk-cgroup.h" 20#include "blk-cgroup.h"
21#include <linux/genhd.h>
22
23#define MAX_KEY_LEN 100
20 24
21static DEFINE_SPINLOCK(blkio_list_lock); 25static DEFINE_SPINLOCK(blkio_list_lock);
22static LIST_HEAD(blkio_list); 26static LIST_HEAD(blkio_list);
@@ -49,6 +53,32 @@ struct cgroup_subsys blkio_subsys = {
49}; 53};
50EXPORT_SYMBOL_GPL(blkio_subsys); 54EXPORT_SYMBOL_GPL(blkio_subsys);
51 55
56static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
57 struct blkio_policy_node *pn)
58{
59 list_add(&pn->node, &blkcg->policy_list);
60}
61
62/* Must be called with blkcg->lock held */
63static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
64{
65 list_del(&pn->node);
66}
67
68/* Must be called with blkcg->lock held */
69static struct blkio_policy_node *
70blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev)
71{
72 struct blkio_policy_node *pn;
73
74 list_for_each_entry(pn, &blkcg->policy_list, node) {
75 if (pn->dev == dev)
76 return pn;
77 }
78
79 return NULL;
80}
81
52struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) 82struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
53{ 83{
54 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), 84 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
@@ -56,13 +86,259 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
56} 86}
57EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); 87EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
58 88
59void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, 89/*
60 unsigned long time, unsigned long sectors) 90 * Add to the appropriate stat variable depending on the request type.
91 * This should be called with the blkg->stats_lock held.
92 */
93static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
94 bool sync)
95{
96 if (direction)
97 stat[BLKIO_STAT_WRITE] += add;
98 else
99 stat[BLKIO_STAT_READ] += add;
100 if (sync)
101 stat[BLKIO_STAT_SYNC] += add;
102 else
103 stat[BLKIO_STAT_ASYNC] += add;
104}
105
106/*
107 * Decrements the appropriate stat variable if non-zero depending on the
108 * request type. Panics on value being zero.
109 * This should be called with the blkg->stats_lock held.
110 */
111static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
112{
113 if (direction) {
114 BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
115 stat[BLKIO_STAT_WRITE]--;
116 } else {
117 BUG_ON(stat[BLKIO_STAT_READ] == 0);
118 stat[BLKIO_STAT_READ]--;
119 }
120 if (sync) {
121 BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
122 stat[BLKIO_STAT_SYNC]--;
123 } else {
124 BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
125 stat[BLKIO_STAT_ASYNC]--;
126 }
127}
128
129#ifdef CONFIG_DEBUG_BLK_CGROUP
130/* This should be called with the blkg->stats_lock held. */
131static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
132 struct blkio_group *curr_blkg)
133{
134 if (blkio_blkg_waiting(&blkg->stats))
135 return;
136 if (blkg == curr_blkg)
137 return;
138 blkg->stats.start_group_wait_time = sched_clock();
139 blkio_mark_blkg_waiting(&blkg->stats);
140}
141
142/* This should be called with the blkg->stats_lock held. */
143static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
144{
145 unsigned long long now;
146
147 if (!blkio_blkg_waiting(stats))
148 return;
149
150 now = sched_clock();
151 if (time_after64(now, stats->start_group_wait_time))
152 stats->group_wait_time += now - stats->start_group_wait_time;
153 blkio_clear_blkg_waiting(stats);
154}
155
156/* This should be called with the blkg->stats_lock held. */
157static void blkio_end_empty_time(struct blkio_group_stats *stats)
158{
159 unsigned long long now;
160
161 if (!blkio_blkg_empty(stats))
162 return;
163
164 now = sched_clock();
165 if (time_after64(now, stats->start_empty_time))
166 stats->empty_time += now - stats->start_empty_time;
167 blkio_clear_blkg_empty(stats);
168}
169
170void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
171{
172 unsigned long flags;
173
174 spin_lock_irqsave(&blkg->stats_lock, flags);
175 BUG_ON(blkio_blkg_idling(&blkg->stats));
176 blkg->stats.start_idle_time = sched_clock();
177 blkio_mark_blkg_idling(&blkg->stats);
178 spin_unlock_irqrestore(&blkg->stats_lock, flags);
179}
180EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
181
182void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
183{
184 unsigned long flags;
185 unsigned long long now;
186 struct blkio_group_stats *stats;
187
188 spin_lock_irqsave(&blkg->stats_lock, flags);
189 stats = &blkg->stats;
190 if (blkio_blkg_idling(stats)) {
191 now = sched_clock();
192 if (time_after64(now, stats->start_idle_time))
193 stats->idle_time += now - stats->start_idle_time;
194 blkio_clear_blkg_idling(stats);
195 }
196 spin_unlock_irqrestore(&blkg->stats_lock, flags);
197}
198EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
199
200void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
201{
202 unsigned long flags;
203 struct blkio_group_stats *stats;
204
205 spin_lock_irqsave(&blkg->stats_lock, flags);
206 stats = &blkg->stats;
207 stats->avg_queue_size_sum +=
208 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
209 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
210 stats->avg_queue_size_samples++;
211 blkio_update_group_wait_time(stats);
212 spin_unlock_irqrestore(&blkg->stats_lock, flags);
213}
214EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
215
216void blkiocg_set_start_empty_time(struct blkio_group *blkg)
217{
218 unsigned long flags;
219 struct blkio_group_stats *stats;
220
221 spin_lock_irqsave(&blkg->stats_lock, flags);
222 stats = &blkg->stats;
223
224 if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
225 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
226 spin_unlock_irqrestore(&blkg->stats_lock, flags);
227 return;
228 }
229
230 /*
231 * group is already marked empty. This can happen if cfqq got new
232 * request in parent group and moved to this group while being added
233 * to service tree. Just ignore the event and move on.
234 */
235 if(blkio_blkg_empty(stats)) {
236 spin_unlock_irqrestore(&blkg->stats_lock, flags);
237 return;
238 }
239
240 stats->start_empty_time = sched_clock();
241 blkio_mark_blkg_empty(stats);
242 spin_unlock_irqrestore(&blkg->stats_lock, flags);
243}
244EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
245
246void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
247 unsigned long dequeue)
248{
249 blkg->stats.dequeue += dequeue;
250}
251EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
252#else
253static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
254 struct blkio_group *curr_blkg) {}
255static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
256#endif
257
258void blkiocg_update_io_add_stats(struct blkio_group *blkg,
259 struct blkio_group *curr_blkg, bool direction,
260 bool sync)
261{
262 unsigned long flags;
263
264 spin_lock_irqsave(&blkg->stats_lock, flags);
265 blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
266 sync);
267 blkio_end_empty_time(&blkg->stats);
268 blkio_set_start_group_wait_time(blkg, curr_blkg);
269 spin_unlock_irqrestore(&blkg->stats_lock, flags);
270}
271EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
272
273void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
274 bool direction, bool sync)
275{
276 unsigned long flags;
277
278 spin_lock_irqsave(&blkg->stats_lock, flags);
279 blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
280 direction, sync);
281 spin_unlock_irqrestore(&blkg->stats_lock, flags);
282}
283EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
284
285void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
286{
287 unsigned long flags;
288
289 spin_lock_irqsave(&blkg->stats_lock, flags);
290 blkg->stats.time += time;
291 spin_unlock_irqrestore(&blkg->stats_lock, flags);
292}
293EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
294
295void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
296 uint64_t bytes, bool direction, bool sync)
297{
298 struct blkio_group_stats *stats;
299 unsigned long flags;
300
301 spin_lock_irqsave(&blkg->stats_lock, flags);
302 stats = &blkg->stats;
303 stats->sectors += bytes >> 9;
304 blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
305 sync);
306 blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
307 direction, sync);
308 spin_unlock_irqrestore(&blkg->stats_lock, flags);
309}
310EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
311
312void blkiocg_update_completion_stats(struct blkio_group *blkg,
313 uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
314{
315 struct blkio_group_stats *stats;
316 unsigned long flags;
317 unsigned long long now = sched_clock();
318
319 spin_lock_irqsave(&blkg->stats_lock, flags);
320 stats = &blkg->stats;
321 if (time_after64(now, io_start_time))
322 blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
323 now - io_start_time, direction, sync);
324 if (time_after64(io_start_time, start_time))
325 blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
326 io_start_time - start_time, direction, sync);
327 spin_unlock_irqrestore(&blkg->stats_lock, flags);
328}
329EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
330
331void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
332 bool sync)
61{ 333{
62 blkg->time += time; 334 unsigned long flags;
63 blkg->sectors += sectors; 335
336 spin_lock_irqsave(&blkg->stats_lock, flags);
337 blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction,
338 sync);
339 spin_unlock_irqrestore(&blkg->stats_lock, flags);
64} 340}
65EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats); 341EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
66 342
67void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 343void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
68 struct blkio_group *blkg, void *key, dev_t dev) 344 struct blkio_group *blkg, void *key, dev_t dev)
@@ -70,14 +346,13 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
70 unsigned long flags; 346 unsigned long flags;
71 347
72 spin_lock_irqsave(&blkcg->lock, flags); 348 spin_lock_irqsave(&blkcg->lock, flags);
349 spin_lock_init(&blkg->stats_lock);
73 rcu_assign_pointer(blkg->key, key); 350 rcu_assign_pointer(blkg->key, key);
74 blkg->blkcg_id = css_id(&blkcg->css); 351 blkg->blkcg_id = css_id(&blkcg->css);
75 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); 352 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
76 spin_unlock_irqrestore(&blkcg->lock, flags); 353 spin_unlock_irqrestore(&blkcg->lock, flags);
77#ifdef CONFIG_DEBUG_BLK_CGROUP
78 /* Need to take css reference ? */ 354 /* Need to take css reference ? */
79 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); 355 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
80#endif
81 blkg->dev = dev; 356 blkg->dev = dev;
82} 357}
83EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); 358EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
@@ -154,6 +429,7 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
154 struct blkio_group *blkg; 429 struct blkio_group *blkg;
155 struct hlist_node *n; 430 struct hlist_node *n;
156 struct blkio_policy_type *blkiop; 431 struct blkio_policy_type *blkiop;
432 struct blkio_policy_node *pn;
157 433
158 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) 434 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
159 return -EINVAL; 435 return -EINVAL;
@@ -162,7 +438,13 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
162 spin_lock(&blkio_list_lock); 438 spin_lock(&blkio_list_lock);
163 spin_lock_irq(&blkcg->lock); 439 spin_lock_irq(&blkcg->lock);
164 blkcg->weight = (unsigned int)val; 440 blkcg->weight = (unsigned int)val;
441
165 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { 442 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
443 pn = blkio_policy_search_node(blkcg, blkg->dev);
444
445 if (pn)
446 continue;
447
166 list_for_each_entry(blkiop, &blkio_list, list) 448 list_for_each_entry(blkiop, &blkio_list, list)
167 blkiop->ops.blkio_update_group_weight_fn(blkg, 449 blkiop->ops.blkio_update_group_weight_fn(blkg,
168 blkcg->weight); 450 blkcg->weight);
@@ -172,13 +454,154 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
172 return 0; 454 return 0;
173} 455}
174 456
175#define SHOW_FUNCTION_PER_GROUP(__VAR) \ 457static int
458blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
459{
460 struct blkio_cgroup *blkcg;
461 struct blkio_group *blkg;
462 struct blkio_group_stats *stats;
463 struct hlist_node *n;
464 uint64_t queued[BLKIO_STAT_TOTAL];
465 int i;
466#ifdef CONFIG_DEBUG_BLK_CGROUP
467 bool idling, waiting, empty;
468 unsigned long long now = sched_clock();
469#endif
470
471 blkcg = cgroup_to_blkio_cgroup(cgroup);
472 spin_lock_irq(&blkcg->lock);
473 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
474 spin_lock(&blkg->stats_lock);
475 stats = &blkg->stats;
476#ifdef CONFIG_DEBUG_BLK_CGROUP
477 idling = blkio_blkg_idling(stats);
478 waiting = blkio_blkg_waiting(stats);
479 empty = blkio_blkg_empty(stats);
480#endif
481 for (i = 0; i < BLKIO_STAT_TOTAL; i++)
482 queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
483 memset(stats, 0, sizeof(struct blkio_group_stats));
484 for (i = 0; i < BLKIO_STAT_TOTAL; i++)
485 stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
486#ifdef CONFIG_DEBUG_BLK_CGROUP
487 if (idling) {
488 blkio_mark_blkg_idling(stats);
489 stats->start_idle_time = now;
490 }
491 if (waiting) {
492 blkio_mark_blkg_waiting(stats);
493 stats->start_group_wait_time = now;
494 }
495 if (empty) {
496 blkio_mark_blkg_empty(stats);
497 stats->start_empty_time = now;
498 }
499#endif
500 spin_unlock(&blkg->stats_lock);
501 }
502 spin_unlock_irq(&blkcg->lock);
503 return 0;
504}
505
506static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
507 int chars_left, bool diskname_only)
508{
509 snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
510 chars_left -= strlen(str);
511 if (chars_left <= 0) {
512 printk(KERN_WARNING
513 "Possibly incorrect cgroup stat display format");
514 return;
515 }
516 if (diskname_only)
517 return;
518 switch (type) {
519 case BLKIO_STAT_READ:
520 strlcat(str, " Read", chars_left);
521 break;
522 case BLKIO_STAT_WRITE:
523 strlcat(str, " Write", chars_left);
524 break;
525 case BLKIO_STAT_SYNC:
526 strlcat(str, " Sync", chars_left);
527 break;
528 case BLKIO_STAT_ASYNC:
529 strlcat(str, " Async", chars_left);
530 break;
531 case BLKIO_STAT_TOTAL:
532 strlcat(str, " Total", chars_left);
533 break;
534 default:
535 strlcat(str, " Invalid", chars_left);
536 }
537}
538
539static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
540 struct cgroup_map_cb *cb, dev_t dev)
541{
542 blkio_get_key_name(0, dev, str, chars_left, true);
543 cb->fill(cb, str, val);
544 return val;
545}
546
547/* This should be called with blkg->stats_lock held */
548static uint64_t blkio_get_stat(struct blkio_group *blkg,
549 struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
550{
551 uint64_t disk_total;
552 char key_str[MAX_KEY_LEN];
553 enum stat_sub_type sub_type;
554
555 if (type == BLKIO_STAT_TIME)
556 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
557 blkg->stats.time, cb, dev);
558 if (type == BLKIO_STAT_SECTORS)
559 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
560 blkg->stats.sectors, cb, dev);
561#ifdef CONFIG_DEBUG_BLK_CGROUP
562 if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
563 uint64_t sum = blkg->stats.avg_queue_size_sum;
564 uint64_t samples = blkg->stats.avg_queue_size_samples;
565 if (samples)
566 do_div(sum, samples);
567 else
568 sum = 0;
569 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
570 }
571 if (type == BLKIO_STAT_GROUP_WAIT_TIME)
572 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
573 blkg->stats.group_wait_time, cb, dev);
574 if (type == BLKIO_STAT_IDLE_TIME)
575 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
576 blkg->stats.idle_time, cb, dev);
577 if (type == BLKIO_STAT_EMPTY_TIME)
578 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
579 blkg->stats.empty_time, cb, dev);
580 if (type == BLKIO_STAT_DEQUEUE)
581 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
582 blkg->stats.dequeue, cb, dev);
583#endif
584
585 for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
586 sub_type++) {
587 blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
588 cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
589 }
590 disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
591 blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
592 blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
593 cb->fill(cb, key_str, disk_total);
594 return disk_total;
595}
596
597#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total) \
176static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ 598static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \
177 struct cftype *cftype, struct seq_file *m) \ 599 struct cftype *cftype, struct cgroup_map_cb *cb) \
178{ \ 600{ \
179 struct blkio_cgroup *blkcg; \ 601 struct blkio_cgroup *blkcg; \
180 struct blkio_group *blkg; \ 602 struct blkio_group *blkg; \
181 struct hlist_node *n; \ 603 struct hlist_node *n; \
604 uint64_t cgroup_total = 0; \
182 \ 605 \
183 if (!cgroup_lock_live_group(cgroup)) \ 606 if (!cgroup_lock_live_group(cgroup)) \
184 return -ENODEV; \ 607 return -ENODEV; \
@@ -186,50 +609,295 @@ static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \
186 blkcg = cgroup_to_blkio_cgroup(cgroup); \ 609 blkcg = cgroup_to_blkio_cgroup(cgroup); \
187 rcu_read_lock(); \ 610 rcu_read_lock(); \
188 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ 611 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
189 if (blkg->dev) \ 612 if (blkg->dev) { \
190 seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \ 613 spin_lock_irq(&blkg->stats_lock); \
191 MINOR(blkg->dev), blkg->__VAR); \ 614 cgroup_total += blkio_get_stat(blkg, cb, \
615 blkg->dev, type); \
616 spin_unlock_irq(&blkg->stats_lock); \
617 } \
192 } \ 618 } \
619 if (show_total) \
620 cb->fill(cb, "Total", cgroup_total); \
193 rcu_read_unlock(); \ 621 rcu_read_unlock(); \
194 cgroup_unlock(); \ 622 cgroup_unlock(); \
195 return 0; \ 623 return 0; \
196} 624}
197 625
198SHOW_FUNCTION_PER_GROUP(time); 626SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0);
199SHOW_FUNCTION_PER_GROUP(sectors); 627SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0);
628SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1);
629SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1);
630SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1);
631SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1);
632SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1);
633SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1);
200#ifdef CONFIG_DEBUG_BLK_CGROUP 634#ifdef CONFIG_DEBUG_BLK_CGROUP
201SHOW_FUNCTION_PER_GROUP(dequeue); 635SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
636SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0);
637SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0);
638SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0);
639SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0);
202#endif 640#endif
203#undef SHOW_FUNCTION_PER_GROUP 641#undef SHOW_FUNCTION_PER_GROUP
204 642
205#ifdef CONFIG_DEBUG_BLK_CGROUP 643static int blkio_check_dev_num(dev_t dev)
206void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
207 unsigned long dequeue)
208{ 644{
209 blkg->dequeue += dequeue; 645 int part = 0;
646 struct gendisk *disk;
647
648 disk = get_gendisk(dev, &part);
649 if (!disk || part)
650 return -ENODEV;
651
652 return 0;
653}
654
655static int blkio_policy_parse_and_set(char *buf,
656 struct blkio_policy_node *newpn)
657{
658 char *s[4], *p, *major_s = NULL, *minor_s = NULL;
659 int ret;
660 unsigned long major, minor, temp;
661 int i = 0;
662 dev_t dev;
663
664 memset(s, 0, sizeof(s));
665
666 while ((p = strsep(&buf, " ")) != NULL) {
667 if (!*p)
668 continue;
669
670 s[i++] = p;
671
672		/* Prevent inputting too many things */
673 if (i == 3)
674 break;
675 }
676
677 if (i != 2)
678 return -EINVAL;
679
680 p = strsep(&s[0], ":");
681 if (p != NULL)
682 major_s = p;
683 else
684 return -EINVAL;
685
686 minor_s = s[0];
687 if (!minor_s)
688 return -EINVAL;
689
690 ret = strict_strtoul(major_s, 10, &major);
691 if (ret)
692 return -EINVAL;
693
694 ret = strict_strtoul(minor_s, 10, &minor);
695 if (ret)
696 return -EINVAL;
697
698 dev = MKDEV(major, minor);
699
700 ret = blkio_check_dev_num(dev);
701 if (ret)
702 return ret;
703
704 newpn->dev = dev;
705
706 if (s[1] == NULL)
707 return -EINVAL;
708
709 ret = strict_strtoul(s[1], 10, &temp);
710 if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
711 temp > BLKIO_WEIGHT_MAX)
712 return -EINVAL;
713
714 newpn->weight = temp;
715
716 return 0;
717}
718
719unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
720 dev_t dev)
721{
722 struct blkio_policy_node *pn;
723
724 pn = blkio_policy_search_node(blkcg, dev);
725 if (pn)
726 return pn->weight;
727 else
728 return blkcg->weight;
729}
730EXPORT_SYMBOL_GPL(blkcg_get_weight);
731
732
733static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
734 const char *buffer)
735{
736 int ret = 0;
737 char *buf;
738 struct blkio_policy_node *newpn, *pn;
739 struct blkio_cgroup *blkcg;
740 struct blkio_group *blkg;
741 int keep_newpn = 0;
742 struct hlist_node *n;
743 struct blkio_policy_type *blkiop;
744
745 buf = kstrdup(buffer, GFP_KERNEL);
746 if (!buf)
747 return -ENOMEM;
748
749 newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
750 if (!newpn) {
751 ret = -ENOMEM;
752 goto free_buf;
753 }
754
755 ret = blkio_policy_parse_and_set(buf, newpn);
756 if (ret)
757 goto free_newpn;
758
759 blkcg = cgroup_to_blkio_cgroup(cgrp);
760
761 spin_lock_irq(&blkcg->lock);
762
763 pn = blkio_policy_search_node(blkcg, newpn->dev);
764 if (!pn) {
765 if (newpn->weight != 0) {
766 blkio_policy_insert_node(blkcg, newpn);
767 keep_newpn = 1;
768 }
769 spin_unlock_irq(&blkcg->lock);
770 goto update_io_group;
771 }
772
773 if (newpn->weight == 0) {
774		/* weight == 0 means deleting a specific weight */
775 blkio_policy_delete_node(pn);
776 spin_unlock_irq(&blkcg->lock);
777 goto update_io_group;
778 }
779 spin_unlock_irq(&blkcg->lock);
780
781 pn->weight = newpn->weight;
782
783update_io_group:
784 /* update weight for each cfqg */
785 spin_lock(&blkio_list_lock);
786 spin_lock_irq(&blkcg->lock);
787
788 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
789 if (newpn->dev == blkg->dev) {
790 list_for_each_entry(blkiop, &blkio_list, list)
791 blkiop->ops.blkio_update_group_weight_fn(blkg,
792 newpn->weight ?
793 newpn->weight :
794 blkcg->weight);
795 }
796 }
797
798 spin_unlock_irq(&blkcg->lock);
799 spin_unlock(&blkio_list_lock);
800
801free_newpn:
802 if (!keep_newpn)
803 kfree(newpn);
804free_buf:
805 kfree(buf);
806 return ret;
807}
808
809static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
810 struct seq_file *m)
811{
812 struct blkio_cgroup *blkcg;
813 struct blkio_policy_node *pn;
814
815 seq_printf(m, "dev\tweight\n");
816
817 blkcg = cgroup_to_blkio_cgroup(cgrp);
818 if (list_empty(&blkcg->policy_list))
819 goto out;
820
821 spin_lock_irq(&blkcg->lock);
822 list_for_each_entry(pn, &blkcg->policy_list, node) {
823 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
824 MINOR(pn->dev), pn->weight);
825 }
826 spin_unlock_irq(&blkcg->lock);
827
828out:
829 return 0;
210} 830}
211EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats);
212#endif
213 831
214struct cftype blkio_files[] = { 832struct cftype blkio_files[] = {
215 { 833 {
834 .name = "weight_device",
835 .read_seq_string = blkiocg_weight_device_read,
836 .write_string = blkiocg_weight_device_write,
837 .max_write_len = 256,
838 },
839 {
216 .name = "weight", 840 .name = "weight",
217 .read_u64 = blkiocg_weight_read, 841 .read_u64 = blkiocg_weight_read,
218 .write_u64 = blkiocg_weight_write, 842 .write_u64 = blkiocg_weight_write,
219 }, 843 },
220 { 844 {
221 .name = "time", 845 .name = "time",
222 .read_seq_string = blkiocg_time_read, 846 .read_map = blkiocg_time_read,
223 }, 847 },
224 { 848 {
225 .name = "sectors", 849 .name = "sectors",
226 .read_seq_string = blkiocg_sectors_read, 850 .read_map = blkiocg_sectors_read,
851 },
852 {
853 .name = "io_service_bytes",
854 .read_map = blkiocg_io_service_bytes_read,
855 },
856 {
857 .name = "io_serviced",
858 .read_map = blkiocg_io_serviced_read,
859 },
860 {
861 .name = "io_service_time",
862 .read_map = blkiocg_io_service_time_read,
863 },
864 {
865 .name = "io_wait_time",
866 .read_map = blkiocg_io_wait_time_read,
867 },
868 {
869 .name = "io_merged",
870 .read_map = blkiocg_io_merged_read,
871 },
872 {
873 .name = "io_queued",
874 .read_map = blkiocg_io_queued_read,
875 },
876 {
877 .name = "reset_stats",
878 .write_u64 = blkiocg_reset_stats,
227 }, 879 },
228#ifdef CONFIG_DEBUG_BLK_CGROUP 880#ifdef CONFIG_DEBUG_BLK_CGROUP
229 { 881 {
882 .name = "avg_queue_size",
883 .read_map = blkiocg_avg_queue_size_read,
884 },
885 {
886 .name = "group_wait_time",
887 .read_map = blkiocg_group_wait_time_read,
888 },
889 {
890 .name = "idle_time",
891 .read_map = blkiocg_idle_time_read,
892 },
893 {
894 .name = "empty_time",
895 .read_map = blkiocg_empty_time_read,
896 },
897 {
230 .name = "dequeue", 898 .name = "dequeue",
231 .read_seq_string = blkiocg_dequeue_read, 899 .read_map = blkiocg_dequeue_read,
232 }, 900 },
233#endif 901#endif
234}; 902};
235 903
@@ -246,6 +914,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
246 struct blkio_group *blkg; 914 struct blkio_group *blkg;
247 void *key; 915 void *key;
248 struct blkio_policy_type *blkiop; 916 struct blkio_policy_type *blkiop;
917 struct blkio_policy_node *pn, *pntmp;
249 918
250 rcu_read_lock(); 919 rcu_read_lock();
251remove_entry: 920remove_entry:
@@ -276,7 +945,12 @@ remove_entry:
276 blkiop->ops.blkio_unlink_group_fn(key, blkg); 945 blkiop->ops.blkio_unlink_group_fn(key, blkg);
277 spin_unlock(&blkio_list_lock); 946 spin_unlock(&blkio_list_lock);
278 goto remove_entry; 947 goto remove_entry;
948
279done: 949done:
950 list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
951 blkio_policy_delete_node(pn);
952 kfree(pn);
953 }
280 free_css_id(&blkio_subsys, &blkcg->css); 954 free_css_id(&blkio_subsys, &blkcg->css);
281 rcu_read_unlock(); 955 rcu_read_unlock();
282 if (blkcg != &blkio_root_cgroup) 956 if (blkcg != &blkio_root_cgroup)
@@ -307,6 +981,7 @@ done:
307 spin_lock_init(&blkcg->lock); 981 spin_lock_init(&blkcg->lock);
308 INIT_HLIST_HEAD(&blkcg->blkg_list); 982 INIT_HLIST_HEAD(&blkcg->blkg_list);
309 983
984 INIT_LIST_HEAD(&blkcg->policy_list);
310 return &blkcg->css; 985 return &blkcg->css;
311} 986}
312 987
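
The blkiocg_update_*() hooks exported above are meant to be driven by the IO scheduler; the cfq-iosched.c hunk that wires them up is listed in the diffstat but not reproduced here. A rough timeline sketch of the intended call pattern follows. The rq_start_time_ns()/rq_io_start_time_ns() accessors are assumed to come from the include/linux/blkdev.h part of this series (not shown), and example_account_request() is purely illustrative:

	/* Sketch: when each stat hook fires over the life of one request. */
	static void example_account_request(struct blkio_group *blkg,
					    struct blkio_group *curr_blkg,
					    struct request *rq)
	{
		bool direction = rq_data_dir(rq);	/* non-zero for writes */
		bool sync = rq_is_sync(rq);

		/* 1. request enters the scheduler queue (io_queued, group wait) */
		blkiocg_update_io_add_stats(blkg, curr_blkg, direction, sync);

		/* 2. request is dispatched to the driver (io_serviced,
		 *    io_service_bytes, sectors) */
		blkiocg_update_io_remove_stats(blkg, direction, sync);
		blkiocg_update_dispatch_stats(blkg, blk_rq_bytes(rq), direction, sync);

		/* 3. request completes: io_wait_time = dispatch - queue,
		 *    io_service_time = completion - dispatch */
		blkiocg_update_completion_stats(blkg, rq_start_time_ns(rq),
						rq_io_start_time_ns(rq),
						direction, sync);
	}
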
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 8ccc20464dae..2b866ec1dcea 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -23,11 +23,84 @@ extern struct cgroup_subsys blkio_subsys;
23#define blkio_subsys_id blkio_subsys.subsys_id 23#define blkio_subsys_id blkio_subsys.subsys_id
24#endif 24#endif
25 25
26enum stat_type {
27 /* Total time spent (in ns) between request dispatch to the driver and
28	 * request completion for IOs done by this cgroup. This may not be
29 * accurate when NCQ is turned on. */
30 BLKIO_STAT_SERVICE_TIME = 0,
31 /* Total bytes transferred */
32 BLKIO_STAT_SERVICE_BYTES,
33 /* Total IOs serviced, post merge */
34 BLKIO_STAT_SERVICED,
35 /* Total time spent waiting in scheduler queue in ns */
36 BLKIO_STAT_WAIT_TIME,
37 /* Number of IOs merged */
38 BLKIO_STAT_MERGED,
39 /* Number of IOs queued up */
40 BLKIO_STAT_QUEUED,
41 /* All the single valued stats go below this */
42 BLKIO_STAT_TIME,
43 BLKIO_STAT_SECTORS,
44#ifdef CONFIG_DEBUG_BLK_CGROUP
45 BLKIO_STAT_AVG_QUEUE_SIZE,
46 BLKIO_STAT_IDLE_TIME,
47 BLKIO_STAT_EMPTY_TIME,
48 BLKIO_STAT_GROUP_WAIT_TIME,
49 BLKIO_STAT_DEQUEUE
50#endif
51};
52
53enum stat_sub_type {
54 BLKIO_STAT_READ = 0,
55 BLKIO_STAT_WRITE,
56 BLKIO_STAT_SYNC,
57 BLKIO_STAT_ASYNC,
58 BLKIO_STAT_TOTAL
59};
60
61/* blkg state flags */
62enum blkg_state_flags {
63 BLKG_waiting = 0,
64 BLKG_idling,
65 BLKG_empty,
66};
67
26struct blkio_cgroup { 68struct blkio_cgroup {
27 struct cgroup_subsys_state css; 69 struct cgroup_subsys_state css;
28 unsigned int weight; 70 unsigned int weight;
29 spinlock_t lock; 71 spinlock_t lock;
30 struct hlist_head blkg_list; 72 struct hlist_head blkg_list;
73 struct list_head policy_list; /* list of blkio_policy_node */
74};
75
76struct blkio_group_stats {
77 /* total disk time and nr sectors dispatched by this group */
78 uint64_t time;
79 uint64_t sectors;
80 uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL];
81#ifdef CONFIG_DEBUG_BLK_CGROUP
82 /* Sum of number of IOs queued across all samples */
83 uint64_t avg_queue_size_sum;
84 /* Count of samples taken for average */
85 uint64_t avg_queue_size_samples;
86 /* How many times this group has been removed from service tree */
87 unsigned long dequeue;
88
89 /* Total time spent waiting for it to be assigned a timeslice. */
90 uint64_t group_wait_time;
91 uint64_t start_group_wait_time;
92
93 /* Time spent idling for this blkio_group */
94 uint64_t idle_time;
95 uint64_t start_idle_time;
96 /*
97 * Total time when we have requests queued and do not contain the
98 * current active queue.
99 */
100 uint64_t empty_time;
101 uint64_t start_empty_time;
102 uint16_t flags;
103#endif
31}; 104};
32 105
33struct blkio_group { 106struct blkio_group {
@@ -35,20 +108,25 @@ struct blkio_group {
35 void *key; 108 void *key;
36 struct hlist_node blkcg_node; 109 struct hlist_node blkcg_node;
37 unsigned short blkcg_id; 110 unsigned short blkcg_id;
38#ifdef CONFIG_DEBUG_BLK_CGROUP
39 /* Store cgroup path */ 111 /* Store cgroup path */
40 char path[128]; 112 char path[128];
41 /* How many times this group has been removed from service tree */
42 unsigned long dequeue;
43#endif
44 /* The device MKDEV(major, minor), this group has been created for */ 113 /* The device MKDEV(major, minor), this group has been created for */
45 dev_t dev; 114 dev_t dev;
46 115
47 /* total disk time and nr sectors dispatched by this group */ 116 /* Need to serialize the stats in the case of reset/update */
48 unsigned long time; 117 spinlock_t stats_lock;
49 unsigned long sectors; 118 struct blkio_group_stats stats;
50}; 119};
51 120
121struct blkio_policy_node {
122 struct list_head node;
123 dev_t dev;
124 unsigned int weight;
125};
126
127extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
128 dev_t dev);
129
52typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); 130typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
53typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, 131typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
54 unsigned int weight); 132 unsigned int weight);
@@ -67,6 +145,11 @@ struct blkio_policy_type {
67extern void blkio_policy_register(struct blkio_policy_type *); 145extern void blkio_policy_register(struct blkio_policy_type *);
68extern void blkio_policy_unregister(struct blkio_policy_type *); 146extern void blkio_policy_unregister(struct blkio_policy_type *);
69 147
148static inline char *blkg_path(struct blkio_group *blkg)
149{
150 return blkg->path;
151}
152
70#else 153#else
71 154
72struct blkio_group { 155struct blkio_group {
@@ -78,6 +161,8 @@ struct blkio_policy_type {
78static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { } 161static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
79static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } 162static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
80 163
164static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
165
81#endif 166#endif
82 167
83#define BLKIO_WEIGHT_MIN 100 168#define BLKIO_WEIGHT_MIN 100
@@ -85,16 +170,42 @@ static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
85#define BLKIO_WEIGHT_DEFAULT 500 170#define BLKIO_WEIGHT_DEFAULT 500
86 171
87#ifdef CONFIG_DEBUG_BLK_CGROUP 172#ifdef CONFIG_DEBUG_BLK_CGROUP
88static inline char *blkg_path(struct blkio_group *blkg) 173void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg);
89{ 174void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
90 return blkg->path;
91}
92void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
93 unsigned long dequeue); 175 unsigned long dequeue);
176void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg);
177void blkiocg_update_idle_time_stats(struct blkio_group *blkg);
178void blkiocg_set_start_empty_time(struct blkio_group *blkg);
179
180#define BLKG_FLAG_FNS(name) \
181static inline void blkio_mark_blkg_##name( \
182 struct blkio_group_stats *stats) \
183{ \
184 stats->flags |= (1 << BLKG_##name); \
185} \
186static inline void blkio_clear_blkg_##name( \
187 struct blkio_group_stats *stats) \
188{ \
189 stats->flags &= ~(1 << BLKG_##name); \
190} \
191static inline int blkio_blkg_##name(struct blkio_group_stats *stats) \
192{ \
193 return (stats->flags & (1 << BLKG_##name)) != 0; \
194} \
195
196BLKG_FLAG_FNS(waiting)
197BLKG_FLAG_FNS(idling)
198BLKG_FLAG_FNS(empty)
199#undef BLKG_FLAG_FNS
94#else 200#else
95static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } 201static inline void blkiocg_update_avg_queue_size_stats(
96static inline void blkiocg_update_blkio_group_dequeue_stats( 202 struct blkio_group *blkg) {}
97 struct blkio_group *blkg, unsigned long dequeue) {} 203static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
204 unsigned long dequeue) {}
205static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
206{}
207static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {}
208static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
98#endif 209#endif
99 210
100#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) 211#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
@@ -105,26 +216,43 @@ extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
105extern int blkiocg_del_blkio_group(struct blkio_group *blkg); 216extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
106extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, 217extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
107 void *key); 218 void *key);
108void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, 219void blkiocg_update_timeslice_used(struct blkio_group *blkg,
109 unsigned long time, unsigned long sectors); 220 unsigned long time);
221void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes,
222 bool direction, bool sync);
223void blkiocg_update_completion_stats(struct blkio_group *blkg,
224 uint64_t start_time, uint64_t io_start_time, bool direction, bool sync);
225void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
226 bool sync);
227void blkiocg_update_io_add_stats(struct blkio_group *blkg,
228 struct blkio_group *curr_blkg, bool direction, bool sync);
229void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
230 bool direction, bool sync);
110#else 231#else
111struct cgroup; 232struct cgroup;
112static inline struct blkio_cgroup * 233static inline struct blkio_cgroup *
113cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } 234cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
114 235
115static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 236static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
116 struct blkio_group *blkg, void *key, dev_t dev) 237 struct blkio_group *blkg, void *key, dev_t dev) {}
117{
118}
119 238
120static inline int 239static inline int
121blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } 240blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
122 241
123static inline struct blkio_group * 242static inline struct blkio_group *
124blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } 243blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
125static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, 244static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
126 unsigned long time, unsigned long sectors) 245 unsigned long time) {}
127{ 246static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
128} 247 uint64_t bytes, bool direction, bool sync) {}
248static inline void blkiocg_update_completion_stats(struct blkio_group *blkg,
249 uint64_t start_time, uint64_t io_start_time, bool direction,
250 bool sync) {}
251static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
252 bool direction, bool sync) {}
253static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg,
254 struct blkio_group *curr_blkg, bool direction, bool sync) {}
255static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
256 bool direction, bool sync) {}
129#endif 257#endif
130#endif /* _BLK_CGROUP_H */ 258#endif /* _BLK_CGROUP_H */
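
The BLKG_FLAG_FNS() macro above stamps out a set/clear/test helper triple for every per-group stat flag instead of hand-writing nine nearly identical functions, keeping the helpers in lockstep with the matching BLKG_* flag enum (not shown in this hunk). A minimal userspace sketch of the same token-pasting pattern, with illustrative names rather than the kernel's, that compiles standalone:

#include <stdio.h>

struct group_stats {
	unsigned long flags;
};

enum { STAT_waiting, STAT_idling, STAT_empty };

/* Stamp out mark/clear/test helpers for one flag bit, kernel-style. */
#define STAT_FLAG_FNS(name)						\
static inline void mark_##name(struct group_stats *stats)		\
{									\
	stats->flags |= (1UL << STAT_##name);				\
}									\
static inline void clear_##name(struct group_stats *stats)		\
{									\
	stats->flags &= ~(1UL << STAT_##name);				\
}									\
static inline int is_##name(struct group_stats *stats)			\
{									\
	return (stats->flags & (1UL << STAT_##name)) != 0;		\
}

STAT_FLAG_FNS(waiting)
STAT_FLAG_FNS(idling)
STAT_FLAG_FNS(empty)
#undef STAT_FLAG_FNS

int main(void)
{
	struct group_stats stats = { 0 };

	mark_waiting(&stats);
	printf("waiting=%d idling=%d\n", is_waiting(&stats), is_idling(&stats));
	clear_waiting(&stats);
	printf("waiting=%d\n", is_waiting(&stats));
	return 0;
}

Each STAT_FLAG_FNS(x) invocation expands the same way, which is exactly why the header can add or drop a flag by touching two lines.
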
diff --git a/block/blk-core.c b/block/blk-core.c
index 9fe174dc74d1..e9a5ae25db8c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -127,6 +127,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
127 rq->tag = -1; 127 rq->tag = -1;
128 rq->ref_count = 1; 128 rq->ref_count = 1;
129 rq->start_time = jiffies; 129 rq->start_time = jiffies;
130 set_start_time_ns(rq);
130} 131}
131EXPORT_SYMBOL(blk_rq_init); 132EXPORT_SYMBOL(blk_rq_init);
132 133
@@ -450,6 +451,7 @@ void blk_cleanup_queue(struct request_queue *q)
450 */ 451 */
451 blk_sync_queue(q); 452 blk_sync_queue(q);
452 453
454 del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
453 mutex_lock(&q->sysfs_lock); 455 mutex_lock(&q->sysfs_lock);
454 queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); 456 queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
455 mutex_unlock(&q->sysfs_lock); 457 mutex_unlock(&q->sysfs_lock);
@@ -510,6 +512,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
510 return NULL; 512 return NULL;
511 } 513 }
512 514
515 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
516 laptop_mode_timer_fn, (unsigned long) q);
513 init_timer(&q->unplug_timer); 517 init_timer(&q->unplug_timer);
514 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); 518 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
515 INIT_LIST_HEAD(&q->timeout_list); 519 INIT_LIST_HEAD(&q->timeout_list);
@@ -1198,6 +1202,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1198 if (!blk_rq_cpu_valid(req)) 1202 if (!blk_rq_cpu_valid(req))
1199 req->cpu = bio->bi_comp_cpu; 1203 req->cpu = bio->bi_comp_cpu;
1200 drive_stat_acct(req, 0); 1204 drive_stat_acct(req, 0);
1205 elv_bio_merged(q, req, bio);
1201 if (!attempt_back_merge(q, req)) 1206 if (!attempt_back_merge(q, req))
1202 elv_merged_request(q, req, el_ret); 1207 elv_merged_request(q, req, el_ret);
1203 goto out; 1208 goto out;
@@ -1231,6 +1236,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1231 if (!blk_rq_cpu_valid(req)) 1236 if (!blk_rq_cpu_valid(req))
1232 req->cpu = bio->bi_comp_cpu; 1237 req->cpu = bio->bi_comp_cpu;
1233 drive_stat_acct(req, 0); 1238 drive_stat_acct(req, 0);
1239 elv_bio_merged(q, req, bio);
1234 if (!attempt_front_merge(q, req)) 1240 if (!attempt_front_merge(q, req))
1235 elv_merged_request(q, req, el_ret); 1241 elv_merged_request(q, req, el_ret);
1236 goto out; 1242 goto out;
@@ -1855,8 +1861,10 @@ void blk_dequeue_request(struct request *rq)
1855 * and to it is freed is accounted as io that is in progress at 1861 * and to it is freed is accounted as io that is in progress at
1856 * the driver side. 1862 * the driver side.
1857 */ 1863 */
1858 if (blk_account_rq(rq)) 1864 if (blk_account_rq(rq)) {
1859 q->in_flight[rq_is_sync(rq)]++; 1865 q->in_flight[rq_is_sync(rq)]++;
1866 set_io_start_time_ns(rq);
1867 }
1860} 1868}
1861 1869
1862/** 1870/**
@@ -2098,7 +2106,7 @@ static void blk_finish_request(struct request *req, int error)
2098 BUG_ON(blk_queued_rq(req)); 2106 BUG_ON(blk_queued_rq(req));
2099 2107
2100 if (unlikely(laptop_mode) && blk_fs_request(req)) 2108 if (unlikely(laptop_mode) && blk_fs_request(req))
2101 laptop_io_completion(); 2109 laptop_io_completion(&req->q->backing_dev_info);
2102 2110
2103 blk_delete_timer(req); 2111 blk_delete_timer(req);
2104 2112
@@ -2517,4 +2525,3 @@ int __init blk_dev_init(void)
2517 2525
2518 return 0; 2526 return 0;
2519} 2527}
2520
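
The blk-core.c hunks stamp each request with two nanosecond timestamps: start_time_ns when the request is initialised and io_start_time_ns when blk_dequeue_request() hands it to the driver, so the cgroup stats can later split total latency into queue wait and service time. A rough userspace analogue of that two-timestamp split, using CLOCK_MONOTONIC in place of the kernel's sched_clock() (the struct, names, and sleep lengths are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}

struct fake_request {
	uint64_t start_time_ns;		/* when the request was created */
	uint64_t io_start_time_ns;	/* when it was handed to "hardware" */
};

int main(void)
{
	struct fake_request rq;
	uint64_t end;

	rq.start_time_ns = now_ns();	/* blk_rq_init() analogue */
	usleep(2000);			/* queued, waiting for dispatch */
	rq.io_start_time_ns = now_ns();	/* blk_dequeue_request() analogue */
	usleep(3000);			/* the "device" services the request */
	end = now_ns();

	printf("wait time:    %llu ns\n",
	       (unsigned long long)(rq.io_start_time_ns - rq.start_time_ns));
	printf("service time: %llu ns\n",
	       (unsigned long long)(end - rq.io_start_time_ns));
	return 0;
}
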
diff --git a/block/blk-lib.c b/block/blk-lib.c
new file mode 100644
index 000000000000..d0216b9f22d4
--- /dev/null
+++ b/block/blk-lib.c
@@ -0,0 +1,233 @@
1/*
2 * Functions related to generic helper functions
3 */
4#include <linux/kernel.h>
5#include <linux/module.h>
6#include <linux/bio.h>
7#include <linux/blkdev.h>
8#include <linux/scatterlist.h>
9
10#include "blk.h"
11
12static void blkdev_discard_end_io(struct bio *bio, int err)
13{
14 if (err) {
15 if (err == -EOPNOTSUPP)
16 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
17 clear_bit(BIO_UPTODATE, &bio->bi_flags);
18 }
19
20 if (bio->bi_private)
21 complete(bio->bi_private);
22 __free_page(bio_page(bio));
23
24 bio_put(bio);
25}
26
27/**
28 * blkdev_issue_discard - queue a discard
29 * @bdev: blockdev to issue discard for
30 * @sector: start sector
31 * @nr_sects: number of sectors to discard
32 * @gfp_mask: memory allocation flags (for bio_alloc)
33 * @flags: BLKDEV_IFL_* flags to control behaviour
34 *
35 * Description:
36 * Issue a discard request for the sectors in question.
37 */
38int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
39 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
40{
41 DECLARE_COMPLETION_ONSTACK(wait);
42 struct request_queue *q = bdev_get_queue(bdev);
43 int type = flags & BLKDEV_IFL_BARRIER ?
44 DISCARD_BARRIER : DISCARD_NOBARRIER;
45 struct bio *bio;
46 struct page *page;
47 int ret = 0;
48
49 if (!q)
50 return -ENXIO;
51
52 if (!blk_queue_discard(q))
53 return -EOPNOTSUPP;
54
55 while (nr_sects && !ret) {
56 unsigned int sector_size = q->limits.logical_block_size;
57 unsigned int max_discard_sectors =
58 min(q->limits.max_discard_sectors, UINT_MAX >> 9);
59
60 bio = bio_alloc(gfp_mask, 1);
61 if (!bio)
62 goto out;
63 bio->bi_sector = sector;
64 bio->bi_end_io = blkdev_discard_end_io;
65 bio->bi_bdev = bdev;
66 if (flags & BLKDEV_IFL_WAIT)
67 bio->bi_private = &wait;
68
69 /*
70 * Add a zeroed one-sector payload as that's what
71 * our current implementations need. If we'll ever need
72 * more the interface will need revisiting.
73 */
74 page = alloc_page(gfp_mask | __GFP_ZERO);
75 if (!page)
76 goto out_free_bio;
77 if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size)
78 goto out_free_page;
79
80 /*
81 * And override the bio size - the way discard works we
82 * touch many more blocks on disk than the actual payload
83 * length.
84 */
85 if (nr_sects > max_discard_sectors) {
86 bio->bi_size = max_discard_sectors << 9;
87 nr_sects -= max_discard_sectors;
88 sector += max_discard_sectors;
89 } else {
90 bio->bi_size = nr_sects << 9;
91 nr_sects = 0;
92 }
93
94 bio_get(bio);
95 submit_bio(type, bio);
96
97 if (flags & BLKDEV_IFL_WAIT)
98 wait_for_completion(&wait);
99
100 if (bio_flagged(bio, BIO_EOPNOTSUPP))
101 ret = -EOPNOTSUPP;
102 else if (!bio_flagged(bio, BIO_UPTODATE))
103 ret = -EIO;
104 bio_put(bio);
105 }
106 return ret;
107out_free_page:
108 __free_page(page);
109out_free_bio:
110 bio_put(bio);
111out:
112 return -ENOMEM;
113}
114EXPORT_SYMBOL(blkdev_issue_discard);
115
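
blkdev_issue_discard() above walks the range in chunks of at most max_discard_sectors, itself clamped to UINT_MAX >> 9 so the byte count still fits in the 32-bit bi_size field after the << 9 shift. The chunking arithmetic in isolation, with made-up limits and 512-byte sectors assumed:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t sector = 0;
	uint64_t nr_sects = 10000;		 /* range to discard */
	unsigned int max_discard_sectors = 4096; /* per-bio limit (example) */

	while (nr_sects) {
		uint64_t this_chunk = nr_sects > max_discard_sectors ?
				      max_discard_sectors : nr_sects;

		/* one bio would cover [sector, sector + this_chunk) */
		printf("discard bio: sector=%llu sectors=%llu bytes=%llu\n",
		       (unsigned long long)sector,
		       (unsigned long long)this_chunk,
		       (unsigned long long)(this_chunk << 9));

		sector += this_chunk;
		nr_sects -= this_chunk;
	}
	return 0;
}
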
116struct bio_batch
117{
118 atomic_t done;
119 unsigned long flags;
120 struct completion *wait;
121 bio_end_io_t *end_io;
122};
123
124static void bio_batch_end_io(struct bio *bio, int err)
125{
126 struct bio_batch *bb = bio->bi_private;
127
128 if (err) {
129 if (err == -EOPNOTSUPP)
130 set_bit(BIO_EOPNOTSUPP, &bb->flags);
131 else
132 clear_bit(BIO_UPTODATE, &bb->flags);
133 }
134 if (bb) {
135 if (bb->end_io)
136 bb->end_io(bio, err);
137 atomic_inc(&bb->done);
138 complete(bb->wait);
139 }
140 bio_put(bio);
141}
142
143/**
144 * blkdev_issue_zeroout - generate a number of zero-filled write bios
145 * @bdev: blockdev to issue
146 * @sector: start sector
147 * @nr_sects: number of sectors to write
148 * @gfp_mask: memory allocation flags (for bio_alloc)
149 * @flags: BLKDEV_IFL_* flags to control behaviour
150 *
151 * Description:
152 * Generate and issue a number of bios with zero-filled pages.
153 * Send a barrier at the beginning and at the end if requested; this guarantees
154 * correct request ordering. An empty barrier allows us to avoid a post-queue flush.
155 */
156
157int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
158 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
159{
160 int ret = 0;
161 struct bio *bio;
162 struct bio_batch bb;
163 unsigned int sz, issued = 0;
164 DECLARE_COMPLETION_ONSTACK(wait);
165
166 atomic_set(&bb.done, 0);
167 bb.flags = 1 << BIO_UPTODATE;
168 bb.wait = &wait;
169 bb.end_io = NULL;
170
171 if (flags & BLKDEV_IFL_BARRIER) {
172 /* issue async barrier before the data */
173 ret = blkdev_issue_flush(bdev, gfp_mask, NULL, 0);
174 if (ret)
175 return ret;
176 }
177submit:
178 while (nr_sects != 0) {
179 bio = bio_alloc(gfp_mask,
180 min(nr_sects, (sector_t)BIO_MAX_PAGES));
181 if (!bio)
182 break;
183
184 bio->bi_sector = sector;
185 bio->bi_bdev = bdev;
186 bio->bi_end_io = bio_batch_end_io;
187 if (flags & BLKDEV_IFL_WAIT)
188 bio->bi_private = &bb;
189
190 while (nr_sects != 0) {
191 sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects);
192 if (sz == 0)
193 /* bio has maximum size possible */
194 break;
195 ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0);
196 nr_sects -= ret >> 9;
197 sector += ret >> 9;
198 if (ret < (sz << 9))
199 break;
200 }
201 issued++;
202 submit_bio(WRITE, bio);
203 }
204 /*
205	 * When all data bios are in flight, send the final barrier if requested.
206 */
207 if (nr_sects == 0 && flags & BLKDEV_IFL_BARRIER)
208 ret = blkdev_issue_flush(bdev, gfp_mask, NULL,
209 flags & BLKDEV_IFL_WAIT);
210
211
212 if (flags & BLKDEV_IFL_WAIT)
213 /* Wait for bios in-flight */
214 while ( issued != atomic_read(&bb.done))
215 wait_for_completion(&wait);
216
217 if (!test_bit(BIO_UPTODATE, &bb.flags))
218		/* One of the bios in the batch completed with an error. */
219 ret = -EIO;
220
221 if (ret)
222 goto out;
223
224 if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) {
225 ret = -EOPNOTSUPP;
226 goto out;
227 }
228 if (nr_sects != 0)
229 goto submit;
230out:
231 return ret;
232}
233EXPORT_SYMBOL(blkdev_issue_zeroout);
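
struct bio_batch gives blkdev_issue_zeroout() a simple way to wait for a variable number of asynchronous bios: count how many were issued, have each completion bump an atomic counter and signal, and loop on the completion until the two numbers meet. A userspace analogue of that issued/done pattern built on pthreads and C11 atomics (purely illustrative; compile with -pthread):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

/* Userspace analogue of struct bio_batch: count completions of async work. */
struct batch {
	atomic_int done;
	pthread_mutex_t lock;
	pthread_cond_t wait;
};

static void *fake_bio(void *arg)
{
	struct batch *bb = arg;

	usleep(1000);			/* pretend to write zeroes somewhere */
	atomic_fetch_add(&bb->done, 1);	/* like atomic_inc(&bb->done) */
	pthread_mutex_lock(&bb->lock);
	pthread_cond_signal(&bb->wait);	/* like complete(bb->wait) */
	pthread_mutex_unlock(&bb->lock);
	return NULL;
}

int main(void)
{
	enum { ISSUED = 4 };
	struct batch bb;
	pthread_t tid[ISSUED];
	int i;

	atomic_init(&bb.done, 0);
	pthread_mutex_init(&bb.lock, NULL);
	pthread_cond_init(&bb.wait, NULL);

	for (i = 0; i < ISSUED; i++)	/* "submit" the bios */
		pthread_create(&tid[i], NULL, fake_bio, &bb);

	/* Wait until every issued unit of work has signalled completion. */
	pthread_mutex_lock(&bb.lock);
	while (atomic_load(&bb.done) != ISSUED)
		pthread_cond_wait(&bb.wait, &bb.lock);
	pthread_mutex_unlock(&bb.lock);

	for (i = 0; i < ISSUED; i++)
		pthread_join(tid[i], NULL);

	printf("all %d completions received\n", ISSUED);
	return 0;
}
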
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 838834be115b..0f3eb70f9ce1 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -55,6 +55,7 @@ static const int cfq_hist_divisor = 4;
55#define RQ_CIC(rq) \ 55#define RQ_CIC(rq) \
56 ((struct cfq_io_context *) (rq)->elevator_private) 56 ((struct cfq_io_context *) (rq)->elevator_private)
57#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) 57#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2)
58#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3)
58 59
59static struct kmem_cache *cfq_pool; 60static struct kmem_cache *cfq_pool;
60static struct kmem_cache *cfq_ioc_pool; 61static struct kmem_cache *cfq_ioc_pool;
@@ -143,8 +144,6 @@ struct cfq_queue {
143 struct cfq_queue *new_cfqq; 144 struct cfq_queue *new_cfqq;
144 struct cfq_group *cfqg; 145 struct cfq_group *cfqg;
145 struct cfq_group *orig_cfqg; 146 struct cfq_group *orig_cfqg;
146 /* Sectors dispatched in current dispatch round */
147 unsigned long nr_sectors;
148}; 147};
149 148
150/* 149/*
@@ -346,7 +345,7 @@ CFQ_CFQQ_FNS(deep);
346CFQ_CFQQ_FNS(wait_busy); 345CFQ_CFQQ_FNS(wait_busy);
347#undef CFQ_CFQQ_FNS 346#undef CFQ_CFQQ_FNS
348 347
349#ifdef CONFIG_DEBUG_CFQ_IOSCHED 348#ifdef CONFIG_CFQ_GROUP_IOSCHED
350#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 349#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
351 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ 350 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
352 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ 351 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
@@ -858,7 +857,7 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
858 if (!RB_EMPTY_NODE(&cfqg->rb_node)) 857 if (!RB_EMPTY_NODE(&cfqg->rb_node))
859 cfq_rb_erase(&cfqg->rb_node, st); 858 cfq_rb_erase(&cfqg->rb_node, st);
860 cfqg->saved_workload_slice = 0; 859 cfqg->saved_workload_slice = 0;
861 blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1); 860 blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
862} 861}
863 862
864static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) 863static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
@@ -884,8 +883,7 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
884 slice_used = cfqq->allocated_slice; 883 slice_used = cfqq->allocated_slice;
885 } 884 }
886 885
887 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used, 886 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u", slice_used);
888 cfqq->nr_sectors);
889 return slice_used; 887 return slice_used;
890} 888}
891 889
@@ -919,8 +917,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
919 917
920 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, 918 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
921 st->min_vdisktime); 919 st->min_vdisktime);
922 blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl, 920 blkiocg_update_timeslice_used(&cfqg->blkg, used_sl);
923 cfqq->nr_sectors); 921 blkiocg_set_start_empty_time(&cfqg->blkg);
924} 922}
925 923
926#ifdef CONFIG_CFQ_GROUP_IOSCHED 924#ifdef CONFIG_CFQ_GROUP_IOSCHED
@@ -961,7 +959,6 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
961 if (!cfqg) 959 if (!cfqg)
962 goto done; 960 goto done;
963 961
964 cfqg->weight = blkcg->weight;
965 for_each_cfqg_st(cfqg, i, j, st) 962 for_each_cfqg_st(cfqg, i, j, st)
966 *st = CFQ_RB_ROOT; 963 *st = CFQ_RB_ROOT;
967 RB_CLEAR_NODE(&cfqg->rb_node); 964 RB_CLEAR_NODE(&cfqg->rb_node);
@@ -978,6 +975,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
978 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); 975 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
979 blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, 976 blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
980 MKDEV(major, minor)); 977 MKDEV(major, minor));
978 cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
981 979
982 /* Add group on cfqd list */ 980 /* Add group on cfqd list */
983 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); 981 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
@@ -1004,6 +1002,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
1004 return cfqg; 1002 return cfqg;
1005} 1003}
1006 1004
1005static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1006{
1007 atomic_inc(&cfqg->ref);
1008 return cfqg;
1009}
1010
1007static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) 1011static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1008{ 1012{
1009 /* Currently, all async queues are mapped to root group */ 1013 /* Currently, all async queues are mapped to root group */
@@ -1087,6 +1091,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
1087{ 1091{
1088 return &cfqd->root_group; 1092 return &cfqd->root_group;
1089} 1093}
1094
1095static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1096{
1097 return cfqg;
1098}
1099
1090static inline void 1100static inline void
1091cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { 1101cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
1092 cfqq->cfqg = cfqg; 1102 cfqq->cfqg = cfqg;
@@ -1389,7 +1399,12 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
1389{ 1399{
1390 elv_rb_del(&cfqq->sort_list, rq); 1400 elv_rb_del(&cfqq->sort_list, rq);
1391 cfqq->queued[rq_is_sync(rq)]--; 1401 cfqq->queued[rq_is_sync(rq)]--;
1402 blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq),
1403 rq_is_sync(rq));
1392 cfq_add_rq_rb(rq); 1404 cfq_add_rq_rb(rq);
1405 blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
1406 &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq),
1407 rq_is_sync(rq));
1393} 1408}
1394 1409
1395static struct request * 1410static struct request *
@@ -1445,6 +1460,8 @@ static void cfq_remove_request(struct request *rq)
1445 cfq_del_rq_rb(rq); 1460 cfq_del_rq_rb(rq);
1446 1461
1447 cfqq->cfqd->rq_queued--; 1462 cfqq->cfqd->rq_queued--;
1463 blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq),
1464 rq_is_sync(rq));
1448 if (rq_is_meta(rq)) { 1465 if (rq_is_meta(rq)) {
1449 WARN_ON(!cfqq->meta_pending); 1466 WARN_ON(!cfqq->meta_pending);
1450 cfqq->meta_pending--; 1467 cfqq->meta_pending--;
@@ -1476,6 +1493,13 @@ static void cfq_merged_request(struct request_queue *q, struct request *req,
1476 } 1493 }
1477} 1494}
1478 1495
1496static void cfq_bio_merged(struct request_queue *q, struct request *req,
1497 struct bio *bio)
1498{
1499 blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, bio_data_dir(bio),
1500 cfq_bio_sync(bio));
1501}
1502
1479static void 1503static void
1480cfq_merged_requests(struct request_queue *q, struct request *rq, 1504cfq_merged_requests(struct request_queue *q, struct request *rq,
1481 struct request *next) 1505 struct request *next)
@@ -1493,6 +1517,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
1493 if (cfqq->next_rq == next) 1517 if (cfqq->next_rq == next)
1494 cfqq->next_rq = rq; 1518 cfqq->next_rq = rq;
1495 cfq_remove_request(next); 1519 cfq_remove_request(next);
1520 blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(next),
1521 rq_is_sync(next));
1496} 1522}
1497 1523
1498static int cfq_allow_merge(struct request_queue *q, struct request *rq, 1524static int cfq_allow_merge(struct request_queue *q, struct request *rq,
@@ -1520,18 +1546,24 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
1520 return cfqq == RQ_CFQQ(rq); 1546 return cfqq == RQ_CFQQ(rq);
1521} 1547}
1522 1548
1549static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1550{
1551 del_timer(&cfqd->idle_slice_timer);
1552 blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
1553}
1554
1523static void __cfq_set_active_queue(struct cfq_data *cfqd, 1555static void __cfq_set_active_queue(struct cfq_data *cfqd,
1524 struct cfq_queue *cfqq) 1556 struct cfq_queue *cfqq)
1525{ 1557{
1526 if (cfqq) { 1558 if (cfqq) {
1527 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", 1559 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
1528 cfqd->serving_prio, cfqd->serving_type); 1560 cfqd->serving_prio, cfqd->serving_type);
1561 blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
1529 cfqq->slice_start = 0; 1562 cfqq->slice_start = 0;
1530 cfqq->dispatch_start = jiffies; 1563 cfqq->dispatch_start = jiffies;
1531 cfqq->allocated_slice = 0; 1564 cfqq->allocated_slice = 0;
1532 cfqq->slice_end = 0; 1565 cfqq->slice_end = 0;
1533 cfqq->slice_dispatch = 0; 1566 cfqq->slice_dispatch = 0;
1534 cfqq->nr_sectors = 0;
1535 1567
1536 cfq_clear_cfqq_wait_request(cfqq); 1568 cfq_clear_cfqq_wait_request(cfqq);
1537 cfq_clear_cfqq_must_dispatch(cfqq); 1569 cfq_clear_cfqq_must_dispatch(cfqq);
@@ -1539,7 +1571,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
1539 cfq_clear_cfqq_fifo_expire(cfqq); 1571 cfq_clear_cfqq_fifo_expire(cfqq);
1540 cfq_mark_cfqq_slice_new(cfqq); 1572 cfq_mark_cfqq_slice_new(cfqq);
1541 1573
1542 del_timer(&cfqd->idle_slice_timer); 1574 cfq_del_timer(cfqd, cfqq);
1543 } 1575 }
1544 1576
1545 cfqd->active_queue = cfqq; 1577 cfqd->active_queue = cfqq;
@@ -1555,7 +1587,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1555 cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); 1587 cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
1556 1588
1557 if (cfq_cfqq_wait_request(cfqq)) 1589 if (cfq_cfqq_wait_request(cfqq))
1558 del_timer(&cfqd->idle_slice_timer); 1590 cfq_del_timer(cfqd, cfqq);
1559 1591
1560 cfq_clear_cfqq_wait_request(cfqq); 1592 cfq_clear_cfqq_wait_request(cfqq);
1561 cfq_clear_cfqq_wait_busy(cfqq); 1593 cfq_clear_cfqq_wait_busy(cfqq);
@@ -1857,6 +1889,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1857 sl = cfqd->cfq_slice_idle; 1889 sl = cfqd->cfq_slice_idle;
1858 1890
1859 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 1891 mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
1892 blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
1860 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); 1893 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
1861} 1894}
1862 1895
@@ -1876,7 +1909,8 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
1876 elv_dispatch_sort(q, rq); 1909 elv_dispatch_sort(q, rq);
1877 1910
1878 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; 1911 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
1879 cfqq->nr_sectors += blk_rq_sectors(rq); 1912 blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
1913 rq_data_dir(rq), rq_is_sync(rq));
1880} 1914}
1881 1915
1882/* 1916/*
@@ -3185,11 +3219,14 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3185 if (cfq_cfqq_wait_request(cfqq)) { 3219 if (cfq_cfqq_wait_request(cfqq)) {
3186 if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || 3220 if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
3187 cfqd->busy_queues > 1) { 3221 cfqd->busy_queues > 1) {
3188 del_timer(&cfqd->idle_slice_timer); 3222 cfq_del_timer(cfqd, cfqq);
3189 cfq_clear_cfqq_wait_request(cfqq); 3223 cfq_clear_cfqq_wait_request(cfqq);
3190 __blk_run_queue(cfqd->queue); 3224 __blk_run_queue(cfqd->queue);
3191 } else 3225 } else {
3226 blkiocg_update_idle_time_stats(
3227 &cfqq->cfqg->blkg);
3192 cfq_mark_cfqq_must_dispatch(cfqq); 3228 cfq_mark_cfqq_must_dispatch(cfqq);
3229 }
3193 } 3230 }
3194 } else if (cfq_should_preempt(cfqd, cfqq, rq)) { 3231 } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
3195 /* 3232 /*
@@ -3214,7 +3251,9 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
3214 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); 3251 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
3215 list_add_tail(&rq->queuelist, &cfqq->fifo); 3252 list_add_tail(&rq->queuelist, &cfqq->fifo);
3216 cfq_add_rq_rb(rq); 3253 cfq_add_rq_rb(rq);
3217 3254 blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
3255 &cfqd->serving_group->blkg, rq_data_dir(rq),
3256 rq_is_sync(rq));
3218 cfq_rq_enqueued(cfqd, cfqq, rq); 3257 cfq_rq_enqueued(cfqd, cfqq, rq);
3219} 3258}
3220 3259
@@ -3300,6 +3339,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
3300 WARN_ON(!cfqq->dispatched); 3339 WARN_ON(!cfqq->dispatched);
3301 cfqd->rq_in_driver--; 3340 cfqd->rq_in_driver--;
3302 cfqq->dispatched--; 3341 cfqq->dispatched--;
3342 blkiocg_update_completion_stats(&cfqq->cfqg->blkg, rq_start_time_ns(rq),
3343 rq_io_start_time_ns(rq), rq_data_dir(rq),
3344 rq_is_sync(rq));
3303 3345
3304 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; 3346 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3305 3347
@@ -3440,6 +3482,10 @@ static void cfq_put_request(struct request *rq)
3440 rq->elevator_private = NULL; 3482 rq->elevator_private = NULL;
3441 rq->elevator_private2 = NULL; 3483 rq->elevator_private2 = NULL;
3442 3484
3485 /* Put down rq reference on cfqg */
3486 cfq_put_cfqg(RQ_CFQG(rq));
3487 rq->elevator_private3 = NULL;
3488
3443 cfq_put_queue(cfqq); 3489 cfq_put_queue(cfqq);
3444 } 3490 }
3445} 3491}
@@ -3528,6 +3574,7 @@ new_queue:
3528 3574
3529 rq->elevator_private = cic; 3575 rq->elevator_private = cic;
3530 rq->elevator_private2 = cfqq; 3576 rq->elevator_private2 = cfqq;
3577 rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
3531 return 0; 3578 return 0;
3532 3579
3533queue_fail: 3580queue_fail:
@@ -3870,6 +3917,7 @@ static struct elevator_type iosched_cfq = {
3870 .elevator_merged_fn = cfq_merged_request, 3917 .elevator_merged_fn = cfq_merged_request,
3871 .elevator_merge_req_fn = cfq_merged_requests, 3918 .elevator_merge_req_fn = cfq_merged_requests,
3872 .elevator_allow_merge_fn = cfq_allow_merge, 3919 .elevator_allow_merge_fn = cfq_allow_merge,
3920 .elevator_bio_merged_fn = cfq_bio_merged,
3873 .elevator_dispatch_fn = cfq_dispatch_requests, 3921 .elevator_dispatch_fn = cfq_dispatch_requests,
3874 .elevator_add_req_fn = cfq_insert_request, 3922 .elevator_add_req_fn = cfq_insert_request,
3875 .elevator_activate_req_fn = cfq_activate_request, 3923 .elevator_activate_req_fn = cfq_activate_request,
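
The new elevator_private3 pointer lets each request pin its cfq_group: cfq_ref_get_cfqg() takes a reference when the request is set up, and cfq_put_request() drops it again via cfq_put_cfqg(), so the group cannot be freed while requests still point at it. The underlying get/put refcount idiom, reduced to standalone C with illustrative names:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct group {
	atomic_int ref;
	int weight;
};

static struct group *group_get(struct group *grp)
{
	atomic_fetch_add(&grp->ref, 1);	/* like atomic_inc(&cfqg->ref) */
	return grp;
}

static void group_put(struct group *grp)
{
	/* Free only when the last reference is dropped. */
	if (atomic_fetch_sub(&grp->ref, 1) == 1) {
		printf("last reference dropped, freeing group\n");
		free(grp);
	}
}

int main(void)
{
	struct group *grp = malloc(sizeof(*grp));

	atomic_init(&grp->ref, 1);	/* creator's reference */
	grp->weight = 500;

	group_get(grp);			/* a "request" pins the group */
	group_put(grp);			/* the request completes */
	group_put(grp);			/* the creator drops its reference */
	return 0;
}
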
diff --git a/block/elevator.c b/block/elevator.c
index 76e3702d5381..5e734592bb40 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -539,6 +539,15 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
539 q->last_merge = rq; 539 q->last_merge = rq;
540} 540}
541 541
542void elv_bio_merged(struct request_queue *q, struct request *rq,
543 struct bio *bio)
544{
545 struct elevator_queue *e = q->elevator;
546
547 if (e->ops->elevator_bio_merged_fn)
548 e->ops->elevator_bio_merged_fn(q, rq, bio);
549}
550
542void elv_requeue_request(struct request_queue *q, struct request *rq) 551void elv_requeue_request(struct request_queue *q, struct request *rq)
543{ 552{
544 /* 553 /*
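
elv_bio_merged() follows the usual elevator-hook shape: the block core checks whether the scheduler registered an elevator_bio_merged_fn and only then calls it, so schedulers that do not track per-group merge statistics need no stub. The optional-callback idiom on its own, with simplified signatures that are not the kernel's:

#include <stdio.h>

struct elevator_like_ops {
	/* Optional hook: schedulers that don't care leave it NULL. */
	void (*bio_merged)(int rq_id, int bio_id);
};

static void cfq_like_bio_merged(int rq_id, int bio_id)
{
	printf("account merge of bio %d into request %d\n", bio_id, rq_id);
}

static void core_bio_merged(const struct elevator_like_ops *ops,
			    int rq_id, int bio_id)
{
	if (ops->bio_merged)		/* call only if the hook is registered */
		ops->bio_merged(rq_id, bio_id);
}

int main(void)
{
	struct elevator_like_ops with_hook = { .bio_merged = cfq_like_bio_merged };
	struct elevator_like_ops without_hook = { 0 };

	core_bio_merged(&with_hook, 1, 7);
	core_bio_merged(&without_hook, 1, 7);	/* silently skipped */
	return 0;
}
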
diff --git a/block/genhd.c b/block/genhd.c
index d13ba76a169c..154b5f80b3ab 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -596,6 +596,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno)
596 596
597 return disk; 597 return disk;
598} 598}
599EXPORT_SYMBOL(get_gendisk);
599 600
600/** 601/**
601 * bdget_disk - do bdget() by gendisk and partition number 602 * bdget_disk - do bdget() by gendisk and partition number
diff --git a/block/ioctl.c b/block/ioctl.c
index 8905d2a2a717..e8eb679f2f9b 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -126,7 +126,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
126 if (start + len > (bdev->bd_inode->i_size >> 9)) 126 if (start + len > (bdev->bd_inode->i_size >> 9))
127 return -EINVAL; 127 return -EINVAL;
128 return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, 128 return blkdev_issue_discard(bdev, start, len, GFP_KERNEL,
129 DISCARD_FL_WAIT); 129 BLKDEV_IFL_WAIT);
130} 130}
131 131
132static int put_ushort(unsigned long arg, unsigned short val) 132static int put_ushort(unsigned long arg, unsigned short val)
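
blk_ioctl_discard() is what the BLKDISCARD ioctl lands on; the hunk above only renames the wait flag it passes down to blkdev_issue_discard(). A minimal userspace caller passing the {start, length} byte range the ioctl expects; the device path is a placeholder and the call destroys data, so point it only at a disposable test device that supports discard:

#include <fcntl.h>
#include <linux/fs.h>		/* BLKDISCARD */
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	/* Placeholder path: use only a disposable test device. */
	const char *dev = "/dev/XXX";
	uint64_t range[2] = { 0, 1024 * 1024 };	/* start, length in bytes */
	int fd = open(dev, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, BLKDISCARD, &range) < 0)
		perror("BLKDISCARD");
	else
		printf("discarded %llu bytes at offset %llu\n",
		       (unsigned long long)range[1],
		       (unsigned long long)range[0]);
	close(fd);
	return 0;
}
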
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index e5e86a781820..d6f1ae342b1d 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -2251,7 +2251,8 @@ static inline void drbd_md_flush(struct drbd_conf *mdev)
2251 if (test_bit(MD_NO_BARRIER, &mdev->flags)) 2251 if (test_bit(MD_NO_BARRIER, &mdev->flags))
2252 return; 2252 return;
2253 2253
2254 r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL); 2254 r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL,
2255 BLKDEV_IFL_WAIT);
2255 if (r) { 2256 if (r) {
2256 set_bit(MD_NO_BARRIER, &mdev->flags); 2257 set_bit(MD_NO_BARRIER, &mdev->flags);
2257 dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); 2258 dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
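
Beyond the new flush signature, drbd_md_flush() illustrates a degrade-once pattern: if a metadata flush ever fails, MD_NO_BARRIER is set and every later call returns early instead of failing again. The same remember-the-failure shape as a tiny standalone sketch (all names here are made up):

#include <stdbool.h>
#include <stdio.h>

struct dev_state {
	bool no_flush;		/* set after the first failed flush */
};

/* Stand-in flush that always fails, as if barriers were unsupported. */
static int issue_flush(void)
{
	return -1;
}

static void md_flush(struct dev_state *dev)
{
	if (dev->no_flush)
		return;			/* learned earlier, don't retry */

	if (issue_flush() != 0) {
		dev->no_flush = true;	/* degrade once, remember it */
		fprintf(stderr, "flush failed, disabling future flushes\n");
	}
}

int main(void)
{
	struct dev_state dev = { .no_flush = false };

	md_flush(&dev);		/* fails and flips the flag */
	md_flush(&dev);		/* silently skipped from now on */
	return 0;
}
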
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 3f096e7959b4..c786023001d2 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -946,7 +946,8 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d
946 int rv; 946 int rv;
947 947
948 if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) { 948 if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
949 rv = blkdev_issue_flush(mdev->ldev->backing_bdev, NULL); 949 rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
950 NULL, BLKDEV_IFL_WAIT);
950 if (rv) { 951 if (rv) {
951 dev_err(DEV, "local disk flush failed with status %d\n", rv); 952 dev_err(DEV, "local disk flush failed with status %d\n", rv);
952 /* would rather check on EOPNOTSUPP, but that is not reliable. 953 /* would rather check on EOPNOTSUPP, but that is not reliable.
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6dcee88c2e5d..55dcb7884f4d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -417,7 +417,7 @@ int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
417 */ 417 */
418 mutex_unlock(&bd_inode->i_mutex); 418 mutex_unlock(&bd_inode->i_mutex);
419 419
420 error = blkdev_issue_flush(bdev, NULL); 420 error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
421 if (error == -EOPNOTSUPP) 421 if (error == -EOPNOTSUPP)
422 error = 0; 422 error = 0;
423 423
@@ -668,41 +668,209 @@ void bd_forget(struct inode *inode)
668 iput(bdev->bd_inode); 668 iput(bdev->bd_inode);
669} 669}
670 670
671int bd_claim(struct block_device *bdev, void *holder) 671/**
672 * bd_may_claim - test whether a block device can be claimed
673 * @bdev: block device of interest
674 * @whole: whole block device containing @bdev, may equal @bdev
675 * @holder: holder trying to claim @bdev
676 *
677 * Test whether @bdev can be claimed by @holder.
678 *
679 * CONTEXT:
680 * spin_lock(&bdev_lock).
681 *
682 * RETURNS:
683 * %true if @bdev can be claimed, %false otherwise.
684 */
685static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
686 void *holder)
672{ 687{
673 int res;
674 spin_lock(&bdev_lock);
675
676 /* first decide result */
677 if (bdev->bd_holder == holder) 688 if (bdev->bd_holder == holder)
678 res = 0; /* already a holder */ 689 return true; /* already a holder */
679 else if (bdev->bd_holder != NULL) 690 else if (bdev->bd_holder != NULL)
680 res = -EBUSY; /* held by someone else */ 691 return false; /* held by someone else */
681 else if (bdev->bd_contains == bdev) 692 else if (bdev->bd_contains == bdev)
682 res = 0; /* is a whole device which isn't held */ 693 return true; /* is a whole device which isn't held */
683 694
684 else if (bdev->bd_contains->bd_holder == bd_claim) 695 else if (whole->bd_holder == bd_claim)
685 res = 0; /* is a partition of a device that is being partitioned */ 696 return true; /* is a partition of a device that is being partitioned */
686 else if (bdev->bd_contains->bd_holder != NULL) 697 else if (whole->bd_holder != NULL)
687 res = -EBUSY; /* is a partition of a held device */ 698 return false; /* is a partition of a held device */
688 else 699 else
689 res = 0; /* is a partition of an un-held device */ 700 return true; /* is a partition of an un-held device */
701}
702
703/**
704 * bd_prepare_to_claim - prepare to claim a block device
705 * @bdev: block device of interest
706 * @whole: the whole device containing @bdev, may equal @bdev
707 * @holder: holder trying to claim @bdev
708 *
709 * Prepare to claim @bdev. This function fails if @bdev is already
710 * claimed by another holder and waits if another claiming is in
711 * progress. This function doesn't actually claim. On successful
712 * return, the caller has ownership of bd_claiming and bd_holder[s].
713 *
714 * CONTEXT:
715 * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab
716 * it multiple times.
717 *
718 * RETURNS:
719 * 0 if @bdev can be claimed, -EBUSY otherwise.
720 */
721static int bd_prepare_to_claim(struct block_device *bdev,
722 struct block_device *whole, void *holder)
723{
724retry:
725 /* if someone else claimed, fail */
726 if (!bd_may_claim(bdev, whole, holder))
727 return -EBUSY;
728
729 /* if someone else is claiming, wait for it to finish */
730 if (whole->bd_claiming && whole->bd_claiming != holder) {
731 wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
732 DEFINE_WAIT(wait);
733
734 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
735 spin_unlock(&bdev_lock);
736 schedule();
737 finish_wait(wq, &wait);
738 spin_lock(&bdev_lock);
739 goto retry;
740 }
741
742 /* yay, all mine */
743 return 0;
744}
745
746/**
747 * bd_start_claiming - start claiming a block device
748 * @bdev: block device of interest
749 * @holder: holder trying to claim @bdev
750 *
751 * @bdev is about to be opened exclusively. Check @bdev can be opened
752 * exclusively and mark that an exclusive open is in progress. Each
753 * successful call to this function must be matched with a call to
754 * either bd_claim() or bd_abort_claiming(). If this function
755 * succeeds, the matching bd_claim() is guaranteed to succeed.
756 *
757 * CONTEXT:
758 * Might sleep.
759 *
760 * RETURNS:
761 * Pointer to the block device containing @bdev on success, ERR_PTR()
762 * value on failure.
763 */
764static struct block_device *bd_start_claiming(struct block_device *bdev,
765 void *holder)
766{
767 struct gendisk *disk;
768 struct block_device *whole;
769 int partno, err;
770
771 might_sleep();
772
773 /*
774 * @bdev might not have been initialized properly yet, look up
775 * and grab the outer block device the hard way.
776 */
777 disk = get_gendisk(bdev->bd_dev, &partno);
778 if (!disk)
779 return ERR_PTR(-ENXIO);
780
781 whole = bdget_disk(disk, 0);
782 put_disk(disk);
783 if (!whole)
784 return ERR_PTR(-ENOMEM);
785
786 /* prepare to claim, if successful, mark claiming in progress */
787 spin_lock(&bdev_lock);
788
789 err = bd_prepare_to_claim(bdev, whole, holder);
790 if (err == 0) {
791 whole->bd_claiming = holder;
792 spin_unlock(&bdev_lock);
793 return whole;
794 } else {
795 spin_unlock(&bdev_lock);
796 bdput(whole);
797 return ERR_PTR(err);
798 }
799}
690 800
691 /* now impose change */ 801/* releases bdev_lock */
692 if (res==0) { 802static void __bd_abort_claiming(struct block_device *whole, void *holder)
803{
804 BUG_ON(whole->bd_claiming != holder);
805 whole->bd_claiming = NULL;
806 wake_up_bit(&whole->bd_claiming, 0);
807
808 spin_unlock(&bdev_lock);
809 bdput(whole);
810}
811
812/**
813 * bd_abort_claiming - abort claiming a block device
814 * @whole: whole block device returned by bd_start_claiming()
815 * @holder: holder trying to claim @bdev
816 *
817 * Abort a claiming block started by bd_start_claiming(). Note that
818 * @whole is not the block device to be claimed but the whole device
819 * returned by bd_start_claiming().
820 *
821 * CONTEXT:
822 * Grabs and releases bdev_lock.
823 */
824static void bd_abort_claiming(struct block_device *whole, void *holder)
825{
826 spin_lock(&bdev_lock);
827 __bd_abort_claiming(whole, holder); /* releases bdev_lock */
828}
829
830/**
831 * bd_claim - claim a block device
832 * @bdev: block device to claim
833 * @holder: holder trying to claim @bdev
834 *
835 * Try to claim @bdev which must have been opened successfully. This
836 * function may be called with or without preceding
837 * bd_start_claiming(). In the former case, this function is always
838 * successful and terminates the claiming block.
839 *
840 * CONTEXT:
841 * Might sleep.
842 *
843 * RETURNS:
844 * 0 if successful, -EBUSY if @bdev is already claimed.
845 */
846int bd_claim(struct block_device *bdev, void *holder)
847{
848 struct block_device *whole = bdev->bd_contains;
849 int res;
850
851 might_sleep();
852
853 spin_lock(&bdev_lock);
854
855 res = bd_prepare_to_claim(bdev, whole, holder);
856 if (res == 0) {
693 /* note that for a whole device bd_holders 857 /* note that for a whole device bd_holders
694 * will be incremented twice, and bd_holder will 858 * will be incremented twice, and bd_holder will
695 * be set to bd_claim before being set to holder 859 * be set to bd_claim before being set to holder
696 */ 860 */
697 bdev->bd_contains->bd_holders ++; 861 whole->bd_holders++;
698 bdev->bd_contains->bd_holder = bd_claim; 862 whole->bd_holder = bd_claim;
699 bdev->bd_holders++; 863 bdev->bd_holders++;
700 bdev->bd_holder = holder; 864 bdev->bd_holder = holder;
701 } 865 }
702 spin_unlock(&bdev_lock); 866
867 if (whole->bd_claiming)
868 __bd_abort_claiming(whole, holder); /* releases bdev_lock */
869 else
870 spin_unlock(&bdev_lock);
871
703 return res; 872 return res;
704} 873}
705
706EXPORT_SYMBOL(bd_claim); 874EXPORT_SYMBOL(bd_claim);
707 875
708void bd_release(struct block_device *bdev) 876void bd_release(struct block_device *bdev)
@@ -1316,6 +1484,7 @@ EXPORT_SYMBOL(blkdev_get);
1316 1484
1317static int blkdev_open(struct inode * inode, struct file * filp) 1485static int blkdev_open(struct inode * inode, struct file * filp)
1318{ 1486{
1487 struct block_device *whole = NULL;
1319 struct block_device *bdev; 1488 struct block_device *bdev;
1320 int res; 1489 int res;
1321 1490
@@ -1338,22 +1507,25 @@ static int blkdev_open(struct inode * inode, struct file * filp)
1338 if (bdev == NULL) 1507 if (bdev == NULL)
1339 return -ENOMEM; 1508 return -ENOMEM;
1340 1509
1510 if (filp->f_mode & FMODE_EXCL) {
1511 whole = bd_start_claiming(bdev, filp);
1512 if (IS_ERR(whole)) {
1513 bdput(bdev);
1514 return PTR_ERR(whole);
1515 }
1516 }
1517
1341 filp->f_mapping = bdev->bd_inode->i_mapping; 1518 filp->f_mapping = bdev->bd_inode->i_mapping;
1342 1519
1343 res = blkdev_get(bdev, filp->f_mode); 1520 res = blkdev_get(bdev, filp->f_mode);
1344 if (res)
1345 return res;
1346 1521
1347 if (filp->f_mode & FMODE_EXCL) { 1522 if (whole) {
1348 res = bd_claim(bdev, filp); 1523 if (res == 0)
1349 if (res) 1524 BUG_ON(bd_claim(bdev, filp) != 0);
1350 goto out_blkdev_put; 1525 else
1526 bd_abort_claiming(whole, filp);
1351 } 1527 }
1352 1528
1353 return 0;
1354
1355 out_blkdev_put:
1356 blkdev_put(bdev, filp->f_mode);
1357 return res; 1529 return res;
1358} 1530}
1359 1531
@@ -1564,27 +1736,34 @@ EXPORT_SYMBOL(lookup_bdev);
1564 */ 1736 */
1565struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) 1737struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
1566{ 1738{
1567 struct block_device *bdev; 1739 struct block_device *bdev, *whole;
1568 int error = 0; 1740 int error;
1569 1741
1570 bdev = lookup_bdev(path); 1742 bdev = lookup_bdev(path);
1571 if (IS_ERR(bdev)) 1743 if (IS_ERR(bdev))
1572 return bdev; 1744 return bdev;
1573 1745
1746 whole = bd_start_claiming(bdev, holder);
1747 if (IS_ERR(whole)) {
1748 bdput(bdev);
1749 return whole;
1750 }
1751
1574 error = blkdev_get(bdev, mode); 1752 error = blkdev_get(bdev, mode);
1575 if (error) 1753 if (error)
1576 return ERR_PTR(error); 1754 goto out_abort_claiming;
1755
1577 error = -EACCES; 1756 error = -EACCES;
1578 if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) 1757 if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
1579 goto blkdev_put; 1758 goto out_blkdev_put;
1580 error = bd_claim(bdev, holder);
1581 if (error)
1582 goto blkdev_put;
1583 1759
1760 BUG_ON(bd_claim(bdev, holder) != 0);
1584 return bdev; 1761 return bdev;
1585 1762
1586blkdev_put: 1763out_blkdev_put:
1587 blkdev_put(bdev, mode); 1764 blkdev_put(bdev, mode);
1765out_abort_claiming:
1766 bd_abort_claiming(whole, holder);
1588 return ERR_PTR(error); 1767 return ERR_PTR(error);
1589} 1768}
1590 1769
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b34d32fdaaec..c6a4f459ad76 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1589,7 +1589,7 @@ static void btrfs_issue_discard(struct block_device *bdev,
1589 u64 start, u64 len) 1589 u64 start, u64 len)
1590{ 1590{
1591 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 1591 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
1592 DISCARD_FL_BARRIER); 1592 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
1593} 1593}
1594 1594
1595static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1595static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 8209f266e9ad..9492f6003ef9 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -91,7 +91,8 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
91 * storage 91 * storage
92 */ 92 */
93 if (test_opt(inode->i_sb, BARRIER)) 93 if (test_opt(inode->i_sb, BARRIER))
94 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 94 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
95 BLKDEV_IFL_WAIT);
95out: 96out:
96 return ret; 97 return ret;
97} 98}
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 0d0c3239c1cd..ef3d980e67cb 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -100,9 +100,11 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
100 if (ext4_should_writeback_data(inode) && 100 if (ext4_should_writeback_data(inode) &&
101 (journal->j_fs_dev != journal->j_dev) && 101 (journal->j_fs_dev != journal->j_dev) &&
102 (journal->j_flags & JBD2_BARRIER)) 102 (journal->j_flags & JBD2_BARRIER))
103 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 103 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
104 NULL, BLKDEV_IFL_WAIT);
104 jbd2_log_wait_commit(journal, commit_tid); 105 jbd2_log_wait_commit(journal, commit_tid);
105 } else if (journal->j_flags & JBD2_BARRIER) 106 } else if (journal->j_flags & JBD2_BARRIER)
106 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 107 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
108 BLKDEV_IFL_WAIT);
107 return ret; 109 return ret;
108} 110}
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 503b842f3ba2..bf011dc63471 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -854,7 +854,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
854 if ((start + nr_sects) != blk) { 854 if ((start + nr_sects) != blk) {
855 rv = blkdev_issue_discard(bdev, start, 855 rv = blkdev_issue_discard(bdev, start,
856 nr_sects, GFP_NOFS, 856 nr_sects, GFP_NOFS,
857 DISCARD_FL_BARRIER); 857 BLKDEV_IFL_WAIT |
858 BLKDEV_IFL_BARRIER);
858 if (rv) 859 if (rv)
859 goto fail; 860 goto fail;
860 nr_sects = 0; 861 nr_sects = 0;
@@ -869,7 +870,7 @@ start_new_extent:
869 } 870 }
870 if (nr_sects) { 871 if (nr_sects) {
871 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 872 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
872 DISCARD_FL_BARRIER); 873 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
873 if (rv) 874 if (rv)
874 goto fail; 875 goto fail;
875 } 876 }
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 30beb11ef928..076d1cc44f95 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -530,7 +530,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
530 */ 530 */
531 if ((journal->j_fs_dev != journal->j_dev) && 531 if ((journal->j_fs_dev != journal->j_dev) &&
532 (journal->j_flags & JBD2_BARRIER)) 532 (journal->j_flags & JBD2_BARRIER))
533 blkdev_issue_flush(journal->j_fs_dev, NULL); 533 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
534 BLKDEV_IFL_WAIT);
534 if (!(journal->j_flags & JBD2_ABORT)) 535 if (!(journal->j_flags & JBD2_ABORT))
535 jbd2_journal_update_superblock(journal, 1); 536 jbd2_journal_update_superblock(journal, 1);
536 return 0; 537 return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 671da7fb7ffd..75716d3d2be0 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -717,7 +717,8 @@ start_journal_io:
717 if (commit_transaction->t_flushed_data_blocks && 717 if (commit_transaction->t_flushed_data_blocks &&
718 (journal->j_fs_dev != journal->j_dev) && 718 (journal->j_fs_dev != journal->j_dev) &&
719 (journal->j_flags & JBD2_BARRIER)) 719 (journal->j_flags & JBD2_BARRIER))
720 blkdev_issue_flush(journal->j_fs_dev, NULL); 720 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
721 BLKDEV_IFL_WAIT);
721 722
722 /* Done it all: now write the commit record asynchronously. */ 723 /* Done it all: now write the commit record asynchronously. */
723 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 724 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -727,7 +728,8 @@ start_journal_io:
727 if (err) 728 if (err)
728 __jbd2_journal_abort_hard(journal); 729 __jbd2_journal_abort_hard(journal);
729 if (journal->j_flags & JBD2_BARRIER) 730 if (journal->j_flags & JBD2_BARRIER)
730 blkdev_issue_flush(journal->j_dev, NULL); 731 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
732 BLKDEV_IFL_WAIT);
731 } 733 }
732 734
733 err = journal_finish_inode_data_buffers(journal, commit_transaction); 735 err = journal_finish_inode_data_buffers(journal, commit_transaction);
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 33871f7e4f01..7ffcf2b8b1f4 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -670,7 +670,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
670 start * sects_per_block, 670 start * sects_per_block,
671 nblocks * sects_per_block, 671 nblocks * sects_per_block,
672 GFP_NOFS, 672 GFP_NOFS,
673 DISCARD_FL_BARRIER); 673 BLKDEV_IFL_BARRIER);
674 if (ret < 0) 674 if (ret < 0)
675 return ret; 675 return ret;
676 nblocks = 0; 676 nblocks = 0;
@@ -680,7 +680,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
680 ret = blkdev_issue_discard(nilfs->ns_bdev, 680 ret = blkdev_issue_discard(nilfs->ns_bdev,
681 start * sects_per_block, 681 start * sects_per_block,
682 nblocks * sects_per_block, 682 nblocks * sects_per_block,
683 GFP_NOFS, DISCARD_FL_BARRIER); 683 GFP_NOFS, BLKDEV_IFL_BARRIER);
684 return ret; 684 return ret;
685} 685}
686 686
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 1d9c12714c5c..9977df9f3a54 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -147,7 +147,8 @@ static int reiserfs_sync_file(struct file *filp,
147 barrier_done = reiserfs_commit_for_inode(inode); 147 barrier_done = reiserfs_commit_for_inode(inode);
148 reiserfs_write_unlock(inode->i_sb); 148 reiserfs_write_unlock(inode->i_sb);
149 if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) 149 if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
150 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 150 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
151 BLKDEV_IFL_WAIT);
151 if (barrier_done < 0) 152 if (barrier_done < 0)
152 return barrier_done; 153 return barrier_done;
153 return (err < 0) ? -EIO : 0; 154 return (err < 0) ? -EIO : 0;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 52e06b487ced..2b177c778ba7 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -725,7 +725,8 @@ void
725xfs_blkdev_issue_flush( 725xfs_blkdev_issue_flush(
726 xfs_buftarg_t *buftarg) 726 xfs_buftarg_t *buftarg)
727{ 727{
728 blkdev_issue_flush(buftarg->bt_bdev, NULL); 728 blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL,
729 BLKDEV_IFL_WAIT);
729} 730}
730 731
731STATIC void 732STATIC void
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index bd0e3c6f323f..7534979d83bd 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -14,6 +14,7 @@
14#include <linux/kernel.h> 14#include <linux/kernel.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/timer.h>
17#include <linux/writeback.h> 18#include <linux/writeback.h>
18#include <asm/atomic.h> 19#include <asm/atomic.h>
19 20
@@ -88,6 +89,8 @@ struct backing_dev_info {
88 89
89 struct device *dev; 90 struct device *dev;
90 91
92 struct timer_list laptop_mode_wb_timer;
93
91#ifdef CONFIG_DEBUG_FS 94#ifdef CONFIG_DEBUG_FS
92 struct dentry *debug_dir; 95 struct dentry *debug_dir;
93 struct dentry *debug_stats; 96 struct dentry *debug_stats;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6690e8bae7bb..3ac2bd2fc485 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -186,15 +186,19 @@ struct request {
186 }; 186 };
187 187
188 /* 188 /*
189 * two pointers are available for the IO schedulers, if they need 189 * Three pointers are available for the IO schedulers, if they need
190 * more they have to dynamically allocate it. 190 * more they have to dynamically allocate it.
191 */ 191 */
192 void *elevator_private; 192 void *elevator_private;
193 void *elevator_private2; 193 void *elevator_private2;
194 void *elevator_private3;
194 195
195 struct gendisk *rq_disk; 196 struct gendisk *rq_disk;
196 unsigned long start_time; 197 unsigned long start_time;
197 198#ifdef CONFIG_BLK_CGROUP
199 unsigned long long start_time_ns;
200 unsigned long long io_start_time_ns; /* when passed to hardware */
201#endif
198 /* Number of scatter-gather DMA addr+len pairs after 202 /* Number of scatter-gather DMA addr+len pairs after
199 * physical address coalescing is performed. 203 * physical address coalescing is performed.
200 */ 204 */
@@ -994,20 +998,25 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
994 return NULL; 998 return NULL;
995 return bqt->tag_index[tag]; 999 return bqt->tag_index[tag];
996} 1000}
997 1001enum{
998extern int blkdev_issue_flush(struct block_device *, sector_t *); 1002 BLKDEV_WAIT, /* wait for completion */
999#define DISCARD_FL_WAIT 0x01 /* wait for completion */ 1003 BLKDEV_BARRIER, /*issue request with barrier */
1000#define DISCARD_FL_BARRIER 0x02 /* issue DISCARD_BARRIER request */ 1004};
1001extern int blkdev_issue_discard(struct block_device *, sector_t sector, 1005#define BLKDEV_IFL_WAIT (1 << BLKDEV_WAIT)
1002 sector_t nr_sects, gfp_t, int flags); 1006#define BLKDEV_IFL_BARRIER (1 << BLKDEV_BARRIER)
1003 1007extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *,
1008 unsigned long);
1009extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
1010 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
1011extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
1012 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
1004static inline int sb_issue_discard(struct super_block *sb, 1013static inline int sb_issue_discard(struct super_block *sb,
1005 sector_t block, sector_t nr_blocks) 1014 sector_t block, sector_t nr_blocks)
1006{ 1015{
1007 block <<= (sb->s_blocksize_bits - 9); 1016 block <<= (sb->s_blocksize_bits - 9);
1008 nr_blocks <<= (sb->s_blocksize_bits - 9); 1017 nr_blocks <<= (sb->s_blocksize_bits - 9);
1009 return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_KERNEL, 1018 return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_KERNEL,
1010 DISCARD_FL_BARRIER); 1019 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
1011} 1020}
1012 1021
1013extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); 1022extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
@@ -1196,6 +1205,39 @@ static inline void put_dev_sector(Sector p)
1196struct work_struct; 1205struct work_struct;
1197int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); 1206int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
1198 1207
1208#ifdef CONFIG_BLK_CGROUP
1209static inline void set_start_time_ns(struct request *req)
1210{
1211 req->start_time_ns = sched_clock();
1212}
1213
1214static inline void set_io_start_time_ns(struct request *req)
1215{
1216 req->io_start_time_ns = sched_clock();
1217}
1218
1219static inline uint64_t rq_start_time_ns(struct request *req)
1220{
1221 return req->start_time_ns;
1222}
1223
1224static inline uint64_t rq_io_start_time_ns(struct request *req)
1225{
1226 return req->io_start_time_ns;
1227}
1228#else
1229static inline void set_start_time_ns(struct request *req) {}
1230static inline void set_io_start_time_ns(struct request *req) {}
1231static inline uint64_t rq_start_time_ns(struct request *req)
1232{
1233 return 0;
1234}
1235static inline uint64_t rq_io_start_time_ns(struct request *req)
1236{
1237 return 0;
1238}
1239#endif
1240
1199#define MODULE_ALIAS_BLOCKDEV(major,minor) \ 1241#define MODULE_ALIAS_BLOCKDEV(major,minor) \
1200 MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) 1242 MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
1201#define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ 1243#define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
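
blkdev.h only grows the two timestamp fields when CONFIG_BLK_CGROUP is set; otherwise set_start_time_ns() and friends compile to empty inline stubs, so blk-core.c can call them unconditionally with no #ifdefs of its own. The compile-time stub pattern reduced to a standalone example, with FEATURE_TIMING standing in for the config option:

#include <stdint.h>
#include <stdio.h>

#define FEATURE_TIMING 1	/* flip to 0 to build the stub variants */

struct request {
	int id;
#if FEATURE_TIMING
	uint64_t start_time_ns;
#endif
};

#if FEATURE_TIMING
static inline void set_start_time_ns(struct request *rq, uint64_t now)
{
	rq->start_time_ns = now;
}
static inline uint64_t rq_start_time_ns(const struct request *rq)
{
	return rq->start_time_ns;
}
#else
/* Stubs: callers stay #ifdef-free and the compiler discards the calls. */
static inline void set_start_time_ns(struct request *rq, uint64_t now)
{
	(void)rq;
	(void)now;
}
static inline uint64_t rq_start_time_ns(const struct request *rq)
{
	(void)rq;
	return 0;
}
#endif

int main(void)
{
	struct request rq = { .id = 1 };

	set_start_time_ns(&rq, 12345);
	printf("request %d start_time_ns=%llu\n", rq.id,
	       (unsigned long long)rq_start_time_ns(&rq));
	return 0;
}
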
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 1cb3372e65d8..2c958f4fce1e 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -14,6 +14,9 @@ typedef void (elevator_merged_fn) (struct request_queue *, struct request *, int
14 14
15typedef int (elevator_allow_merge_fn) (struct request_queue *, struct request *, struct bio *); 15typedef int (elevator_allow_merge_fn) (struct request_queue *, struct request *, struct bio *);
16 16
17typedef void (elevator_bio_merged_fn) (struct request_queue *,
18 struct request *, struct bio *);
19
17typedef int (elevator_dispatch_fn) (struct request_queue *, int); 20typedef int (elevator_dispatch_fn) (struct request_queue *, int);
18 21
19typedef void (elevator_add_req_fn) (struct request_queue *, struct request *); 22typedef void (elevator_add_req_fn) (struct request_queue *, struct request *);
@@ -36,6 +39,7 @@ struct elevator_ops
36 elevator_merged_fn *elevator_merged_fn; 39 elevator_merged_fn *elevator_merged_fn;
37 elevator_merge_req_fn *elevator_merge_req_fn; 40 elevator_merge_req_fn *elevator_merge_req_fn;
38 elevator_allow_merge_fn *elevator_allow_merge_fn; 41 elevator_allow_merge_fn *elevator_allow_merge_fn;
42 elevator_bio_merged_fn *elevator_bio_merged_fn;
39 43
40 elevator_dispatch_fn *elevator_dispatch_fn; 44 elevator_dispatch_fn *elevator_dispatch_fn;
41 elevator_add_req_fn *elevator_add_req_fn; 45 elevator_add_req_fn *elevator_add_req_fn;
@@ -103,6 +107,8 @@ extern int elv_merge(struct request_queue *, struct request **, struct bio *);
103extern void elv_merge_requests(struct request_queue *, struct request *, 107extern void elv_merge_requests(struct request_queue *, struct request *,
104 struct request *); 108 struct request *);
105extern void elv_merged_request(struct request_queue *, struct request *, int); 109extern void elv_merged_request(struct request_queue *, struct request *, int);
110extern void elv_bio_merged(struct request_queue *q, struct request *,
111 struct bio *);
106extern void elv_requeue_request(struct request_queue *, struct request *); 112extern void elv_requeue_request(struct request_queue *, struct request *);
107extern int elv_queue_empty(struct request_queue *); 113extern int elv_queue_empty(struct request_queue *);
108extern struct request *elv_former_request(struct request_queue *, struct request *); 114extern struct request *elv_former_request(struct request_queue *, struct request *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 44f35aea2f1f..f30970c97acf 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -651,6 +651,7 @@ struct block_device {
651 int bd_openers; 651 int bd_openers;
652 struct mutex bd_mutex; /* open/close mutex */ 652 struct mutex bd_mutex; /* open/close mutex */
653 struct list_head bd_inodes; 653 struct list_head bd_inodes;
654 void * bd_claiming;
654 void * bd_holder; 655 void * bd_holder;
655 int bd_holders; 656 int bd_holders;
656#ifdef CONFIG_SYSFS 657#ifdef CONFIG_SYSFS
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 36520ded3e06..eb38a2c645f6 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -96,8 +96,10 @@ static inline void inode_sync_wait(struct inode *inode)
96/* 96/*
97 * mm/page-writeback.c 97 * mm/page-writeback.c
98 */ 98 */
99void laptop_io_completion(void); 99void laptop_io_completion(struct backing_dev_info *info);
100void laptop_sync_completion(void); 100void laptop_sync_completion(void);
101void laptop_mode_sync(struct work_struct *work);
102void laptop_mode_timer_fn(unsigned long data);
101void throttle_vm_writeout(gfp_t gfp_mask); 103void throttle_vm_writeout(gfp_t gfp_mask);
102 104
103/* These are exported to sysctl. */ 105/* These are exported to sysctl. */
diff --git a/init/Kconfig b/init/Kconfig
index eb77e8ccde1c..087c14f3c595 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -612,6 +612,33 @@ config RT_GROUP_SCHED
 
 endif #CGROUP_SCHED
 
+config BLK_CGROUP
+	tristate "Block IO controller"
+	depends on CGROUPS && BLOCK
+	default n
+	---help---
+	Generic block IO controller cgroup interface. This is the common
+	cgroup interface which should be used by various IO controlling
+	policies.
+
+	Currently, CFQ IO scheduler uses it to recognize task groups and
+	control disk bandwidth allocation (proportional time slice allocation)
+	to such task groups.
+
+	This option only enables generic Block IO controller infrastructure.
+	One needs to also enable actual IO controlling logic in CFQ for it
+	to take effect. (CONFIG_CFQ_GROUP_IOSCHED=y).
+
+	See Documentation/cgroups/blkio-controller.txt for more information.
+
+config DEBUG_BLK_CGROUP
+	bool "Enable Block IO controller debugging"
+	depends on BLK_CGROUP
+	default n
+	---help---
+	Enable some debugging help. Currently it exports additional stat
+	files in a cgroup which can be useful for debugging.
+
 endif # CGROUPS
 
 config MM_OWNER
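
Because BLK_CGROUP is tristate and only provides infrastructure, consumers conventionally compile their group-aware paths in only when the controller is available (built-in or modular) and fall back to stubs otherwise. A hedged sketch of that guard pattern - example_account_group_io is an illustrative name, not an interface added by this patch:

#include <linux/types.h>

#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
/* real accounting, built only when the block IO controller is available */
static void example_account_group_io(u64 bytes)
{
	/* per-cgroup accounting would go here */
}
#else
/* controller disabled: keep callers compiling with a no-op stub */
static inline void example_account_group_io(u64 bytes) { }
#endif
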
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 5b496132c28a..906a0f718cb3 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -41,6 +41,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
 	return (unsigned long long)(jiffies - INITIAL_JIFFIES)
 					* (NSEC_PER_SEC / HZ);
 }
+EXPORT_SYMBOL_GPL(sched_clock);
 
 static __read_mostly int sched_clock_running;
 
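
This one-line export matters because the block IO controller can now be built as a module: its time-based statistics sample sched_clock(), which previously was not visible to modular code. A trivial, hypothetical module-side use:

#include <linux/sched.h>	/* sched_clock() */
#include <linux/types.h>

/* hypothetical helper: elapsed nanoseconds since @start_ns was sampled */
static u64 example_elapsed_ns(u64 start_ns)
{
	return sched_clock() - start_ns;
}
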
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0b19943ecf8b..d0f2b3765f8d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -683,10 +683,6 @@ void throttle_vm_writeout(gfp_t gfp_mask)
 	}
 }
 
-static void laptop_timer_fn(unsigned long unused);
-
-static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
-
 /*
  * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
  */
@@ -697,21 +693,19 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write,
 	return 0;
 }
 
-static void do_laptop_sync(struct work_struct *work)
+void laptop_mode_timer_fn(unsigned long data)
 {
-	wakeup_flusher_threads(0);
-	kfree(work);
-}
+	struct request_queue *q = (struct request_queue *)data;
+	int nr_pages = global_page_state(NR_FILE_DIRTY) +
+		global_page_state(NR_UNSTABLE_NFS);
 
-static void laptop_timer_fn(unsigned long unused)
-{
-	struct work_struct *work;
+	/*
+	 * We want to write everything out, not just down to the dirty
+	 * threshold
+	 */
 
-	work = kmalloc(sizeof(*work), GFP_ATOMIC);
-	if (work) {
-		INIT_WORK(work, do_laptop_sync);
-		schedule_work(work);
-	}
+	if (bdi_has_dirty_io(&q->backing_dev_info))
+		bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages);
 }
 
 /*
@@ -719,9 +713,9 @@ static void laptop_timer_fn(unsigned long unused)
  * of all dirty data a few seconds from now. If the flush is already scheduled
  * then push it back - the user is still using the disk.
  */
-void laptop_io_completion(void)
+void laptop_io_completion(struct backing_dev_info *info)
 {
-	mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode);
+	mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
 }
 
 /*
@@ -731,7 +725,14 @@ void laptop_io_completion(void)
  */
 void laptop_sync_completion(void)
 {
-	del_timer(&laptop_mode_wb_timer);
+	struct backing_dev_info *bdi;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
+		del_timer(&bdi->laptop_mode_wb_timer);
+
+	rcu_read_unlock();
 }
 
 /*
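
For completeness, the producer side of this per-device timer is the request completion path: when laptop_mode is set and a filesystem request finishes, the block core re-arms the timer of that request's own backing_dev_info. A sketch of that call - the wrapper name is illustrative; the real call site lives in block/blk-core.c and may differ in detail:

#include <linux/blkdev.h>
#include <linux/writeback.h>

static void example_request_completed(struct request *req)
{
	/* defer writeback for this device only, instead of globally */
	if (laptop_mode && blk_fs_request(req))
		laptop_io_completion(&req->q->backing_dev_info);
}
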
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6cd0a8f90dc7..eb086e0f4dcc 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -139,7 +139,8 @@ static int discard_swap(struct swap_info_struct *si)
 		nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
 		if (nr_blocks) {
 			err = blkdev_issue_discard(si->bdev, start_block,
-				nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
+				nr_blocks, GFP_KERNEL,
+				BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 			if (err)
 				return err;
 			cond_resched();
@@ -150,7 +151,8 @@ static int discard_swap(struct swap_info_struct *si)
 		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
 
 		err = blkdev_issue_discard(si->bdev, start_block,
-				nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
+				nr_blocks, GFP_KERNEL,
+				BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 		if (err)
 			break;
 
@@ -189,7 +191,8 @@ static void discard_swap_cluster(struct swap_info_struct *si,
 		start_block <<= PAGE_SHIFT - 9;
 		nr_blocks <<= PAGE_SHIFT - 9;
 		if (blkdev_issue_discard(si->bdev, start_block,
-				nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER))
+				nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT |
+				BLKDEV_IFL_BARRIER))
 			break;
 	}
 
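
These call sites track the reworked blkdev_issue_discard() (now in block/blk-lib.c), whose last argument is a BLKDEV_IFL_* bitmask: BLKDEV_IFL_WAIT requests synchronous completion and BLKDEV_IFL_BARRIER preserves the old barrier behaviour. A hedged caller-side sketch - the helper name and range are illustrative only:

#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/gfp.h>

/*
 * Discard a sector range, wait for completion, and issue a barrier,
 * mirroring what the swap code above now does.
 */
static int example_discard_range(struct block_device *bdev,
				 sector_t start, sector_t nr_sects)
{
	return blkdev_issue_discard(bdev, start, nr_sects, GFP_KERNEL,
				    BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
}
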