-rw-r--r--  Documentation/block/00-INDEX          |  10
-rw-r--r--  Documentation/block/cfq-iosched.txt   |  77
-rw-r--r--  Documentation/block/queue-sysfs.txt   |  64
-rw-r--r--  block/blk-lib.c                       |  41
-rw-r--r--  block/blk-merge.c                     | 117
-rw-r--r--  block/genhd.c                         |   2
-rw-r--r--  drivers/block/drbd/drbd_bitmap.c      |  15
-rw-r--r--  drivers/block/drbd/drbd_int.h         |   1
-rw-r--r--  drivers/block/drbd/drbd_main.c        |  28
-rw-r--r--  drivers/block/drbd/drbd_nl.c          |   4
-rw-r--r--  drivers/block/drbd/drbd_req.c         |  36
-rw-r--r--  fs/bio.c                              |  11
-rw-r--r--  fs/block_dev.c                        |   3
-rw-r--r--  fs/buffer.c                           |  66
-rw-r--r--  fs/direct-io.c                        |   5
-rw-r--r--  include/linux/blkdev.h                |  14
-rw-r--r--  mm/filemap.c                          |   7
17 files changed, 378 insertions, 123 deletions
diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX
index d111e3b23db..d18ecd827c4 100644
--- a/Documentation/block/00-INDEX
+++ b/Documentation/block/00-INDEX
@@ -3,15 +3,21 @@
 biodoc.txt
   - Notes on the Generic Block Layer Rewrite in Linux 2.5
 capability.txt
-  - Generic Block Device Capability (/sys/block/<disk>/capability)
+  - Generic Block Device Capability (/sys/block/<device>/capability)
+cfq-iosched.txt
+  - CFQ IO scheduler tunables
+data-integrity.txt
+  - Block data integrity
 deadline-iosched.txt
   - Deadline IO scheduler tunables
 ioprio.txt
   - Block io priorities (in CFQ scheduler)
+queue-sysfs.txt
+  - Queue's sysfs entries
 request.txt
   - The members of struct request (in include/linux/blkdev.h)
 stat.txt
-  - Block layer statistics in /sys/block/<dev>/stat
+  - Block layer statistics in /sys/block/<device>/stat
 switching-sched.txt
   - Switching I/O schedulers at runtime
 writeback_cache_control.txt
diff --git a/Documentation/block/cfq-iosched.txt b/Documentation/block/cfq-iosched.txt
index 6d670f57045..d89b4fe724d 100644
--- a/Documentation/block/cfq-iosched.txt
+++ b/Documentation/block/cfq-iosched.txt
@@ -1,3 +1,14 @@
+CFQ (Complete Fairness Queueing)
+===============================
+
+The main aim of the CFQ scheduler is to provide a fair allocation of the disk
+I/O bandwidth for all the processes which request an I/O operation.
+
+CFQ maintains a per-process queue for the processes which request I/O
+operations (synchronous requests). In case of asynchronous requests, all the
+requests from all the processes are batched together according to the
+issuing process's I/O priority.
+
 CFQ ioscheduler tunables
 ========================
 
@@ -25,6 +36,72 @@ there are multiple spindles behind single LUN (Host based hardware RAID
 controller or for storage arrays), setting slice_idle=0 might end up in better
 throughput and acceptable latencies.
 
+back_seek_max
+-------------
+This specifies, given in Kbytes, the maximum "distance" for backward seeking.
+The distance is the amount of space from the current head location to the
+sectors that are backward in terms of distance.
+
+This parameter allows the scheduler to anticipate requests in the "backward"
+direction and consider them as being the "next" if they are within this
+distance from the current head location.
+
+back_seek_penalty
+-----------------
+This parameter is used to compute the cost of backward seeking. If the
+backward distance of a request is just 1/back_seek_penalty from a "front"
+request, then the seek cost of the two requests is considered equivalent.
+
+So the scheduler will not bias toward one or the other request (otherwise it
+would bias toward the front request). The default value of back_seek_penalty is 2.
+
+fifo_expire_async
+-----------------
+This parameter is used to set the timeout of asynchronous requests. Default
+value of this is 248ms.
+
+fifo_expire_sync
+----------------
+This parameter is used to set the timeout of synchronous requests. Default
+value of this is 124ms. To favor synchronous requests over asynchronous
+ones, this value should be decreased relative to fifo_expire_async.
+
+slice_async
+-----------
+This parameter is the same as slice_sync but for the asynchronous queue. The
+default value is 40ms.
+
+slice_async_rq
+--------------
+This parameter is used to limit the dispatching of asynchronous requests to
+the device request queue in a queue's slice time. The maximum number of requests
+that are allowed to be dispatched also depends upon the I/O priority. The default
+value for this is 2.
+
+slice_sync
+----------
+When a queue is selected for execution, the queue's I/O requests are only
+executed for a certain amount of time (time_slice) before switching to another
+queue. This parameter is used to calculate the time slice of the synchronous
+queue.
+
+time_slice is computed using the equation below:
+time_slice = slice_sync + (slice_sync/5 * (4 - prio)). To increase the
+time_slice of the synchronous queue, increase the value of slice_sync. The
+default value is 100ms.
+
+quantum
+-------
+This specifies the number of requests dispatched to the device queue. In a
+queue's time slice, a request will not be dispatched if the number of requests
+in the device exceeds this parameter. This parameter is used for synchronous
+requests.
+
+In case of storage with several disks, this setting can limit the parallel
+processing of requests. Therefore, increasing the value can improve
+performance, although this can cause the latency of some I/O to increase due
+to the larger number of requests.
+
 CFQ IOPS Mode for group scheduling
 ===================================
 Basic CFQ design is to provide priority based time slices. Higher priority
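
The slice_sync entry above gives the time_slice formula. Below is a minimal
userspace C sketch (illustration only, not part of this patch) that evaluates
the formula for each priority level with the documented default slice_sync of
100ms:

  #include <stdio.h>

  /* Evaluates time_slice = slice_sync + (slice_sync/5 * (4 - prio))
   * from the slice_sync section above, for prio levels 0..7.
   * slice_sync is in milliseconds; 100 is the documented default. */
  int main(void)
  {
          int slice_sync = 100;

          for (int prio = 0; prio <= 7; prio++)
                  printf("prio %d -> time_slice %d ms\n",
                         prio, slice_sync + slice_sync / 5 * (4 - prio));
          return 0;
  }

With the default, the highest priority (0) ends up with a 180ms slice and the
lowest (7) with a 40ms slice.
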
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index 6518a55273e..e54ac1d5340 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -9,20 +9,71 @@ These files are the ones found in the /sys/block/xxx/queue/ directory.
 Files denoted with a RO postfix are readonly and the RW postfix means
 read-write.
 
+add_random (RW)
+----------------
+This file allows one to turn off the disk entropy contribution. The default
+value of this file is '1' (on).
+
+discard_granularity (RO)
+-----------------------
+This shows the size of internal allocation of the device in bytes, if
+reported by the device. A value of '0' means the device does not support
+the discard functionality.
+
+discard_max_bytes (RO)
+----------------------
+Devices that support discard functionality may have internal limits on
+the number of bytes that can be trimmed or unmapped in a single operation.
+The discard_max_bytes parameter is set by the device driver to the maximum
+number of bytes that can be discarded in a single operation. Discard
+requests issued to the device must not exceed this limit. A discard_max_bytes
+value of 0 means that the device does not support discard functionality.
+
+discard_zeroes_data (RO)
+------------------------
+When read, this file will show if the discarded blocks are zeroed by the
+device or not. If its value is '1', the blocks are zeroed; otherwise they are not.
+
 hw_sector_size (RO)
 -------------------
 This is the hardware sector size of the device, in bytes.
 
+iostats (RW)
+-------------
+This file is used to control (on/off) the iostats accounting of the
+disk.
+
+logical_block_size (RO)
+-----------------------
+This is the logical block size of the device, in bytes.
+
 max_hw_sectors_kb (RO)
 ----------------------
 This is the maximum number of kilobytes supported in a single data transfer.
 
+max_integrity_segments (RO)
+---------------------------
+When read, this file shows the maximum number of integrity segments, as
+set by the block layer, which a hardware controller can handle.
+
 max_sectors_kb (RW)
 -------------------
 This is the maximum number of kilobytes that the block layer will allow
 for a filesystem request. Must be smaller than or equal to the maximum
 size allowed by the hardware.
 
+max_segments (RO)
+-----------------
+Maximum number of segments of the device.
+
+max_segment_size (RO)
+---------------------
+Maximum segment size of the device.
+
+minimum_io_size (RO)
+--------------------
+This is the smallest preferred io size reported by the device.
+
 nomerges (RW)
 -------------
 This enables the user to disable the lookup logic involved with IO
@@ -45,11 +96,24 @@ per-block-cgroup request pool. IOW, if there are N block cgroups,
 each request queue may have upto N request pools, each independently
 regulated by nr_requests.
 
+optimal_io_size (RO)
+--------------------
+This is the optimal io size reported by the device.
+
+physical_block_size (RO)
+------------------------
+This is the physical block size of the device, in bytes.
+
 read_ahead_kb (RW)
 ------------------
 Maximum number of kilobytes to read-ahead for filesystems on this block
 device.
 
+rotational (RW)
+---------------
+This file is used to show if the device is of rotational type or
+non-rotational type.
+
 rq_affinity (RW)
 ----------------
 If this option is '1', the block layer will migrate request completions to the
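
The attributes documented above are plain text files under
/sys/block/<dev>/queue/. A small C sketch that reads one of them (illustration
only; the device name "sda" is an assumption):

  #include <stdio.h>

  int main(void)
  {
          char buf[64];
          FILE *f = fopen("/sys/block/sda/queue/rotational", "r");

          if (!f)
                  return 1;
          if (fgets(buf, sizeof(buf), f))
                  /* "1" means rotational, "0" means non-rotational */
                  printf("rotational: %s", buf);
          fclose(f);
          return 0;
  }
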
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 2b461b496a7..19cc761cacb 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -44,6 +44,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	struct request_queue *q = bdev_get_queue(bdev);
 	int type = REQ_WRITE | REQ_DISCARD;
 	unsigned int max_discard_sectors;
+	unsigned int granularity, alignment, mask;
 	struct bio_batch bb;
 	struct bio *bio;
 	int ret = 0;
@@ -54,18 +55,20 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	if (!blk_queue_discard(q))
 		return -EOPNOTSUPP;
 
+	/* Zero-sector (unknown) and one-sector granularities are the same. */
+	granularity = max(q->limits.discard_granularity >> 9, 1U);
+	mask = granularity - 1;
+	alignment = (bdev_discard_alignment(bdev) >> 9) & mask;
+
 	/*
 	 * Ensure that max_discard_sectors is of the proper
-	 * granularity
+	 * granularity, so that requests stay aligned after a split.
 	 */
 	max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
+	max_discard_sectors = round_down(max_discard_sectors, granularity);
 	if (unlikely(!max_discard_sectors)) {
 		/* Avoid infinite loop below. Being cautious never hurts. */
 		return -EOPNOTSUPP;
-	} else if (q->limits.discard_granularity) {
-		unsigned int disc_sects = q->limits.discard_granularity >> 9;
-
-		max_discard_sectors &= ~(disc_sects - 1);
 	}
 
 	if (flags & BLKDEV_DISCARD_SECURE) {
@@ -79,25 +82,37 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	bb.wait = &wait;
 
 	while (nr_sects) {
+		unsigned int req_sects;
+		sector_t end_sect;
+
 		bio = bio_alloc(gfp_mask, 1);
 		if (!bio) {
 			ret = -ENOMEM;
 			break;
 		}
 
+		req_sects = min_t(sector_t, nr_sects, max_discard_sectors);
+
+		/*
+		 * If splitting a request, and the next starting sector would be
+		 * misaligned, stop the discard at the previous aligned sector.
+		 */
+		end_sect = sector + req_sects;
+		if (req_sects < nr_sects && (end_sect & mask) != alignment) {
+			end_sect =
+				round_down(end_sect - alignment, granularity)
+				+ alignment;
+			req_sects = end_sect - sector;
+		}
+
 		bio->bi_sector = sector;
 		bio->bi_end_io = bio_batch_end_io;
 		bio->bi_bdev = bdev;
 		bio->bi_private = &bb;
 
-		if (nr_sects > max_discard_sectors) {
-			bio->bi_size = max_discard_sectors << 9;
-			nr_sects -= max_discard_sectors;
-			sector += max_discard_sectors;
-		} else {
-			bio->bi_size = nr_sects << 9;
-			nr_sects = 0;
-		}
+		bio->bi_size = req_sects << 9;
+		nr_sects -= req_sects;
+		sector = end_sect;
 
 		atomic_inc(&bb.done);
 		submit_bio(type, bio);
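
The splitting logic above rounds each sub-discard so that the next starting
sector stays on a (granularity, alignment) boundary. A standalone C sketch of
the same arithmetic (made-up limits, illustration only, not kernel code):

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          /* all values are in 512-byte sectors and are made up */
          uint64_t sector = 7, nr_sects = 10000, max_discard_sectors = 2048;
          uint64_t granularity = 8, alignment = 7, mask = granularity - 1;

          while (nr_sects) {
                  uint64_t req_sects = nr_sects < max_discard_sectors ?
                                          nr_sects : max_discard_sectors;
                  uint64_t end_sect = sector + req_sects;

                  /* if splitting and the next start would be misaligned,
                   * stop at the previous aligned sector (round_down) */
                  if (req_sects < nr_sects && (end_sect & mask) != alignment)
                          end_sect = ((end_sect - alignment) & ~mask) + alignment;

                  req_sects = end_sect - sector;
                  printf("discard sectors %llu..%llu\n",
                         (unsigned long long)sector,
                         (unsigned long long)(end_sect - 1));
                  nr_sects -= req_sects;
                  sector = end_sect;
          }
          return 0;
  }
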
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 160035f5488..e76279e4116 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -110,6 +110,49 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
 	return 0;
 }
 
+static void
+__blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
+		     struct scatterlist *sglist, struct bio_vec **bvprv,
+		     struct scatterlist **sg, int *nsegs, int *cluster)
+{
+
+	int nbytes = bvec->bv_len;
+
+	if (*bvprv && *cluster) {
+		if ((*sg)->length + nbytes > queue_max_segment_size(q))
+			goto new_segment;
+
+		if (!BIOVEC_PHYS_MERGEABLE(*bvprv, bvec))
+			goto new_segment;
+		if (!BIOVEC_SEG_BOUNDARY(q, *bvprv, bvec))
+			goto new_segment;
+
+		(*sg)->length += nbytes;
+	} else {
+new_segment:
+		if (!*sg)
+			*sg = sglist;
+		else {
+			/*
+			 * If the driver previously mapped a shorter
+			 * list, we could see a termination bit
+			 * prematurely unless it fully inits the sg
+			 * table on each mapping. We KNOW that there
+			 * must be more entries here or the driver
+			 * would be buggy, so force clear the
+			 * termination bit to avoid doing a full
+			 * sg_init_table() in drivers for each command.
+			 */
+			(*sg)->page_link &= ~0x02;
+			*sg = sg_next(*sg);
+		}
+
+		sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
+		(*nsegs)++;
+	}
+	*bvprv = bvec;
+}
+
 /*
  * map a request to scatterlist, return number of sg entries setup. Caller
  * must make sure sg can hold rq->nr_phys_segments entries
@@ -131,41 +174,8 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
 	bvprv = NULL;
 	sg = NULL;
 	rq_for_each_segment(bvec, rq, iter) {
-		int nbytes = bvec->bv_len;
-
-		if (bvprv && cluster) {
-			if (sg->length + nbytes > queue_max_segment_size(q))
-				goto new_segment;
-
-			if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
-				goto new_segment;
-			if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
-				goto new_segment;
-
-			sg->length += nbytes;
-		} else {
-new_segment:
-			if (!sg)
-				sg = sglist;
-			else {
-				/*
-				 * If the driver previously mapped a shorter
-				 * list, we could see a termination bit
-				 * prematurely unless it fully inits the sg
-				 * table on each mapping. We KNOW that there
-				 * must be more entries here or the driver
-				 * would be buggy, so force clear the
-				 * termination bit to avoid doing a full
-				 * sg_init_table() in drivers for each command.
-				 */
-				sg->page_link &= ~0x02;
-				sg = sg_next(sg);
-			}
-
-			sg_set_page(sg, bvec->bv_page, nbytes, bvec->bv_offset);
-			nsegs++;
-		}
-		bvprv = bvec;
+		__blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg,
+				     &nsegs, &cluster);
 	} /* segments in rq */
 
 
@@ -199,6 +209,43 @@ new_segment:
 }
 EXPORT_SYMBOL(blk_rq_map_sg);
 
+/**
+ * blk_bio_map_sg - map a bio to a scatterlist
+ * @q: request_queue in question
+ * @bio: bio being mapped
+ * @sglist: scatterlist being mapped
+ *
+ * Note:
+ *    Caller must make sure sg can hold bio->bi_phys_segments entries
+ *
+ * Will return the number of sg entries setup
+ */
+int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
+		   struct scatterlist *sglist)
+{
+	struct bio_vec *bvec, *bvprv;
+	struct scatterlist *sg;
+	int nsegs, cluster;
+	unsigned long i;
+
+	nsegs = 0;
+	cluster = blk_queue_cluster(q);
+
+	bvprv = NULL;
+	sg = NULL;
+	bio_for_each_segment(bvec, bio, i) {
+		__blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg,
+				     &nsegs, &cluster);
+	} /* segments in bio */
+
+	if (sg)
+		sg_mark_end(sg);
+
+	BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments);
+	return nsegs;
+}
+EXPORT_SYMBOL(blk_bio_map_sg);
+
 static inline int ll_new_hw_segment(struct request_queue *q,
 				    struct request *req,
 				    struct bio *bio)
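
blk_bio_map_sg() mirrors blk_rq_map_sg() but works on a single bio rather than
a whole request. A hedged sketch of how a bio-based driver might call it (the
scatterlist sizing and the dma_map_sg() hand-off are illustrative assumptions,
not taken from this patch):

  #include <linux/blkdev.h>
  #include <linux/bio.h>
  #include <linux/scatterlist.h>

  /* sgl must have room for at least bio_phys_segments(q, bio) entries */
  static int example_map_bio(struct request_queue *q, struct bio *bio,
                             struct scatterlist *sgl, unsigned int max_segs)
  {
          int nsegs;

          sg_init_table(sgl, max_segs);
          nsegs = blk_bio_map_sg(q, bio, sgl);

          /* sgl[0..nsegs-1] is terminated and ready for dma_map_sg() */
          return nsegs;
  }
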
diff --git a/block/genhd.c b/block/genhd.c
index cac7366957c..d839723303c 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -835,7 +835,7 @@ static void disk_seqf_stop(struct seq_file *seqf, void *v)
 
 static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
 {
-	static void *p;
+	void *p;
 
 	p = disk_seqf_start(seqf, pos);
 	if (!IS_ERR_OR_NULL(p) && !*pos)
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index ba91b408aba..d8456649674 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -889,6 +889,7 @@ struct bm_aio_ctx {
 	unsigned int done;
 	unsigned flags;
 #define BM_AIO_COPY_PAGES	1
+#define BM_WRITE_ALL_PAGES	2
 	int error;
 	struct kref kref;
 };
@@ -1059,7 +1060,8 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w
 		if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
 			break;
 		if (rw & WRITE) {
-			if (bm_test_page_unchanged(b->bm_pages[i])) {
+			if (!(flags & BM_WRITE_ALL_PAGES) &&
+			    bm_test_page_unchanged(b->bm_pages[i])) {
 				dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i);
 				continue;
 			}
@@ -1141,6 +1143,17 @@ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
 }
 
 /**
+ * drbd_bm_write_all() - Write the whole bitmap to its on disk location.
+ * @mdev:	DRBD device.
+ *
+ * Will write all pages.
+ */
+int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local)
+{
+	return bm_rw(mdev, WRITE, BM_WRITE_ALL_PAGES, 0);
+}
+
+/**
  * drbd_bm_lazy_write_out() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
  * @mdev:	DRBD device.
  * @upper_idx:	0: write all changed pages; +ve: page index to stop scanning for changed pages
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index b2ca143d005..b953cc7c9c0 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1469,6 +1469,7 @@ extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
 extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local);
 extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
 extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
+extern int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local);
 extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local);
 extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
 		unsigned long al_enr);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index dbe6135a2ab..f93a0320e95 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -79,6 +79,7 @@ static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
 static void md_sync_timer_fn(unsigned long data);
 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
 static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+static void _tl_clear(struct drbd_conf *mdev);
 
 MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
 	      "Lars Ellenberg <lars@linbit.com>");
@@ -432,19 +433,10 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
 
 	/* Actions operating on the disk state, also want to work on
 	   requests that got barrier acked. */
-	switch (what) {
-	case fail_frozen_disk_io:
-	case restart_frozen_disk_io:
-		list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
-			req = list_entry(le, struct drbd_request, tl_requests);
-			_req_mod(req, what);
-		}
 
-	case connection_lost_while_pending:
-	case resend:
-		break;
-	default:
-		dev_err(DEV, "what = %d in _tl_restart()\n", what);
+	list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
+		req = list_entry(le, struct drbd_request, tl_requests);
+		_req_mod(req, what);
 	}
 }
 
@@ -459,11 +451,16 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
  */
 void tl_clear(struct drbd_conf *mdev)
 {
+	spin_lock_irq(&mdev->req_lock);
+	_tl_clear(mdev);
+	spin_unlock_irq(&mdev->req_lock);
+}
+
+static void _tl_clear(struct drbd_conf *mdev)
+{
 	struct list_head *le, *tle;
 	struct drbd_request *r;
 
-	spin_lock_irq(&mdev->req_lock);
-
 	_tl_restart(mdev, connection_lost_while_pending);
 
 	/* we expect this list to be empty. */
@@ -482,7 +479,6 @@ void tl_clear(struct drbd_conf *mdev)
 
 	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
 
-	spin_unlock_irq(&mdev->req_lock);
 }
 
 void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
@@ -1476,12 +1472,12 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	if (ns.susp_fen) {
 		/* case1: The outdate peer handler is successful: */
 		if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
-			tl_clear(mdev);
 			if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
 				drbd_uuid_new_current(mdev);
 				clear_bit(NEW_CUR_UUID, &mdev->flags);
 			}
 			spin_lock_irq(&mdev->req_lock);
+			_tl_clear(mdev);
 			_drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
 			spin_unlock_irq(&mdev->req_lock);
 		}
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index fb9dce8daa2..edb490aad8b 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -674,8 +674,8 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
 			la_size_changed && md_moved ? "size changed and md moved" :
 			la_size_changed ? "size changed" : "md moved");
 		/* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
-		err = drbd_bitmap_io(mdev, &drbd_bm_write,
+		err = drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
 				"size changed", BM_LOCKED_MASK);
 		if (err) {
 			rv = dev_size_error;
 			goto out;
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 910335c3092..01b2ac641c7 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -695,6 +695,12 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		break;
 
 	case resend:
+		/* Simply complete (local only) READs. */
+		if (!(req->rq_state & RQ_WRITE) && !req->w.cb) {
+			_req_may_be_done(req, m);
+			break;
+		}
+
 		/* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
 		   before the connection loss (B&C only); only P_BARRIER_ACK was missing.
 		   Trowing them out of the TL here by pretending we got a BARRIER_ACK
@@ -834,7 +840,15 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
 		req->private_bio = NULL;
 	}
 	if (rw == WRITE) {
-		remote = 1;
+		/* Need to replicate writes.  Unless it is an empty flush,
+		 * which is better mapped to a DRBD P_BARRIER packet,
+		 * also for drbd wire protocol compatibility reasons. */
+		if (unlikely(size == 0)) {
+			/* The only size==0 bios we expect are empty flushes. */
+			D_ASSERT(bio->bi_rw & REQ_FLUSH);
+			remote = 0;
+		} else
+			remote = 1;
 	} else {
 		/* READ || READA */
 		if (local) {
@@ -870,8 +884,11 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
 	 * extent. This waits for any resync activity in the corresponding
 	 * resync extent to finish, and, if necessary, pulls in the target
 	 * extent into the activity log, which involves further disk io because
-	 * of transactional on-disk meta data updates. */
-	if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) {
+	 * of transactional on-disk meta data updates.
+	 * Empty flushes don't need to go into the activity log, they can only
+	 * flush data for pending writes which are already in there. */
+	if (rw == WRITE && local && size
+	    && !test_bit(AL_SUSPENDED, &mdev->flags)) {
 		req->rq_state |= RQ_IN_ACT_LOG;
 		drbd_al_begin_io(mdev, sector);
 	}
@@ -994,7 +1011,10 @@ allocate_barrier:
 	if (rw == WRITE && _req_conflicts(req))
 		goto fail_conflicting;
 
-	list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
+	/* no point in adding empty flushes to the transfer log,
+	 * they are mapped to drbd barriers already. */
+	if (likely(size!=0))
+		list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
 
 	/* NOTE remote first: to get the concurrent write detection right,
 	 * we must register the request before start of local IO. */
@@ -1014,6 +1034,14 @@ allocate_barrier:
 	    mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96)
 		maybe_pull_ahead(mdev);
 
+	/* If this was a flush, queue a drbd barrier/start a new epoch.
+	 * Unless the current epoch was empty anyways, or we are not currently
+	 * replicating, in which case there is no point. */
+	if (unlikely(bio->bi_rw & REQ_FLUSH)
+	    && mdev->newest_tle->n_writes
+	    && drbd_should_do_remote(mdev->state))
+		queue_barrier(mdev);
+
 	spin_unlock_irq(&mdev->req_lock);
 	kfree(b); /* if someone else has beaten us to it... */
 
diff --git a/fs/bio.c b/fs/bio.c
index 5eaa70c9d96..71072ab9912 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -73,7 +73,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
 {
 	unsigned int sz = sizeof(struct bio) + extra_size;
 	struct kmem_cache *slab = NULL;
-	struct bio_slab *bslab;
+	struct bio_slab *bslab, *new_bio_slabs;
 	unsigned int i, entry = -1;
 
 	mutex_lock(&bio_slab_lock);
@@ -97,11 +97,12 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
 
 	if (bio_slab_nr == bio_slab_max && entry == -1) {
 		bio_slab_max <<= 1;
-		bio_slabs = krealloc(bio_slabs,
+		new_bio_slabs = krealloc(bio_slabs,
 				     bio_slab_max * sizeof(struct bio_slab),
 				     GFP_KERNEL);
-		if (!bio_slabs)
+		if (!new_bio_slabs)
 			goto out_unlock;
+		bio_slabs = new_bio_slabs;
 	}
 	if (entry == -1)
 		entry = bio_slab_nr++;
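
The fs/bio.c change above keeps the old bio_slabs pointer until krealloc() is
known to have succeeded, so a failed resize no longer loses the original
array. The same pattern in plain C with realloc(), as an illustration only:

  #include <stdlib.h>

  static int grow_array(int **arr, size_t *cap)
  {
          size_t new_cap = *cap ? *cap * 2 : 16;
          int *new_arr = realloc(*arr, new_cap * sizeof(**arr));

          if (!new_arr)
                  return -1;      /* *arr is still valid and unchanged */
          *arr = new_arr;
          *cap = new_cap;
          return 0;
  }
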
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1e519195d45..38e721b35d4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1578,10 +1578,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 				 unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
+	struct blk_plug plug;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
 
+	blk_start_plug(&plug);
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	if (ret > 0 || ret == -EIOCBQUEUED) {
 		ssize_t err;
@@ -1590,6 +1592,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
+	blk_finish_plug(&plug);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blkdev_aio_write);
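
The two hunks above wrap the block device write path in an on-stack plug so
that bios submitted by __generic_file_aio_write() can be merged before they
are dispatched. A minimal kernel-style sketch of the pattern (illustration
only; the function name and the 2012-era submit_bio() signature are
assumptions):

  #include <linux/blkdev.h>
  #include <linux/bio.h>

  static void example_plugged_submit(struct bio *bios[], int n)
  {
          struct blk_plug plug;
          int i;

          blk_start_plug(&plug);
          for (i = 0; i < n; i++)
                  submit_bio(WRITE, bios[i]);
          blk_finish_plug(&plug);        /* flushes the plugged I/O */
  }
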
diff --git a/fs/buffer.c b/fs/buffer.c
index 9f6d2e41281..58e2e7b7737 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -914,7 +914,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head)
 /*
  * Initialise the state of a blockdev page's buffers.
  */
-static void
+static sector_t
 init_page_buffers(struct page *page, struct block_device *bdev,
 		  sector_t block, int size)
 {
@@ -936,33 +936,41 @@ init_page_buffers(struct page *page, struct block_device *bdev,
 		block++;
 		bh = bh->b_this_page;
 	} while (bh != head);
+
+	/*
+	 * Caller needs to validate requested block against end of device.
+	 */
+	return end_block;
 }
 
 /*
  * Create the page-cache page that contains the requested block.
  *
- * This is user purely for blockdev mappings.
+ * This is used purely for blockdev mappings.
  */
-static struct page *
+static int
 grow_dev_page(struct block_device *bdev, sector_t block,
-		pgoff_t index, int size)
+		pgoff_t index, int size, int sizebits)
 {
 	struct inode *inode = bdev->bd_inode;
 	struct page *page;
 	struct buffer_head *bh;
+	sector_t end_block;
+	int ret = 0;		/* Will call free_more_memory() */
 
 	page = find_or_create_page(inode->i_mapping, index,
 		(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
 	if (!page)
-		return NULL;
+		return ret;
 
 	BUG_ON(!PageLocked(page));
 
 	if (page_has_buffers(page)) {
 		bh = page_buffers(page);
 		if (bh->b_size == size) {
-			init_page_buffers(page, bdev, block, size);
-			return page;
+			end_block = init_page_buffers(page, bdev,
+						index << sizebits, size);
+			goto done;
 		}
 		if (!try_to_free_buffers(page))
 			goto failed;
@@ -982,14 +990,14 @@ grow_dev_page(struct block_device *bdev, sector_t block,
982 */ 990 */
983 spin_lock(&inode->i_mapping->private_lock); 991 spin_lock(&inode->i_mapping->private_lock);
984 link_dev_buffers(page, bh); 992 link_dev_buffers(page, bh);
985 init_page_buffers(page, bdev, block, size); 993 end_block = init_page_buffers(page, bdev, index << sizebits, size);
986 spin_unlock(&inode->i_mapping->private_lock); 994 spin_unlock(&inode->i_mapping->private_lock);
987 return page; 995done:
988 996 ret = (block < end_block) ? 1 : -ENXIO;
989failed: 997failed:
990 unlock_page(page); 998 unlock_page(page);
991 page_cache_release(page); 999 page_cache_release(page);
992 return NULL; 1000 return ret;
993} 1001}
994 1002
995/* 1003/*
@@ -999,7 +1007,6 @@ failed:
 static int
 grow_buffers(struct block_device *bdev, sector_t block, int size)
 {
-	struct page *page;
 	pgoff_t index;
 	int sizebits;
 
@@ -1023,22 +1030,14 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
 			bdevname(bdev, b));
 		return -EIO;
 	}
-	block = index << sizebits;
+
 	/* Create a page with the proper size buffers.. */
-	page = grow_dev_page(bdev, block, index, size);
-	if (!page)
-		return 0;
-	unlock_page(page);
-	page_cache_release(page);
-	return 1;
+	return grow_dev_page(bdev, block, index, size, sizebits);
 }
 
 static struct buffer_head *
 __getblk_slow(struct block_device *bdev, sector_t block, int size)
 {
-	int ret;
-	struct buffer_head *bh;
-
 	/* Size must be multiple of hard sectorsize */
 	if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
 			(size < 512 || size > PAGE_SIZE))) {
@@ -1051,21 +1050,20 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
 		return NULL;
 	}
 
-retry:
-	bh = __find_get_block(bdev, block, size);
-	if (bh)
-		return bh;
+	for (;;) {
+		struct buffer_head *bh;
+		int ret;
 
-	ret = grow_buffers(bdev, block, size);
-	if (ret == 0) {
-		free_more_memory();
-		goto retry;
-	} else if (ret > 0) {
 		bh = __find_get_block(bdev, block, size);
 		if (bh)
 			return bh;
+
+		ret = grow_buffers(bdev, block, size);
+		if (ret < 0)
+			return NULL;
+		if (ret == 0)
+			free_more_memory();
 	}
-	return NULL;
 }
 
 /*
@@ -1321,10 +1319,6 @@ EXPORT_SYMBOL(__find_get_block);
  * which corresponds to the passed block_device, block and size. The
  * returned buffer has its reference count incremented.
  *
- * __getblk() cannot fail - it just keeps trying.  If you pass it an
- * illegal block number, __getblk() will happily return a buffer_head
- * which represents the non-existent block.  Very weird.
- *
  * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
  * attempt is failing.  FIXME, perhaps?
  */
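
With grow_dev_page() now returning -ENXIO for blocks past the end of the
device, __getblk() can return NULL instead of fabricating a buffer_head for a
non-existent block (hence the removed comment). A small kernel-style sketch of
a caller that copes with that (illustrative, not from this patch):

  #include <linux/buffer_head.h>

  static struct buffer_head *example_get_block(struct block_device *bdev,
                                               sector_t block, unsigned size)
  {
          struct buffer_head *bh = __getblk(bdev, block, size);

          if (!bh)
                  return NULL;    /* block beyond end of device (or OOM) */
          /* ... use bh, then brelse(bh) when done ... */
          return bh;
  }
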
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 1faf4cb56f3..f86c720dba0 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1062,6 +1062,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	unsigned long user_addr;
 	size_t bytes;
 	struct buffer_head map_bh = { 0, };
+	struct blk_plug plug;
 
 	if (rw & WRITE)
 		rw = WRITE_ODIRECT;
@@ -1177,6 +1178,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 				      PAGE_SIZE - user_addr / PAGE_SIZE);
 	}
 
+	blk_start_plug(&plug);
+
 	for (seg = 0; seg < nr_segs; seg++) {
 		user_addr = (unsigned long)iov[seg].iov_base;
 		sdio.size += bytes = iov[seg].iov_len;
@@ -1235,6 +1238,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	if (sdio.bio)
 		dio_bio_submit(dio, &sdio);
 
+	blk_finish_plug(&plug);
+
 	/*
 	 * It is possible that, we return short IO due to end of file.
 	 * In that case, we need to release all the pages we got hold on.
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4e72a9d4823..4a2ab7c8539 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -601,7 +601,7 @@ static inline void blk_clear_rl_full(struct request_list *rl, bool sync)
  *	it already be started by driver.
  */
 #define RQ_NOMERGE_FLAGS	\
-	(REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA)
+	(REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA | REQ_DISCARD)
 #define rq_mergeable(rq)	\
 	(!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \
 	 (((rq)->cmd_flags & REQ_DISCARD) || \
@@ -894,6 +894,8 @@ extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
 
 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
+extern int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
+			  struct scatterlist *sglist);
 extern void blk_dump_rq_flags(struct request *, char *);
 extern long nr_blockdev_pages(void);
 
@@ -1139,6 +1141,16 @@ static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector
 		& (lim->discard_granularity - 1);
 }
 
+static inline int bdev_discard_alignment(struct block_device *bdev)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	if (bdev != bdev->bd_contains)
+		return bdev->bd_part->discard_alignment;
+
+	return q->limits.discard_alignment;
+}
+
 static inline unsigned int queue_discard_zeroes_data(struct request_queue *q)
 {
 	if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1)
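
bdev_discard_alignment() above returns the partition's discard alignment when
the block_device refers to a partition, and the queue limit otherwise. A short
sketch of how blkdev_issue_discard()-style code converts it to sectors
relative to the discard granularity (illustration only):

  #include <linux/blkdev.h>

  static unsigned int example_discard_alignment_sectors(struct block_device *bdev)
  {
          struct request_queue *q = bdev_get_queue(bdev);
          unsigned int granularity = max(q->limits.discard_granularity >> 9, 1U);

          return (bdev_discard_alignment(bdev) >> 9) & (granularity - 1);
  }
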
diff --git a/mm/filemap.c b/mm/filemap.c
index fa5ca304148..384344575c3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1412,12 +1412,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 		retval = filemap_write_and_wait_range(mapping, pos,
 					pos + iov_length(iov, nr_segs) - 1);
 		if (!retval) {
-			struct blk_plug plug;
-
-			blk_start_plug(&plug);
 			retval = mapping->a_ops->direct_IO(READ, iocb,
 							iov, pos, nr_segs);
-			blk_finish_plug(&plug);
 		}
 		if (retval > 0) {
 			*ppos = pos + retval;
@@ -2527,14 +2523,12 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
-	struct blk_plug plug;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
 
 	sb_start_write(inode->i_sb);
 	mutex_lock(&inode->i_mutex);
-	blk_start_plug(&plug);
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);
 
@@ -2545,7 +2539,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
-	blk_finish_plug(&plug);
 	sb_end_write(inode->i_sb);
 	return ret;
 }