author	Linus Torvalds <torvalds@linux-foundation.org>	2011-03-24 13:16:26 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-03-24 13:16:26 -0400
commit	6c5103890057b1bb781b26b7aae38d33e4c517d8 (patch)
tree	e6e57961dcddcb5841acb34956e70b9dc696a880
parent	3dab04e6978e358ad2307bca563fabd6c5d2c58b (diff)
parent	9d2e157d970a73b3f270b631828e03eb452d525e (diff)
Merge branch 'for-2.6.39/core' of git://git.kernel.dk/linux-2.6-block
* 'for-2.6.39/core' of git://git.kernel.dk/linux-2.6-block: (65 commits)
  Documentation/iostats.txt: bit-size reference etc.
  cfq-iosched: removing unnecessary think time checking
  cfq-iosched: Don't clear queue stats when preempt.
  blk-throttle: Reset group slice when limits are changed
  blk-cgroup: Only give unaccounted_time under debug
  cfq-iosched: Don't set active queue in preempt
  block: fix non-atomic access to genhd inflight structures
  block: attempt to merge with existing requests on plug flush
  block: NULL dereference on error path in __blkdev_get()
  cfq-iosched: Don't update group weights when on service tree
  fs: assign sb->s_bdi to default_backing_dev_info if the bdi is going away
  block: Require subsystems to explicitly allocate bio_set integrity mempool
  jbd2: finish conversion from WRITE_SYNC_PLUG to WRITE_SYNC and explicit plugging
  jbd: finish conversion from WRITE_SYNC_PLUG to WRITE_SYNC and explicit plugging
  fs: make fsync_buffers_list() plug
  mm: make generic_writepages() use plugging
  blk-cgroup: Add unaccounted time to timeslice_used.
  block: fixup plugging stubs for !CONFIG_BLOCK
  block: remove obsolete comments for blkdev_issue_zeroout.
  blktrace: Use rq->cmd_flags directly in blk_add_trace_rq.
  ...

Fix up conflicts in fs/{aio.c,super.c}
 Documentation/block/biodoc.txt | 5
 Documentation/cgroups/blkio-controller.txt | 30
 Documentation/iostats.txt | 17
 block/blk-cgroup.c | 16
 block/blk-cgroup.h | 14
 block/blk-core.c | 646
 block/blk-exec.c | 4
 block/blk-flush.c | 439
 block/blk-lib.c | 2
 block/blk-merge.c | 6
 block/blk-settings.c | 15
 block/blk-sysfs.c | 2
 block/blk-throttle.c | 139
 block/blk.h | 16
 block/cfq-iosched.c | 163
 block/cfq.h | 6
 block/deadline-iosched.c | 9
 block/elevator.c | 108
 block/genhd.c | 18
 block/noop-iosched.c | 8
 drivers/block/DAC960.c | 8
 drivers/block/amiflop.c | 9
 drivers/block/ataflop.c | 14
 drivers/block/cciss.c | 6
 drivers/block/cpqarray.c | 3
 drivers/block/drbd/drbd_actlog.c | 4
 drivers/block/drbd/drbd_bitmap.c | 1
 drivers/block/drbd/drbd_int.h | 16
 drivers/block/drbd/drbd_main.c | 36
 drivers/block/drbd/drbd_receiver.c | 29
 drivers/block/drbd/drbd_req.c | 4
 drivers/block/drbd/drbd_worker.c | 1
 drivers/block/drbd/drbd_wrappers.h | 18
 drivers/block/floppy.c | 11
 drivers/block/loop.c | 16
 drivers/block/paride/pcd.c | 18
 drivers/block/paride/pd.c | 7
 drivers/block/paride/pf.c | 10
 drivers/block/pktcdvd.c | 15
 drivers/block/swim.c | 8
 drivers/block/swim3.c | 11
 drivers/block/ub.c | 10
 drivers/block/umem.c | 26
 drivers/block/xsysace.c | 9
 drivers/cdrom/gdrom.c | 16
 drivers/cdrom/viocd.c | 17
 drivers/ide/ide-atapi.c | 3
 drivers/ide/ide-cd.c | 23
 drivers/ide/ide-cd.h | 3
 drivers/ide/ide-cd_ioctl.c | 8
 drivers/ide/ide-gd.c | 14
 drivers/ide/ide-io.c | 4
 drivers/ide/ide-park.c | 2
 drivers/md/bitmap.c | 5
 drivers/md/dm-crypt.c | 9
 drivers/md/dm-io.c | 2
 drivers/md/dm-kcopyd.c | 55
 drivers/md/dm-raid.c | 2
 drivers/md/dm-raid1.c | 2
 drivers/md/dm-table.c | 31
 drivers/md/dm.c | 52
 drivers/md/dm.h | 2
 drivers/md/linear.c | 20
 drivers/md/md.c | 20
 drivers/md/multipath.c | 38
 drivers/md/raid0.c | 19
 drivers/md/raid1.c | 91
 drivers/md/raid10.c | 97
 drivers/md/raid5.c | 63
 drivers/md/raid5.h | 2
 drivers/message/i2o/i2o_block.c | 17
 drivers/mmc/card/queue.c | 3
 drivers/s390/block/dasd.c | 2
 drivers/s390/char/tape_block.c | 12
 drivers/scsi/scsi_lib.c | 44
 drivers/scsi/scsi_transport_fc.c | 2
 drivers/scsi/scsi_transport_sas.c | 6
 drivers/staging/hv/blkvsc_drv.c | 11
 drivers/staging/westbridge/astoria/block/cyasblkdev_block.c | 11
 drivers/target/target_core_iblock.c | 7
 fs/adfs/inode.c | 1
 fs/affs/file.c | 2
 fs/aio.c | 77
 fs/befs/linuxvfs.c | 1
 fs/bfs/file.c | 1
 fs/bio-integrity.c | 3
 fs/bio.c | 10
 fs/block_dev.c | 27
 fs/btrfs/disk-io.c | 79
 fs/btrfs/extent_io.c | 2
 fs/btrfs/inode.c | 1
 fs/btrfs/volumes.c | 91
 fs/buffer.c | 51
 fs/cifs/file.c | 30
 fs/direct-io.c | 7
 fs/efs/inode.c | 1
 fs/exofs/inode.c | 1
 fs/ext2/inode.c | 2
 fs/ext3/inode.c | 3
 fs/ext4/inode.c | 4
 fs/ext4/page-io.c | 3
 fs/fat/inode.c | 1
 fs/freevxfs/vxfs_subr.c | 1
 fs/fuse/inode.c | 1
 fs/gfs2/aops.c | 3
 fs/gfs2/log.c | 4
 fs/gfs2/lops.c | 12
 fs/gfs2/meta_io.c | 3
 fs/hfs/inode.c | 2
 fs/hfsplus/inode.c | 2
 fs/hpfs/file.c | 1
 fs/isofs/inode.c | 1
 fs/jbd/commit.c | 22
 fs/jbd2/commit.c | 22
 fs/jfs/inode.c | 1
 fs/jfs/jfs_metapage.c | 1
 fs/logfs/dev_bdev.c | 2
 fs/minix/inode.c | 1
 fs/mpage.c | 8
 fs/nilfs2/btnode.c | 7
 fs/nilfs2/gcinode.c | 1
 fs/nilfs2/inode.c | 1
 fs/nilfs2/mdt.c | 9
 fs/nilfs2/page.c | 5
 fs/nilfs2/page.h | 3
 fs/nilfs2/segbuf.c | 2
 fs/ntfs/aops.c | 4
 fs/ntfs/compress.c | 3
 fs/ocfs2/aops.c | 1
 fs/ocfs2/cluster/heartbeat.c | 4
 fs/omfs/file.c | 1
 fs/partitions/check.c | 3
 fs/qnx4/inode.c | 1
 fs/reiserfs/inode.c | 1
 fs/super.c | 2
 fs/sync.c | 4
 fs/sysv/itree.c | 1
 fs/ubifs/super.c | 1
 fs/udf/file.c | 1
 fs/udf/inode.c | 1
 fs/ufs/inode.c | 1
 fs/ufs/truncate.c | 2
 fs/xfs/linux-2.6/xfs_aops.c | 4
 fs/xfs/linux-2.6/xfs_buf.c | 13
 include/linux/backing-dev.h | 16
 include/linux/bio.h | 1
 include/linux/blk_types.h | 6
 include/linux/blkdev.h | 101
 include/linux/buffer_head.h | 1
 include/linux/device-mapper.h | 5
 include/linux/elevator.h | 10
 include/linux/fs.h | 29
 include/linux/genhd.h | 12
 include/linux/pagemap.h | 12
 include/linux/sched.h | 6
 include/linux/swap.h | 2
 kernel/exit.c | 1
 kernel/fork.c | 3
 kernel/power/block_io.c | 2
 kernel/sched.c | 12
 kernel/trace/blktrace.c | 15
 mm/backing-dev.c | 8
 mm/filemap.c | 74
 mm/memory-failure.c | 8
 mm/nommu.c | 4
 mm/page-writeback.c | 10
 mm/page_io.c | 2
 mm/readahead.c | 18
 mm/shmem.c | 1
 mm/swap_state.c | 5
 mm/swapfile.c | 37
 mm/vmscan.c | 2
 172 files changed, 1520 insertions(+), 2112 deletions(-)
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index b9a83dd24732..2a7b38c832c7 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -963,11 +963,6 @@ elevator_dispatch_fn* fills the dispatch queue with ready requests.
 
 elevator_add_req_fn*		called to add a new request into the scheduler
 
-elevator_queue_empty_fn	returns true if the merge queue is empty.
-			Drivers shouldn't use this, but rather check
-			if elv_next_request is NULL (without losing the
-			request if one exists!)
-
 elevator_former_req_fn
 elevator_latter_req_fn	These return the request before or after the
 			one specified in disk sort order. Used by the
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 4ed7b5ceeed2..465351d4cf85 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -140,7 +140,7 @@ Proportional weight policy files
 	  - Specifies per cgroup weight. This is default weight of the group
 	    on all the devices until and unless overridden by per device rule.
 	    (See blkio.weight_device).
-	    Currently allowed range of weights is from 100 to 1000.
+	    Currently allowed range of weights is from 10 to 1000.
 
 - blkio.weight_device
 	- One can specify per cgroup per device rules using this interface.
@@ -343,34 +343,6 @@ Common files among various policies
 
 CFQ sysfs tunable
 =================
-/sys/block/<disk>/queue/iosched/group_isolation
------------------------------------------------
-
-If group_isolation=1, it provides stronger isolation between groups at the
-expense of throughput. By default group_isolation is 0. In general that
-means that if group_isolation=0, expect fairness for sequential workload
-only. Set group_isolation=1 to see fairness for random IO workload also.
-
-Generally CFQ will put random seeky workload in sync-noidle category. CFQ
-will disable idling on these queues and it does a collective idling on group
-of such queues. Generally these are slow moving queues and if there is a
-sync-noidle service tree in each group, that group gets exclusive access to
-disk for certain period. That means it will bring the throughput down if
-group does not have enough IO to drive deeper queue depths and utilize disk
-capacity to the fullest in the slice allocated to it. But the flip side is
-that even a random reader should get better latencies and overall throughput
-if there are lots of sequential readers/sync-idle workload running in the
-system.
-
-If group_isolation=0, then CFQ automatically moves all the random seeky queues
-in the root group. That means there will be no service differentiation for
-that kind of workload. This leads to better throughput as we do collective
-idling on root sync-noidle tree.
-
-By default one should run with group_isolation=0. If that is not sufficient
-and one wants stronger isolation between groups, then set group_isolation=1
-but this will come at cost of reduced throughput.
-
 /sys/block/<disk>/queue/iosched/slice_idle
 ------------------------------------------
 On a faster hardware CFQ can be slow, especially with sequential workload.
diff --git a/Documentation/iostats.txt b/Documentation/iostats.txt
index f6dece5b7014..c76c21d87e85 100644
--- a/Documentation/iostats.txt
+++ b/Documentation/iostats.txt
@@ -1,8 +1,6 @@
 I/O statistics fields
 ---------------
 
-Last modified Sep 30, 2003
-
 Since 2.4.20 (and some versions before, with patches), and 2.5.45,
 more extensive disk statistics have been introduced to help measure disk
 activity. Tools such as sar and iostat typically interpret these and do
@@ -46,11 +44,12 @@ the above example, the first field of statistics would be 446216.
 By contrast, in 2.6 if you look at /sys/block/hda/stat, you'll
 find just the eleven fields, beginning with 446216. If you look at
 /proc/diskstats, the eleven fields will be preceded by the major and
-minor device numbers, and device name. Each of these formats provide
+minor device numbers, and device name. Each of these formats provides
 eleven fields of statistics, each meaning exactly the same things.
 All fields except field 9 are cumulative since boot. Field 9 should
-go to zero as I/Os complete; all others only increase. Yes, these are
-32 bit unsigned numbers, and on a very busy or long-lived system they
+go to zero as I/Os complete; all others only increase (unless they
+overflow and wrap). Yes, these are (32-bit or 64-bit) unsigned long
+(native word size) numbers, and on a very busy or long-lived system they
 may wrap. Applications should be prepared to deal with that; unless
 your observations are measured in large numbers of minutes or hours,
 they should not wrap twice before you notice them.
@@ -96,11 +95,11 @@ introduced when changes collide, so (for instance) adding up all the
 read I/Os issued per partition should equal those made to the disks ...
 but due to the lack of locking it may only be very close.
 
-In 2.6, there are counters for each cpu, which made the lack of locking
-almost a non-issue. When the statistics are read, the per-cpu counters
-are summed (possibly overflowing the unsigned 32-bit variable they are
+In 2.6, there are counters for each CPU, which make the lack of locking
+almost a non-issue. When the statistics are read, the per-CPU counters
+are summed (possibly overflowing the unsigned long variable they are
 summed to) and the result given to the user. There is no convenient
-user interface for accessing the per-cpu counters themselves.
+user interface for accessing the per-CPU counters themselves.
 
 Disks vs Partitions
 -------------------
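The wrap-around caveat above is worth seeing concretely. Below is a small, hedged sketch (not from the kernel tree; counter_delta is a hypothetical helper) of how a monitoring tool sampling /proc/diskstats might compute per-interval deltas; with a 32-bit unsigned long, plain unsigned subtraction absorbs a single wrap between samples:

#include <stdio.h>

/*
 * Hypothetical helper: delta between two samples of a cumulative
 * /proc/diskstats field. With unsigned arithmetic, (cur - prev) is
 * still correct if the counter wrapped once between the samples.
 * The example values assume a 32-bit unsigned long.
 */
static unsigned long counter_delta(unsigned long prev, unsigned long cur)
{
	return cur - prev;	/* modular subtraction absorbs one wrap */
}

int main(void)
{
	unsigned long prev = 4294967290UL;	/* sample taken just before wrap */
	unsigned long cur = 6;			/* counter wrapped to a small value */

	/* prints 12 on a 32-bit unsigned long */
	printf("delta = %lu\n", counter_delta(prev, cur));
	return 0;
}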
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 455768a3eb9e..2bef5705ce24 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -371,12 +371,14 @@ void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
 
-void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
+void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
+				unsigned long unaccounted_time)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&blkg->stats_lock, flags);
 	blkg->stats.time += time;
+	blkg->stats.unaccounted_time += unaccounted_time;
 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
@@ -604,6 +606,9 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
 					blkg->stats.sectors, cb, dev);
 #ifdef CONFIG_DEBUG_BLK_CGROUP
+	if (type == BLKIO_STAT_UNACCOUNTED_TIME)
+		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+					blkg->stats.unaccounted_time, cb, dev);
 	if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
 		uint64_t sum = blkg->stats.avg_queue_size_sum;
 		uint64_t samples = blkg->stats.avg_queue_size_samples;
@@ -1125,6 +1130,9 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
 		return blkio_read_blkg_stats(blkcg, cft, cb,
 						BLKIO_STAT_QUEUED, 1);
 #ifdef CONFIG_DEBUG_BLK_CGROUP
+	case BLKIO_PROP_unaccounted_time:
+		return blkio_read_blkg_stats(blkcg, cft, cb,
+						BLKIO_STAT_UNACCOUNTED_TIME, 0);
 	case BLKIO_PROP_dequeue:
 		return blkio_read_blkg_stats(blkcg, cft, cb,
 						BLKIO_STAT_DEQUEUE, 0);
@@ -1382,6 +1390,12 @@ struct cftype blkio_files[] = {
 				BLKIO_PROP_dequeue),
 		.read_map = blkiocg_file_read_map,
 	},
+	{
+		.name = "unaccounted_time",
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+				BLKIO_PROP_unaccounted_time),
+		.read_map = blkiocg_file_read_map,
+	},
 #endif
 };
 
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index ea4861bdd549..10919fae2d3a 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -49,6 +49,8 @@ enum stat_type {
 	/* All the single valued stats go below this */
 	BLKIO_STAT_TIME,
 	BLKIO_STAT_SECTORS,
+	/* Time not charged to this cgroup */
+	BLKIO_STAT_UNACCOUNTED_TIME,
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 	BLKIO_STAT_AVG_QUEUE_SIZE,
 	BLKIO_STAT_IDLE_TIME,
@@ -81,6 +83,7 @@ enum blkcg_file_name_prop {
 	BLKIO_PROP_io_serviced,
 	BLKIO_PROP_time,
 	BLKIO_PROP_sectors,
+	BLKIO_PROP_unaccounted_time,
 	BLKIO_PROP_io_service_time,
 	BLKIO_PROP_io_wait_time,
 	BLKIO_PROP_io_merged,
@@ -114,6 +117,8 @@ struct blkio_group_stats {
 	/* total disk time and nr sectors dispatched by this group */
 	uint64_t time;
 	uint64_t sectors;
+	/* Time not charged to this cgroup */
+	uint64_t unaccounted_time;
 	uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL];
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 	/* Sum of number of IOs queued across all samples */
@@ -240,7 +245,7 @@ static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
 
 #endif
 
-#define BLKIO_WEIGHT_MIN	100
+#define BLKIO_WEIGHT_MIN	10
 #define BLKIO_WEIGHT_MAX	1000
 #define BLKIO_WEIGHT_DEFAULT	500
 
@@ -293,7 +298,8 @@ extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
 extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
 						void *key);
 void blkiocg_update_timeslice_used(struct blkio_group *blkg,
-					unsigned long time);
+					unsigned long time,
+					unsigned long unaccounted_time);
 void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes,
 						bool direction, bool sync);
 void blkiocg_update_completion_stats(struct blkio_group *blkg,
@@ -319,7 +325,9 @@ blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
 static inline struct blkio_group *
 blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
 static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
-						unsigned long time) {}
+						unsigned long time,
+						unsigned long unaccounted_time)
+{}
 static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
 				uint64_t bytes, bool direction, bool sync) {}
 static inline void blkiocg_update_completion_stats(struct blkio_group *blkg,
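For orientation, the unaccounted_time stat added above surfaces as a read-only per-cgroup file next to blkio.time. A hedged userspace sketch follows; the mount point and group name are assumptions, and per the #ifdef in the hunks above the file only exists when CONFIG_DEBUG_BLK_CGROUP is enabled:

#include <stdio.h>

/*
 * Sketch only: reads the new blkio.unaccounted_time file. The cgroup
 * mount point and group name below are assumptions, not part of the
 * patch; adjust for the local cgroup hierarchy.
 */
int main(void)
{
	const char *path = "/sys/fs/cgroup/blkio/mygroup/blkio.unaccounted_time";
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* "<major>:<minor> <time>" lines */
	fclose(f);
	return 0;
}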
diff --git a/block/blk-core.c b/block/blk-core.c
index a63336d49f30..59b5c00c0126 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -27,6 +27,7 @@
 #include <linux/writeback.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/fault-inject.h>
+#include <linux/list_sort.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/block.h>
@@ -149,39 +150,29 @@ EXPORT_SYMBOL(blk_rq_init);
 static void req_bio_endio(struct request *rq, struct bio *bio,
 			  unsigned int nbytes, int error)
 {
-	struct request_queue *q = rq->q;
-
-	if (&q->flush_rq != rq) {
-		if (error)
-			clear_bit(BIO_UPTODATE, &bio->bi_flags);
-		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
-			error = -EIO;
-
-		if (unlikely(nbytes > bio->bi_size)) {
-			printk(KERN_ERR "%s: want %u bytes done, %u left\n",
-			       __func__, nbytes, bio->bi_size);
-			nbytes = bio->bi_size;
-		}
+	if (error)
+		clear_bit(BIO_UPTODATE, &bio->bi_flags);
+	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+		error = -EIO;
 
-		if (unlikely(rq->cmd_flags & REQ_QUIET))
-			set_bit(BIO_QUIET, &bio->bi_flags);
+	if (unlikely(nbytes > bio->bi_size)) {
+		printk(KERN_ERR "%s: want %u bytes done, %u left\n",
+		       __func__, nbytes, bio->bi_size);
+		nbytes = bio->bi_size;
+	}
 
-		bio->bi_size -= nbytes;
-		bio->bi_sector += (nbytes >> 9);
+	if (unlikely(rq->cmd_flags & REQ_QUIET))
+		set_bit(BIO_QUIET, &bio->bi_flags);
 
-		if (bio_integrity(bio))
-			bio_integrity_advance(bio, nbytes);
+	bio->bi_size -= nbytes;
+	bio->bi_sector += (nbytes >> 9);
 
-		if (bio->bi_size == 0)
-			bio_endio(bio, error);
-	} else {
-		/*
-		 * Okay, this is the sequenced flush request in
-		 * progress, just record the error;
-		 */
-		if (error && !q->flush_err)
-			q->flush_err = error;
-	}
+	if (bio_integrity(bio))
+		bio_integrity_advance(bio, nbytes);
+
+	/* don't actually finish bio if it's part of flush sequence */
+	if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
+		bio_endio(bio, error);
 }
 
 void blk_dump_rq_flags(struct request *rq, char *msg)
@@ -208,135 +199,43 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
 EXPORT_SYMBOL(blk_dump_rq_flags);
 
 /*
- * "plug" the device if there are no outstanding requests: this will
- * force the transfer to start only after we have put all the requests
- * on the list.
- *
- * This is called with interrupts off and no requests on the queue and
- * with the queue lock held.
- */
-void blk_plug_device(struct request_queue *q)
+ * Make sure that plugs that were pending when this function was entered,
+ * are now complete and requests pushed to the queue.
+*/
+static inline void queue_sync_plugs(struct request_queue *q)
 {
-	WARN_ON(!irqs_disabled());
-
 	/*
-	 * don't plug a stopped queue, it must be paired with blk_start_queue()
-	 * which will restart the queueing
+	 * If the current process is plugged and has barriers submitted,
+	 * we will livelock if we don't unplug first.
 	 */
-	if (blk_queue_stopped(q))
-		return;
-
-	if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) {
-		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
-		trace_block_plug(q);
-	}
-}
-EXPORT_SYMBOL(blk_plug_device);
-
-/**
- * blk_plug_device_unlocked - plug a device without queue lock held
- * @q:    The &struct request_queue to plug
- *
- * Description:
- *   Like @blk_plug_device(), but grabs the queue lock and disables
- *   interrupts.
- **/
-void blk_plug_device_unlocked(struct request_queue *q)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	blk_plug_device(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-}
-EXPORT_SYMBOL(blk_plug_device_unlocked);
-
-/*
- * remove the queue from the plugged list, if present. called with
- * queue lock held and interrupts disabled.
- */
-int blk_remove_plug(struct request_queue *q)
-{
-	WARN_ON(!irqs_disabled());
-
-	if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q))
-		return 0;
-
-	del_timer(&q->unplug_timer);
-	return 1;
+	blk_flush_plug(current);
 }
-EXPORT_SYMBOL(blk_remove_plug);
 
-/*
- * remove the plug and let it rip..
- */
-void __generic_unplug_device(struct request_queue *q)
+static void blk_delay_work(struct work_struct *work)
 {
-	if (unlikely(blk_queue_stopped(q)))
-		return;
-	if (!blk_remove_plug(q) && !blk_queue_nonrot(q))
-		return;
+	struct request_queue *q;
 
-	q->request_fn(q);
+	q = container_of(work, struct request_queue, delay_work.work);
+	spin_lock_irq(q->queue_lock);
+	__blk_run_queue(q, false);
+	spin_unlock_irq(q->queue_lock);
 }
 
 /**
- * generic_unplug_device - fire a request queue
+ * blk_delay_queue - restart queueing after defined interval
  * @q:    The &struct request_queue in question
+ * @msecs:	Delay in msecs
  *
  * Description:
- *   Linux uses plugging to build bigger requests queues before letting
- *   the device have at them. If a queue is plugged, the I/O scheduler
- *   is still adding and merging requests on the queue. Once the queue
- *   gets unplugged, the request_fn defined for the queue is invoked and
- *   transfers started.
- **/
-void generic_unplug_device(struct request_queue *q)
-{
-	if (blk_queue_plugged(q)) {
-		spin_lock_irq(q->queue_lock);
-		__generic_unplug_device(q);
-		spin_unlock_irq(q->queue_lock);
-	}
-}
-EXPORT_SYMBOL(generic_unplug_device);
-
-static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
-				   struct page *page)
-{
-	struct request_queue *q = bdi->unplug_io_data;
-
-	blk_unplug(q);
-}
-
-void blk_unplug_work(struct work_struct *work)
-{
-	struct request_queue *q =
-		container_of(work, struct request_queue, unplug_work);
-
-	trace_block_unplug_io(q);
-	q->unplug_fn(q);
-}
-
-void blk_unplug_timeout(unsigned long data)
-{
-	struct request_queue *q = (struct request_queue *)data;
-
-	trace_block_unplug_timer(q);
-	kblockd_schedule_work(q, &q->unplug_work);
-}
-
-void blk_unplug(struct request_queue *q)
+ *   Sometimes queueing needs to be postponed for a little while, to allow
+ *   resources to come back. This function will make sure that queueing is
+ *   restarted around the specified time.
+ */
+void blk_delay_queue(struct request_queue *q, unsigned long msecs)
 {
-	/*
-	 * devices don't necessarily have an ->unplug_fn defined
-	 */
-	if (q->unplug_fn) {
-		trace_block_unplug_io(q);
-		q->unplug_fn(q);
-	}
+	schedule_delayed_work(&q->delay_work, msecs_to_jiffies(msecs));
 }
-EXPORT_SYMBOL(blk_unplug);
+EXPORT_SYMBOL(blk_delay_queue);
 
 /**
  * blk_start_queue - restart a previously stopped queue
@@ -372,7 +271,7 @@ EXPORT_SYMBOL(blk_start_queue);
  **/
 void blk_stop_queue(struct request_queue *q)
 {
-	blk_remove_plug(q);
+	cancel_delayed_work(&q->delay_work);
 	queue_flag_set(QUEUE_FLAG_STOPPED, q);
 }
 EXPORT_SYMBOL(blk_stop_queue);
@@ -390,13 +289,16 @@ EXPORT_SYMBOL(blk_stop_queue);
  *     that its ->make_request_fn will not re-add plugging prior to calling
  *     this function.
  *
+ *     This function does not cancel any asynchronous activity arising
+ *     out of elevator or throttling code. That would require elevator_exit()
+ *     and blk_throtl_exit() to be called with queue lock initialized.
+ *
  */
 void blk_sync_queue(struct request_queue *q)
 {
-	del_timer_sync(&q->unplug_timer);
 	del_timer_sync(&q->timeout);
-	cancel_work_sync(&q->unplug_work);
-	throtl_shutdown_timer_wq(q);
+	cancel_delayed_work_sync(&q->delay_work);
+	queue_sync_plugs(q);
 }
 EXPORT_SYMBOL(blk_sync_queue);
 
@@ -412,14 +314,9 @@ EXPORT_SYMBOL(blk_sync_queue);
  */
 void __blk_run_queue(struct request_queue *q, bool force_kblockd)
 {
-	blk_remove_plug(q);
-
 	if (unlikely(blk_queue_stopped(q)))
 		return;
 
-	if (elv_queue_empty(q))
-		return;
-
 	/*
 	 * Only recurse once to avoid overrunning the stack, let the unplug
 	 * handling reinvoke the handler shortly if we already got there.
@@ -427,10 +324,8 @@ void __blk_run_queue(struct request_queue *q, bool force_kblockd)
 	if (!force_kblockd && !queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
 		q->request_fn(q);
 		queue_flag_clear(QUEUE_FLAG_REENTER, q);
-	} else {
-		queue_flag_set(QUEUE_FLAG_PLUGGED, q);
-		kblockd_schedule_work(q, &q->unplug_work);
-	}
+	} else
+		queue_delayed_work(kblockd_workqueue, &q->delay_work, 0);
 }
 EXPORT_SYMBOL(__blk_run_queue);
 
@@ -457,6 +352,11 @@ void blk_put_queue(struct request_queue *q)
 	kobject_put(&q->kobj);
 }
 
+/*
+ * Note: If a driver supplied the queue lock, it should not zap that lock
+ * unexpectedly as some queue cleanup components like elevator_exit() and
+ * blk_throtl_exit() need queue lock.
+ */
 void blk_cleanup_queue(struct request_queue *q)
 {
 	/*
@@ -475,6 +375,8 @@ void blk_cleanup_queue(struct request_queue *q)
 	if (q->elevator)
 		elevator_exit(q->elevator);
 
+	blk_throtl_exit(q);
+
 	blk_put_queue(q);
 }
 EXPORT_SYMBOL(blk_cleanup_queue);
@@ -517,8 +419,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	if (!q)
 		return NULL;
 
-	q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
-	q->backing_dev_info.unplug_io_data = q;
 	q->backing_dev_info.ra_pages =
 			(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 	q->backing_dev_info.state = 0;
@@ -538,17 +438,24 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 
 	setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
 		    laptop_mode_timer_fn, (unsigned long) q);
-	init_timer(&q->unplug_timer);
 	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
 	INIT_LIST_HEAD(&q->timeout_list);
-	INIT_LIST_HEAD(&q->pending_flushes);
-	INIT_WORK(&q->unplug_work, blk_unplug_work);
+	INIT_LIST_HEAD(&q->flush_queue[0]);
+	INIT_LIST_HEAD(&q->flush_queue[1]);
+	INIT_LIST_HEAD(&q->flush_data_in_flight);
+	INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);
 
 	kobject_init(&q->kobj, &blk_queue_ktype);
 
 	mutex_init(&q->sysfs_lock);
 	spin_lock_init(&q->__queue_lock);
 
+	/*
+	 * By default initialize queue_lock to internal lock and driver can
+	 * override it later if need be.
+	 */
+	q->queue_lock = &q->__queue_lock;
+
 	return q;
 }
 EXPORT_SYMBOL(blk_alloc_queue_node);
@@ -631,9 +538,11 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
 	q->request_fn		= rfn;
 	q->prep_rq_fn		= NULL;
 	q->unprep_rq_fn		= NULL;
-	q->unplug_fn		= generic_unplug_device;
 	q->queue_flags		= QUEUE_FLAG_DEFAULT;
-	q->queue_lock		= lock;
+
+	/* Override internal queue lock with supplied lock pointer */
+	if (lock)
+		q->queue_lock		= lock;
 
 	/*
 	 * This also sets hw/phys segments, boundary and size
@@ -666,6 +575,8 @@ int blk_get_queue(struct request_queue *q)
 
 static inline void blk_free_request(struct request_queue *q, struct request *rq)
 {
+	BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
+
 	if (rq->cmd_flags & REQ_ELVPRIV)
 		elv_put_request(q, rq);
 	mempool_free(rq, q->rq.rq_pool);
@@ -762,6 +673,25 @@ static void freed_request(struct request_queue *q, int sync, int priv)
 }
 
 /*
+ * Determine if elevator data should be initialized when allocating the
+ * request associated with @bio.
+ */
+static bool blk_rq_should_init_elevator(struct bio *bio)
+{
+	if (!bio)
+		return true;
+
+	/*
+	 * Flush requests do not use the elevator so skip initialization.
+	 * This allows a request to share the flush and elevator data.
+	 */
+	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA))
+		return false;
+
+	return true;
+}
+
+/*
  * Get a free request, queue_lock must be held.
  * Returns NULL on failure, with queue_lock held.
  * Returns !NULL on success, with queue_lock *not held*.
@@ -773,7 +703,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 	struct request_list *rl = &q->rq;
 	struct io_context *ioc = NULL;
 	const bool is_sync = rw_is_sync(rw_flags) != 0;
-	int may_queue, priv;
+	int may_queue, priv = 0;
 
 	may_queue = elv_may_queue(q, rw_flags);
 	if (may_queue == ELV_MQUEUE_NO)
@@ -817,9 +747,11 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 	rl->count[is_sync]++;
 	rl->starved[is_sync] = 0;
 
-	priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
-	if (priv)
-		rl->elvpriv++;
+	if (blk_rq_should_init_elevator(bio)) {
+		priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
+		if (priv)
+			rl->elvpriv++;
+	}
 
 	if (blk_queue_io_stat(q))
 		rw_flags |= REQ_IO_STAT;
@@ -866,8 +798,8 @@ out:
 }
 
 /*
- * No available requests for this queue, unplug the device and wait for some
- * requests to become available.
+ * No available requests for this queue, wait for some requests to become
+ * available.
  *
  * Called with q->queue_lock held, and returns with it unlocked.
  */
@@ -888,7 +820,6 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
 
 		trace_block_sleeprq(q, bio, rw_flags & 1);
 
-		__generic_unplug_device(q);
 		spin_unlock_irq(q->queue_lock);
 		io_schedule();
 
@@ -1010,6 +941,13 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
 }
 EXPORT_SYMBOL(blk_requeue_request);
 
+static void add_acct_request(struct request_queue *q, struct request *rq,
+			     int where)
+{
+	drive_stat_acct(rq, 1);
+	__elv_add_request(q, rq, where);
+}
+
 /**
  * blk_insert_request - insert a special request into a request queue
  * @q:	request queue where request should be inserted
@@ -1052,8 +990,7 @@ void blk_insert_request(struct request_queue *q, struct request *rq,
 	if (blk_rq_tagged(rq))
 		blk_queue_end_tag(q, rq);
 
-	drive_stat_acct(rq, 1);
-	__elv_add_request(q, rq, where, 0);
+	add_acct_request(q, rq, where);
 	__blk_run_queue(q, false);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
@@ -1174,6 +1111,113 @@ void blk_add_request_payload(struct request *rq, struct page *page,
 }
 EXPORT_SYMBOL_GPL(blk_add_request_payload);
 
+static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
+				   struct bio *bio)
+{
+	const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
+
+	/*
+	 * Debug stuff, kill later
+	 */
+	if (!rq_mergeable(req)) {
+		blk_dump_rq_flags(req, "back");
+		return false;
+	}
+
+	if (!ll_back_merge_fn(q, req, bio))
+		return false;
+
+	trace_block_bio_backmerge(q, bio);
+
+	if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
+		blk_rq_set_mixed_merge(req);
+
+	req->biotail->bi_next = bio;
+	req->biotail = bio;
+	req->__data_len += bio->bi_size;
+	req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
+
+	drive_stat_acct(req, 0);
+	return true;
+}
+
+static bool bio_attempt_front_merge(struct request_queue *q,
+				    struct request *req, struct bio *bio)
+{
+	const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
+	sector_t sector;
+
+	/*
+	 * Debug stuff, kill later
+	 */
+	if (!rq_mergeable(req)) {
+		blk_dump_rq_flags(req, "front");
+		return false;
+	}
+
+	if (!ll_front_merge_fn(q, req, bio))
+		return false;
+
+	trace_block_bio_frontmerge(q, bio);
+
+	if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
+		blk_rq_set_mixed_merge(req);
+
+	sector = bio->bi_sector;
+
+	bio->bi_next = req->bio;
+	req->bio = bio;
+
+	/*
+	 * may not be valid. if the low level driver said
+	 * it didn't need a bounce buffer then it better
+	 * not touch req->buffer either...
+	 */
+	req->buffer = bio_data(bio);
+	req->__sector = bio->bi_sector;
+	req->__data_len += bio->bi_size;
+	req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
+
+	drive_stat_acct(req, 0);
+	return true;
+}
+
+/*
+ * Attempts to merge with the plugged list in the current process. Returns
+ * true if merge was successful, otherwise false.
+ */
+static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q,
+			       struct bio *bio)
+{
+	struct blk_plug *plug;
+	struct request *rq;
+	bool ret = false;
+
+	plug = tsk->plug;
+	if (!plug)
+		goto out;
+
+	list_for_each_entry_reverse(rq, &plug->list, queuelist) {
+		int el_ret;
+
+		if (rq->q != q)
+			continue;
+
+		el_ret = elv_try_merge(rq, bio);
+		if (el_ret == ELEVATOR_BACK_MERGE) {
+			ret = bio_attempt_back_merge(q, rq, bio);
+			if (ret)
+				break;
+		} else if (el_ret == ELEVATOR_FRONT_MERGE) {
+			ret = bio_attempt_front_merge(q, rq, bio);
+			if (ret)
+				break;
+		}
+	}
+out:
+	return ret;
+}
+
 void init_request_from_bio(struct request *req, struct bio *bio)
 {
 	req->cpu = bio->bi_comp_cpu;
@@ -1189,26 +1233,12 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	blk_rq_bio_prep(req->q, req, bio);
 }
 
-/*
- * Only disabling plugging for non-rotational devices if it does tagging
- * as well, otherwise we do need the proper merging
- */
-static inline bool queue_should_plug(struct request_queue *q)
-{
-	return !(blk_queue_nonrot(q) && blk_queue_tagged(q));
-}
-
 static int __make_request(struct request_queue *q, struct bio *bio)
 {
-	struct request *req;
-	int el_ret;
-	unsigned int bytes = bio->bi_size;
-	const unsigned short prio = bio_prio(bio);
 	const bool sync = !!(bio->bi_rw & REQ_SYNC);
-	const bool unplug = !!(bio->bi_rw & REQ_UNPLUG);
-	const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK;
-	int where = ELEVATOR_INSERT_SORT;
-	int rw_flags;
+	struct blk_plug *plug;
+	int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
+	struct request *req;
 
 	/*
 	 * low level driver can indicate that it wants pages above a
@@ -1217,78 +1247,36 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 	 */
 	blk_queue_bounce(q, &bio);
 
-	spin_lock_irq(q->queue_lock);
-
 	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
-		where = ELEVATOR_INSERT_FRONT;
+		spin_lock_irq(q->queue_lock);
+		where = ELEVATOR_INSERT_FLUSH;
 		goto get_rq;
 	}
 
-	if (elv_queue_empty(q))
-		goto get_rq;
-
-	el_ret = elv_merge(q, &req, bio);
-	switch (el_ret) {
-	case ELEVATOR_BACK_MERGE:
-		BUG_ON(!rq_mergeable(req));
-
-		if (!ll_back_merge_fn(q, req, bio))
-			break;
-
-		trace_block_bio_backmerge(q, bio);
-
-		if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
-			blk_rq_set_mixed_merge(req);
-
-		req->biotail->bi_next = bio;
-		req->biotail = bio;
-		req->__data_len += bytes;
-		req->ioprio = ioprio_best(req->ioprio, prio);
-		if (!blk_rq_cpu_valid(req))
-			req->cpu = bio->bi_comp_cpu;
-		drive_stat_acct(req, 0);
-		elv_bio_merged(q, req, bio);
-		if (!attempt_back_merge(q, req))
-			elv_merged_request(q, req, el_ret);
+	/*
+	 * Check if we can merge with the plugged list before grabbing
+	 * any locks.
+	 */
+	if (attempt_plug_merge(current, q, bio))
 		goto out;
 
-	case ELEVATOR_FRONT_MERGE:
-		BUG_ON(!rq_mergeable(req));
-
-		if (!ll_front_merge_fn(q, req, bio))
-			break;
-
-		trace_block_bio_frontmerge(q, bio);
+	spin_lock_irq(q->queue_lock);
 
-		if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) {
-			blk_rq_set_mixed_merge(req);
-			req->cmd_flags &= ~REQ_FAILFAST_MASK;
-			req->cmd_flags |= ff;
+	el_ret = elv_merge(q, &req, bio);
+	if (el_ret == ELEVATOR_BACK_MERGE) {
+		BUG_ON(req->cmd_flags & REQ_ON_PLUG);
+		if (bio_attempt_back_merge(q, req, bio)) {
+			if (!attempt_back_merge(q, req))
+				elv_merged_request(q, req, el_ret);
+			goto out_unlock;
+		}
+	} else if (el_ret == ELEVATOR_FRONT_MERGE) {
+		BUG_ON(req->cmd_flags & REQ_ON_PLUG);
+		if (bio_attempt_front_merge(q, req, bio)) {
+			if (!attempt_front_merge(q, req))
+				elv_merged_request(q, req, el_ret);
+			goto out_unlock;
 		}
-
-		bio->bi_next = req->bio;
-		req->bio = bio;
-
-		/*
-		 * may not be valid. if the low level driver said
-		 * it didn't need a bounce buffer then it better
-		 * not touch req->buffer either...
-		 */
-		req->buffer = bio_data(bio);
-		req->__sector = bio->bi_sector;
-		req->__data_len += bytes;
-		req->ioprio = ioprio_best(req->ioprio, prio);
-		if (!blk_rq_cpu_valid(req))
-			req->cpu = bio->bi_comp_cpu;
-		drive_stat_acct(req, 0);
-		elv_bio_merged(q, req, bio);
-		if (!attempt_front_merge(q, req))
-			elv_merged_request(q, req, el_ret);
-		goto out;
-
-	/* ELV_NO_MERGE: elevator says don't/can't merge. */
-	default:
-		;
 	}
 
 get_rq:
@@ -1315,20 +1303,35 @@ get_rq:
 	 */
 	init_request_from_bio(req, bio);
 
-	spin_lock_irq(q->queue_lock);
 	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
-	    bio_flagged(bio, BIO_CPU_AFFINE))
-		req->cpu = blk_cpu_to_group(smp_processor_id());
-	if (queue_should_plug(q) && elv_queue_empty(q))
-		blk_plug_device(q);
-
-	/* insert the request into the elevator */
-	drive_stat_acct(req, 1);
-	__elv_add_request(q, req, where, 0);
+	    bio_flagged(bio, BIO_CPU_AFFINE)) {
+		req->cpu = blk_cpu_to_group(get_cpu());
+		put_cpu();
+	}
+
+	plug = current->plug;
+	if (plug) {
+		if (!plug->should_sort && !list_empty(&plug->list)) {
+			struct request *__rq;
+
+			__rq = list_entry_rq(plug->list.prev);
+			if (__rq->q != q)
+				plug->should_sort = 1;
+		}
+		/*
+		 * Debug flag, kill later
+		 */
+		req->cmd_flags |= REQ_ON_PLUG;
+		list_add_tail(&req->queuelist, &plug->list);
+		drive_stat_acct(req, 1);
+	} else {
+		spin_lock_irq(q->queue_lock);
+		add_acct_request(q, req, where);
+		__blk_run_queue(q, false);
+out_unlock:
+		spin_unlock_irq(q->queue_lock);
+	}
 out:
-	if (unplug || !queue_should_plug(q))
-		__generic_unplug_device(q);
-	spin_unlock_irq(q->queue_lock);
 	return 0;
 }
 
@@ -1731,9 +1734,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 	 */
 	BUG_ON(blk_queued_rq(rq));
 
-	drive_stat_acct(rq, 1);
-	__elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
-
+	add_acct_request(q, rq, ELEVATOR_INSERT_BACK);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 
 	return 0;
@@ -1805,7 +1806,7 @@ static void blk_account_io_done(struct request *req)
 	 * normal IO on queueing nor completion. Accounting the
 	 * containing request is enough.
 	 */
-	if (blk_do_io_stat(req) && req != &req->q->flush_rq) {
+	if (blk_do_io_stat(req) && !(req->cmd_flags & REQ_FLUSH_SEQ)) {
 		unsigned long duration = jiffies - req->start_time;
 		const int rw = rq_data_dir(req);
 		struct hd_struct *part;
@@ -2628,6 +2629,113 @@ int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
 }
 EXPORT_SYMBOL(kblockd_schedule_work);
 
+int kblockd_schedule_delayed_work(struct request_queue *q,
+			struct delayed_work *dwork, unsigned long delay)
+{
+	return queue_delayed_work(kblockd_workqueue, dwork, delay);
+}
+EXPORT_SYMBOL(kblockd_schedule_delayed_work);
+
+#define PLUG_MAGIC	0x91827364
+
+void blk_start_plug(struct blk_plug *plug)
+{
+	struct task_struct *tsk = current;
+
+	plug->magic = PLUG_MAGIC;
+	INIT_LIST_HEAD(&plug->list);
+	plug->should_sort = 0;
+
+	/*
+	 * If this is a nested plug, don't actually assign it. It will be
+	 * flushed on its own.
+	 */
+	if (!tsk->plug) {
+		/*
+		 * Store ordering should not be needed here, since a potential
+		 * preempt will imply a full memory barrier
+		 */
+		tsk->plug = plug;
+	}
+}
+EXPORT_SYMBOL(blk_start_plug);
+
+static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct request *rqa = container_of(a, struct request, queuelist);
+	struct request *rqb = container_of(b, struct request, queuelist);
+
+	return !(rqa->q == rqb->q);
+}
+
+static void flush_plug_list(struct blk_plug *plug)
+{
+	struct request_queue *q;
+	unsigned long flags;
+	struct request *rq;
+
+	BUG_ON(plug->magic != PLUG_MAGIC);
+
+	if (list_empty(&plug->list))
+		return;
+
+	if (plug->should_sort)
+		list_sort(NULL, &plug->list, plug_rq_cmp);
+
+	q = NULL;
+	local_irq_save(flags);
+	while (!list_empty(&plug->list)) {
+		rq = list_entry_rq(plug->list.next);
+		list_del_init(&rq->queuelist);
+		BUG_ON(!(rq->cmd_flags & REQ_ON_PLUG));
+		BUG_ON(!rq->q);
+		if (rq->q != q) {
+			if (q) {
+				__blk_run_queue(q, false);
+				spin_unlock(q->queue_lock);
+			}
+			q = rq->q;
+			spin_lock(q->queue_lock);
+		}
+		rq->cmd_flags &= ~REQ_ON_PLUG;
+
+		/*
+		 * rq is already accounted, so use raw insert
+		 */
+		__elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);
+	}
+
+	if (q) {
+		__blk_run_queue(q, false);
+		spin_unlock(q->queue_lock);
+	}
+
+	BUG_ON(!list_empty(&plug->list));
+	local_irq_restore(flags);
+}
+
+static void __blk_finish_plug(struct task_struct *tsk, struct blk_plug *plug)
+{
+	flush_plug_list(plug);
+
+	if (plug == tsk->plug)
+		tsk->plug = NULL;
+}
+
+void blk_finish_plug(struct blk_plug *plug)
+{
+	if (plug)
+		__blk_finish_plug(current, plug);
+}
+EXPORT_SYMBOL(blk_finish_plug);
+
+void __blk_flush_plug(struct task_struct *tsk, struct blk_plug *plug)
+{
+	__blk_finish_plug(tsk, plug);
+	tsk->plug = plug;
+}
+EXPORT_SYMBOL(__blk_flush_plug);
+
 int __init blk_dev_init(void)
 {
 	BUILD_BUG_ON(__REQ_NR_BITS > 8 *
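The blk_start_plug()/blk_finish_plug() pair exported above replaces per-queue plugging with an on-stack plug owned by the submitting task. A minimal kernel-side sketch of how a submitter would batch bios with it follows; submit_batch and the bio array are illustrative, and the bio setup itself is elided:

/*
 * Sketch of the new on-stack plugging API (kernel-internal code, not a
 * standalone program). Requests queued between blk_start_plug() and
 * blk_finish_plug() collect on current->plug and are dispatched in one
 * batch, sorted per queue by flush_plug_list().
 */
#include <linux/fs.h>
#include <linux/bio.h>
#include <linux/blkdev.h>

static void submit_batch(struct bio **bios, int nr)
{
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);		/* current->plug = &plug (unless nested) */
	for (i = 0; i < nr; i++)
		submit_bio(READ, bios[i]);	/* requests gather on the plug */
	blk_finish_plug(&plug);		/* sort, insert and run the queues */
}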
diff --git a/block/blk-exec.c b/block/blk-exec.c
index cf1456a02acd..7482b7fa863b 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -54,8 +54,8 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 	rq->end_io = done;
 	WARN_ON(irqs_disabled());
 	spin_lock_irq(q->queue_lock);
-	__elv_add_request(q, rq, where, 1);
-	__generic_unplug_device(q);
+	__elv_add_request(q, rq, where);
+	__blk_run_queue(q, false);
 	/* the queue is stopped so it won't be plugged+unplugged */
 	if (rq->cmd_type == REQ_TYPE_PM_RESUME)
 		q->request_fn(q);
diff --git a/block/blk-flush.c b/block/blk-flush.c
index b27d0208611b..93d5fd8e51eb 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -1,6 +1,69 @@
 /*
  * Functions to sequence FLUSH and FUA writes.
+ *
+ * Copyright (C) 2011		Max Planck Institute for Gravitational Physics
+ * Copyright (C) 2011		Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ *
+ * REQ_{FLUSH|FUA} requests are decomposed to sequences consisted of three
+ * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request
+ * properties and hardware capability.
+ *
+ * If a request doesn't have data, only REQ_FLUSH makes sense, which
+ * indicates a simple flush request. If there is data, REQ_FLUSH indicates
+ * that the device cache should be flushed before the data is executed, and
+ * REQ_FUA means that the data must be on non-volatile media on request
+ * completion.
+ *
+ * If the device doesn't have writeback cache, FLUSH and FUA don't make any
+ * difference. The requests are either completed immediately if there's no
+ * data or executed as normal requests otherwise.
+ *
+ * If the device has writeback cache and supports FUA, REQ_FLUSH is
+ * translated to PREFLUSH but REQ_FUA is passed down directly with DATA.
+ *
+ * If the device has writeback cache and doesn't support FUA, REQ_FLUSH is
+ * translated to PREFLUSH and REQ_FUA to POSTFLUSH.
+ *
+ * The actual execution of flush is double buffered. Whenever a request
+ * needs to execute PRE or POSTFLUSH, it queues at
+ * q->flush_queue[q->flush_pending_idx]. Once certain criteria are met, a
+ * flush is issued and the pending_idx is toggled. When the flush
+ * completes, all the requests which were pending are proceeded to the next
+ * step. This allows arbitrary merging of different types of FLUSH/FUA
+ * requests.
+ *
+ * Currently, the following conditions are used to determine when to issue
+ * flush.
+ *
+ * C1. At any given time, only one flush shall be in progress. This makes
+ *     double buffering sufficient.
+ *
+ * C2. Flush is deferred if any request is executing DATA of its sequence.
+ *     This avoids issuing separate POSTFLUSHes for requests which shared
+ *     PREFLUSH.
+ *
+ * C3. The second condition is ignored if there is a request which has
+ *     waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid
+ *     starvation in the unlikely case where there are continuous stream of
+ *     FUA (without FLUSH) requests.
+ *
+ * For devices which support FUA, it isn't clear whether C2 (and thus C3)
+ * is beneficial.
+ *
+ * Note that a sequenced FLUSH/FUA request with DATA is completed twice.
+ * Once while executing DATA and again after the whole sequence is
+ * complete. The first completion updates the contained bio but doesn't
+ * finish it so that the bio submitter is notified only after the whole
+ * sequence is complete. This is implemented by testing REQ_FLUSH_SEQ in
+ * req_bio_endio().
+ *
+ * The above peculiarity requires that each FLUSH/FUA request has only one
+ * bio attached to it, which is guaranteed as they aren't allowed to be
+ * merged in the usual way.
  */
+
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/bio.h>
@@ -11,58 +74,142 @@
 
 /* FLUSH/FUA sequences */
 enum {
-	QUEUE_FSEQ_STARTED	= (1 << 0), /* flushing in progress */
-	QUEUE_FSEQ_PREFLUSH	= (1 << 1), /* pre-flushing in progress */
-	QUEUE_FSEQ_DATA		= (1 << 2), /* data write in progress */
-	QUEUE_FSEQ_POSTFLUSH	= (1 << 3), /* post-flushing in progress */
-	QUEUE_FSEQ_DONE		= (1 << 4),
+	REQ_FSEQ_PREFLUSH	= (1 << 0), /* pre-flushing in progress */
+	REQ_FSEQ_DATA		= (1 << 1), /* data write in progress */
+	REQ_FSEQ_POSTFLUSH	= (1 << 2), /* post-flushing in progress */
+	REQ_FSEQ_DONE		= (1 << 3),
+
+	REQ_FSEQ_ACTIONS	= REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA |
+				  REQ_FSEQ_POSTFLUSH,
+
+	/*
+	 * If flush has been pending longer than the following timeout,
+	 * it's issued even if flush_data requests are still in flight.
+	 */
+	FLUSH_PENDING_TIMEOUT	= 5 * HZ,
 };
 
-static struct request *queue_next_fseq(struct request_queue *q);
+static bool blk_kick_flush(struct request_queue *q);
 
-unsigned blk_flush_cur_seq(struct request_queue *q)
+static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq)
 {
-	if (!q->flush_seq)
-		return 0;
-	return 1 << ffz(q->flush_seq);
+	unsigned int policy = 0;
+
+	if (fflags & REQ_FLUSH) {
+		if (rq->cmd_flags & REQ_FLUSH)
+			policy |= REQ_FSEQ_PREFLUSH;
+		if (blk_rq_sectors(rq))
+			policy |= REQ_FSEQ_DATA;
+		if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA))
+			policy |= REQ_FSEQ_POSTFLUSH;
+	}
+	return policy;
 }
 
-static struct request *blk_flush_complete_seq(struct request_queue *q,
-					      unsigned seq, int error)
+static unsigned int blk_flush_cur_seq(struct request *rq)
 {
-	struct request *next_rq = NULL;
-
-	if (error && !q->flush_err)
-		q->flush_err = error;
-
-	BUG_ON(q->flush_seq & seq);
-	q->flush_seq |= seq;
-
-	if (blk_flush_cur_seq(q) != QUEUE_FSEQ_DONE) {
-		/* not complete yet, queue the next flush sequence */
-		next_rq = queue_next_fseq(q);
-	} else {
-		/* complete this flush request */
-		__blk_end_request_all(q->orig_flush_rq, q->flush_err);
-		q->orig_flush_rq = NULL;
-		q->flush_seq = 0;
-
-		/* dispatch the next flush if there's one */
-		if (!list_empty(&q->pending_flushes)) {
-			next_rq = list_entry_rq(q->pending_flushes.next);
-			list_move(&next_rq->queuelist, &q->queue_head);
-		}
-	}
+	return 1 << ffz(rq->flush.seq);
+}
+
+static void blk_flush_restore_request(struct request *rq)
+{
+	/*
+	 * After flush data completion, @rq->bio is %NULL but we need to
+	 * complete the bio again. @rq->biotail is guaranteed to equal the
+	 * original @rq->bio. Restore it.
+	 */
+	rq->bio = rq->biotail;
+
+	/* make @rq a normal request */
+	rq->cmd_flags &= ~REQ_FLUSH_SEQ;
+	rq->end_io = NULL;
+}
+
+/**
+ * blk_flush_complete_seq - complete flush sequence
+ * @rq: FLUSH/FUA request being sequenced
+ * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero)
+ * @error: whether an error occurred
+ *
+ * @rq just completed @seq part of its flush sequence, record the
+ * completion and trigger the next step.
+ *
+ * CONTEXT:
+ * spin_lock_irq(q->queue_lock)
+ *
+ * RETURNS:
+ * %true if requests were added to the dispatch queue, %false otherwise.
+ */
+static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
+				   int error)
+{
+	struct request_queue *q = rq->q;
+	struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
+	bool queued = false;
+
+	BUG_ON(rq->flush.seq & seq);
+	rq->flush.seq |= seq;
+
+	if (likely(!error))
+		seq = blk_flush_cur_seq(rq);
+	else
+		seq = REQ_FSEQ_DONE;
+
+	switch (seq) {
+	case REQ_FSEQ_PREFLUSH:
+	case REQ_FSEQ_POSTFLUSH:
+		/* queue for flush */
162 if (list_empty(pending))
163 q->flush_pending_since = jiffies;
164 list_move_tail(&rq->flush.list, pending);
165 break;
166
167 case REQ_FSEQ_DATA:
168 list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
169 list_add(&rq->queuelist, &q->queue_head);
170 queued = true;
171 break;
172
173 case REQ_FSEQ_DONE:
174 /*
175 * @rq was previously adjusted by blk_insert_flush() for
176 * flush sequencing and may already have gone through the
177 * flush data request completion path. Restore @rq for
178 * normal completion and end it.
179 */
180 BUG_ON(!list_empty(&rq->queuelist));
181 list_del_init(&rq->flush.list);
182 blk_flush_restore_request(rq);
183 __blk_end_request_all(rq, error);
184 break;
185
186 default:
187 BUG();
55 } 188 }
56 return next_rq; 189
190 return blk_kick_flush(q) | queued;
57} 191}
58 192
59static void blk_flush_complete_seq_end_io(struct request_queue *q, 193static void flush_end_io(struct request *flush_rq, int error)
60 unsigned seq, int error)
61{ 194{
62 bool was_empty = elv_queue_empty(q); 195 struct request_queue *q = flush_rq->q;
63 struct request *next_rq; 196 struct list_head *running = &q->flush_queue[q->flush_running_idx];
197 bool queued = false;
198 struct request *rq, *n;
64 199
65 next_rq = blk_flush_complete_seq(q, seq, error); 200 BUG_ON(q->flush_pending_idx == q->flush_running_idx);
201
202 /* account completion of the flush request */
203 q->flush_running_idx ^= 1;
204 elv_completed_request(q, flush_rq);
205
206 /* and push the waiting requests to the next stage */
207 list_for_each_entry_safe(rq, n, running, flush.list) {
208 unsigned int seq = blk_flush_cur_seq(rq);
209
210 BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
211 queued |= blk_flush_complete_seq(rq, seq, error);
212 }
66 213
67 /* 214 /*
68 * Moving a request silently to empty queue_head may stall the 215 * Moving a request silently to empty queue_head may stall the
@@ -70,127 +217,153 @@ static void blk_flush_complete_seq_end_io(struct request_queue *q,
70 * from request completion path and calling directly into 217 * from request completion path and calling directly into
71 * request_fn may confuse the driver. Always use kblockd. 218 * request_fn may confuse the driver. Always use kblockd.
72 */ 219 */
73 if (was_empty && next_rq) 220 if (queued)
74 __blk_run_queue(q, true); 221 __blk_run_queue(q, true);
75} 222}
76 223
77static void pre_flush_end_io(struct request *rq, int error) 224/**
225 * blk_kick_flush - consider issuing flush request
226 * @q: request_queue being kicked
227 *
228 * Flush-related state of @q has changed; consider issuing a flush request.
229 * Please read the comment at the top of this file for more info.
230 *
231 * CONTEXT:
232 * spin_lock_irq(q->queue_lock)
233 *
234 * RETURNS:
235 * %true if flush was issued, %false otherwise.
236 */
237static bool blk_kick_flush(struct request_queue *q)
78{ 238{
79 elv_completed_request(rq->q, rq); 239 struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
80 blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_PREFLUSH, error); 240 struct request *first_rq =
241 list_first_entry(pending, struct request, flush.list);
242
243 /* C1 described at the top of this file */
244 if (q->flush_pending_idx != q->flush_running_idx || list_empty(pending))
245 return false;
246
247 /* C2 and C3 */
248 if (!list_empty(&q->flush_data_in_flight) &&
249 time_before(jiffies,
250 q->flush_pending_since + FLUSH_PENDING_TIMEOUT))
251 return false;
252
253 /*
254 * Issue flush and toggle pending_idx. This makes pending_idx
255 * different from running_idx, which means flush is in flight.
256 */
257 blk_rq_init(q, &q->flush_rq);
258 q->flush_rq.cmd_type = REQ_TYPE_FS;
259 q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
260 q->flush_rq.rq_disk = first_rq->rq_disk;
261 q->flush_rq.end_io = flush_end_io;
262
263 q->flush_pending_idx ^= 1;
264 elv_insert(q, &q->flush_rq, ELEVATOR_INSERT_REQUEUE);
265 return true;
81} 266}
82 267
83static void flush_data_end_io(struct request *rq, int error) 268static void flush_data_end_io(struct request *rq, int error)
84{ 269{
85 elv_completed_request(rq->q, rq); 270 struct request_queue *q = rq->q;
86 blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_DATA, error);
87}
88 271
89static void post_flush_end_io(struct request *rq, int error) 272 /*
90{ 273 * After populating an empty queue, kick it to avoid stall. Read
91 elv_completed_request(rq->q, rq); 274 * the comment in flush_end_io().
92 blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_POSTFLUSH, error); 275 */
276 if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error))
277 __blk_run_queue(q, true);
93} 278}
94 279
95static void init_flush_request(struct request *rq, struct gendisk *disk) 280/**
281 * blk_insert_flush - insert a new FLUSH/FUA request
282 * @rq: request to insert
283 *
284 * To be called from elv_insert() for %ELEVATOR_INSERT_FLUSH insertions.
285 * @rq is being submitted. Analyze what needs to be done and put it on the
286 * right queue.
287 *
288 * CONTEXT:
289 * spin_lock_irq(q->queue_lock)
290 */
291void blk_insert_flush(struct request *rq)
96{ 292{
97 rq->cmd_type = REQ_TYPE_FS; 293 struct request_queue *q = rq->q;
98 rq->cmd_flags = WRITE_FLUSH; 294 unsigned int fflags = q->flush_flags; /* may change, cache */
99 rq->rq_disk = disk; 295 unsigned int policy = blk_flush_policy(fflags, rq);
100}
101 296
102static struct request *queue_next_fseq(struct request_queue *q) 297 BUG_ON(rq->end_io);
103{ 298 BUG_ON(!rq->bio || rq->bio != rq->biotail);
104 struct request *orig_rq = q->orig_flush_rq;
105 struct request *rq = &q->flush_rq;
106 299
107 blk_rq_init(q, rq); 300 /*
301 * @policy now records what operations need to be done. Adjust
302 * REQ_FLUSH and FUA for the driver.
303 */
304 rq->cmd_flags &= ~REQ_FLUSH;
305 if (!(fflags & REQ_FUA))
306 rq->cmd_flags &= ~REQ_FUA;
108 307
109 switch (blk_flush_cur_seq(q)) { 308 /*
110 case QUEUE_FSEQ_PREFLUSH: 309 * If there's data but flush is not necessary, the request can be
111 init_flush_request(rq, orig_rq->rq_disk); 310 * processed directly without going through flush machinery. Queue
112 rq->end_io = pre_flush_end_io; 311 * for normal execution.
113 break; 312 */
114 case QUEUE_FSEQ_DATA: 313 if ((policy & REQ_FSEQ_DATA) &&
115 init_request_from_bio(rq, orig_rq->bio); 314 !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
116 /* 315 list_add(&rq->queuelist, &q->queue_head);
117 * orig_rq->rq_disk may be different from 316 return;
118 * bio->bi_bdev->bd_disk if orig_rq got here through
119 * remapping drivers. Make sure rq->rq_disk points
120 * to the same one as orig_rq.
121 */
122 rq->rq_disk = orig_rq->rq_disk;
123 rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA);
124 rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA);
125 rq->end_io = flush_data_end_io;
126 break;
127 case QUEUE_FSEQ_POSTFLUSH:
128 init_flush_request(rq, orig_rq->rq_disk);
129 rq->end_io = post_flush_end_io;
130 break;
131 default:
132 BUG();
133 } 317 }
134 318
135 elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE); 319 /*
136 return rq; 320 * @rq should go through flush machinery. Mark it part of flush
321 * sequence and submit for further processing.
322 */
323 memset(&rq->flush, 0, sizeof(rq->flush));
324 INIT_LIST_HEAD(&rq->flush.list);
325 rq->cmd_flags |= REQ_FLUSH_SEQ;
326 rq->end_io = flush_data_end_io;
327
328 blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
137} 329}
138 330
139struct request *blk_do_flush(struct request_queue *q, struct request *rq) 331/**
332 * blk_abort_flushes - @q is being aborted, abort flush requests
333 * @q: request_queue being aborted
334 *
335 * To be called from elv_abort_queue(). @q is being aborted. Prepare all
336 * FLUSH/FUA requests for abortion.
337 *
338 * CONTEXT:
339 * spin_lock_irq(q->queue_lock)
340 */
341void blk_abort_flushes(struct request_queue *q)
140{ 342{
141 unsigned int fflags = q->flush_flags; /* may change, cache it */ 343 struct request *rq, *n;
142 bool has_flush = fflags & REQ_FLUSH, has_fua = fflags & REQ_FUA; 344 int i;
143 bool do_preflush = has_flush && (rq->cmd_flags & REQ_FLUSH);
144 bool do_postflush = has_flush && !has_fua && (rq->cmd_flags & REQ_FUA);
145 unsigned skip = 0;
146 345
147 /* 346 /*
148 * Special case. If there's data but flush is not necessary, 347 * Requests in flight for data are already owned by the dispatch
149 * the request can be issued directly. 348 * queue or the device driver. Just restore for normal completion.
150 *
151 * Flush w/o data should be able to be issued directly too but
152 * currently some drivers assume that rq->bio contains
153 * non-zero data if it isn't NULL and empty FLUSH requests
154 * getting here usually have bio's without data.
155 */ 349 */
156 if (blk_rq_sectors(rq) && !do_preflush && !do_postflush) { 350 list_for_each_entry_safe(rq, n, &q->flush_data_in_flight, flush.list) {
157 rq->cmd_flags &= ~REQ_FLUSH; 351 list_del_init(&rq->flush.list);
158 if (!has_fua) 352 blk_flush_restore_request(rq);
159 rq->cmd_flags &= ~REQ_FUA;
160 return rq;
161 } 353 }
162 354
163 /* 355 /*
164 * Sequenced flushes can't be processed in parallel. If 356 * We need to give away requests on flush queues. Restore for
165 * another one is already in progress, queue for later 357 * normal completion and put them on the dispatch queue.
166 * processing.
167 */ 358 */
168 if (q->flush_seq) { 359 for (i = 0; i < ARRAY_SIZE(q->flush_queue); i++) {
169 list_move_tail(&rq->queuelist, &q->pending_flushes); 360 list_for_each_entry_safe(rq, n, &q->flush_queue[i],
170 return NULL; 361 flush.list) {
362 list_del_init(&rq->flush.list);
363 blk_flush_restore_request(rq);
364 list_add_tail(&rq->queuelist, &q->queue_head);
365 }
171 } 366 }
172
173 /*
174 * Start a new flush sequence
175 */
176 q->flush_err = 0;
177 q->flush_seq |= QUEUE_FSEQ_STARTED;
178
179 /* adjust FLUSH/FUA of the original request and stash it away */
180 rq->cmd_flags &= ~REQ_FLUSH;
181 if (!has_fua)
182 rq->cmd_flags &= ~REQ_FUA;
183 blk_dequeue_request(rq);
184 q->orig_flush_rq = rq;
185
186 /* skip unneded sequences and return the first one */
187 if (!do_preflush)
188 skip |= QUEUE_FSEQ_PREFLUSH;
189 if (!blk_rq_sectors(rq))
190 skip |= QUEUE_FSEQ_DATA;
191 if (!do_postflush)
192 skip |= QUEUE_FSEQ_POSTFLUSH;
193 return blk_flush_complete_seq(q, skip, 0);
194} 367}
195 368
196static void bio_end_flush(struct bio *bio, int err) 369static void bio_end_flush(struct bio *bio, int err)
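
As a reading aid for the new machinery above: blk_flush_policy() maps what the
queue supports (fflags) onto the optional PREFLUSH/DATA/POSTFLUSH steps a given
request needs. The stand-alone sketch below models that decision tree; the flag
values are illustrative stand-ins, not the kernel's real REQ_* definitions.

    #include <stdio.h>

    #define REQ_FLUSH          (1 << 0)   /* illustrative bit values */
    #define REQ_FUA            (1 << 1)

    #define REQ_FSEQ_PREFLUSH  (1 << 0)
    #define REQ_FSEQ_DATA      (1 << 1)
    #define REQ_FSEQ_POSTFLUSH (1 << 2)

    /* Same decision tree as blk_flush_policy(): fflags is what the queue
     * supports, rq_flags/sectors describe the incoming request. */
    static unsigned int flush_policy(unsigned int fflags,
                                     unsigned int rq_flags,
                                     unsigned int sectors)
    {
            unsigned int policy = 0;

            if (fflags & REQ_FLUSH) {      /* device has a writeback cache */
                    if (rq_flags & REQ_FLUSH)
                            policy |= REQ_FSEQ_PREFLUSH;
                    if (sectors)
                            policy |= REQ_FSEQ_DATA;
                    if (!(fflags & REQ_FUA) && (rq_flags & REQ_FUA))
                            policy |= REQ_FSEQ_POSTFLUSH; /* emulate FUA */
            }
            return policy;
    }

    int main(void)
    {
            /* Writeback cache, no FUA: FLUSH|FUA write needs all 3 steps. */
            unsigned int p = flush_policy(REQ_FLUSH, REQ_FLUSH | REQ_FUA, 8);

            printf("pre=%d data=%d post=%d\n", !!(p & REQ_FSEQ_PREFLUSH),
                   !!(p & REQ_FSEQ_DATA), !!(p & REQ_FSEQ_POSTFLUSH));
            return 0;
    }

On a device with a writeback cache and no FUA, a FLUSH|FUA write with data thus
takes all three steps; the double-buffered flush_queue[] above exists so that
the PREFLUSH of many such requests can be merged into a single flush.
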
diff --git a/block/blk-lib.c b/block/blk-lib.c
index bd3e8df4d5e2..25de73e4759b 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -136,8 +136,6 @@ static void bio_batch_end_io(struct bio *bio, int err)
136 * 136 *
137 * Description: 137 * Description:
138 * Generate and issue number of bios with zerofiled pages. 138 * Generate and issue number of bios with zerofiled pages.
139 * Send barrier at the beginning and at the end if requested. This guarantie
140 * correct request ordering. Empty barrier allow us to avoid post queue flush.
141 */ 139 */
142 140
143int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, 141int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
diff --git a/block/blk-merge.c b/block/blk-merge.c
index ea85e20d5e94..cfcc37cb222b 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -465,3 +465,9 @@ int attempt_front_merge(struct request_queue *q, struct request *rq)
465 465
466 return 0; 466 return 0;
467} 467}
468
469int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
470 struct request *next)
471{
472 return attempt_merge(q, rq, next);
473}
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 36c8c1f2af18..1fa769293597 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -164,25 +164,10 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
164 blk_queue_congestion_threshold(q); 164 blk_queue_congestion_threshold(q);
165 q->nr_batching = BLK_BATCH_REQ; 165 q->nr_batching = BLK_BATCH_REQ;
166 166
167 q->unplug_thresh = 4; /* hmm */
168 q->unplug_delay = msecs_to_jiffies(3); /* 3 milliseconds */
169 if (q->unplug_delay == 0)
170 q->unplug_delay = 1;
171
172 q->unplug_timer.function = blk_unplug_timeout;
173 q->unplug_timer.data = (unsigned long)q;
174
175 blk_set_default_limits(&q->limits); 167 blk_set_default_limits(&q->limits);
176 blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); 168 blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
177 169
178 /* 170 /*
179 * If the caller didn't supply a lock, fall back to our embedded
180 * per-queue locks
181 */
182 if (!q->queue_lock)
183 q->queue_lock = &q->__queue_lock;
184
185 /*
186 * by default assume old behaviour and bounce for any highmem page 171 * by default assume old behaviour and bounce for any highmem page
187 */ 172 */
188 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); 173 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 41fb69150b4d..261c75c665ae 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -471,8 +471,6 @@ static void blk_release_queue(struct kobject *kobj)
471 471
472 blk_sync_queue(q); 472 blk_sync_queue(q);
473 473
474 blk_throtl_exit(q);
475
476 if (rl->rq_pool) 474 if (rl->rq_pool)
477 mempool_destroy(rl->rq_pool); 475 mempool_destroy(rl->rq_pool);
478 476
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index e36cc10a346c..5352bdafbcf0 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -102,7 +102,7 @@ struct throtl_data
102 /* Work for dispatching throttled bios */ 102 /* Work for dispatching throttled bios */
103 struct delayed_work throtl_work; 103 struct delayed_work throtl_work;
104 104
105 atomic_t limits_changed; 105 bool limits_changed;
106}; 106};
107 107
108enum tg_state_flags { 108enum tg_state_flags {
@@ -201,6 +201,7 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
201 RB_CLEAR_NODE(&tg->rb_node); 201 RB_CLEAR_NODE(&tg->rb_node);
202 bio_list_init(&tg->bio_lists[0]); 202 bio_list_init(&tg->bio_lists[0]);
203 bio_list_init(&tg->bio_lists[1]); 203 bio_list_init(&tg->bio_lists[1]);
204 td->limits_changed = false;
204 205
205 /* 206 /*
206 * Take the initial reference that will be released on destroy 207 * Take the initial reference that will be released on destroy
@@ -737,34 +738,36 @@ static void throtl_process_limit_change(struct throtl_data *td)
737 struct throtl_grp *tg; 738 struct throtl_grp *tg;
738 struct hlist_node *pos, *n; 739 struct hlist_node *pos, *n;
739 740
740 if (!atomic_read(&td->limits_changed)) 741 if (!td->limits_changed)
741 return; 742 return;
742 743
743 throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed)); 744 xchg(&td->limits_changed, false);
744 745
745 /* 746 throtl_log(td, "limits changed");
746 * Make sure updates from throtl_update_blkio_group_read_bps() group
747 * of functions to tg->limits_changed are visible. We do not
748 * want update td->limits_changed to be visible but update to
749 * tg->limits_changed not being visible yet on this cpu. Hence
750 * the read barrier.
751 */
752 smp_rmb();
753 747
754 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { 748 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
755 if (throtl_tg_on_rr(tg) && tg->limits_changed) { 749 if (!tg->limits_changed)
756 throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" 750 continue;
757 " riops=%u wiops=%u", tg->bps[READ], 751
758 tg->bps[WRITE], tg->iops[READ], 752 if (!xchg(&tg->limits_changed, false))
759 tg->iops[WRITE]); 753 continue;
754
755 throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
756 " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
757 tg->iops[READ], tg->iops[WRITE]);
758
759 /*
760 * Restart the slices for both READ and WRITE. It
761 * might happen that a group's limits are dropped
762 * suddenly and we don't want to account recently
763 * dispatched IO with the new low rate
764 */
765 throtl_start_new_slice(td, tg, 0);
766 throtl_start_new_slice(td, tg, 1);
767
768 if (throtl_tg_on_rr(tg))
760 tg_update_disptime(td, tg); 769 tg_update_disptime(td, tg);
761 tg->limits_changed = false;
762 }
763 } 770 }
764
765 smp_mb__before_atomic_dec();
766 atomic_dec(&td->limits_changed);
767 smp_mb__after_atomic_dec();
768} 771}
769 772
770/* Dispatch throttled bios. Should be called without queue lock held. */ 773/* Dispatch throttled bios. Should be called without queue lock held. */
@@ -774,6 +777,7 @@ static int throtl_dispatch(struct request_queue *q)
774 unsigned int nr_disp = 0; 777 unsigned int nr_disp = 0;
775 struct bio_list bio_list_on_stack; 778 struct bio_list bio_list_on_stack;
776 struct bio *bio; 779 struct bio *bio;
780 struct blk_plug plug;
777 781
778 spin_lock_irq(q->queue_lock); 782 spin_lock_irq(q->queue_lock);
779 783
@@ -802,9 +806,10 @@ out:
802 * immediate dispatch 806 * immediate dispatch
803 */ 807 */
804 if (nr_disp) { 808 if (nr_disp) {
809 blk_start_plug(&plug);
805 while((bio = bio_list_pop(&bio_list_on_stack))) 810 while((bio = bio_list_pop(&bio_list_on_stack)))
806 generic_make_request(bio); 811 generic_make_request(bio);
807 blk_unplug(q); 812 blk_finish_plug(&plug);
808 } 813 }
809 return nr_disp; 814 return nr_disp;
810} 815}
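
The throtl_dispatch() hunk above switches from blk_unplug() to the on-stack
plugging API introduced by this series. A kernel-context sketch of the pattern
(not a standalone program; it assumes a caller that already owns a bio list):

    #include <linux/bio.h>
    #include <linux/blkdev.h>

    /* Drain a bio list under an on-stack plug: submissions are batched
     * per task and dispatched together when the plug is finished (or when
     * the task schedules), which is what throtl_dispatch() now relies on. */
    static void submit_batch(struct bio_list *list)
    {
            struct blk_plug plug;
            struct bio *bio;

            blk_start_plug(&plug);
            while ((bio = bio_list_pop(list)))
                    generic_make_request(bio);
            blk_finish_plug(&plug);
    }
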
@@ -825,7 +830,8 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
825 830
826 struct delayed_work *dwork = &td->throtl_work; 831 struct delayed_work *dwork = &td->throtl_work;
827 832
828 if (total_nr_queued(td) > 0) { 833 /* schedule work if limits changed even if no bio is queued */
834 if (total_nr_queued(td) > 0 || td->limits_changed) {
829 /* 835 /*
830 * We might have a work scheduled to be executed in future. 836 * We might have a work scheduled to be executed in future.
831 * Cancel that and schedule a new one. 837 * Cancel that and schedule a new one.
@@ -898,6 +904,15 @@ void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
898 spin_unlock_irqrestore(td->queue->queue_lock, flags); 904 spin_unlock_irqrestore(td->queue->queue_lock, flags);
899} 905}
900 906
907static void throtl_update_blkio_group_common(struct throtl_data *td,
908 struct throtl_grp *tg)
909{
910 xchg(&tg->limits_changed, true);
911 xchg(&td->limits_changed, true);
912 /* Schedule a work now to process the limit change */
913 throtl_schedule_delayed_work(td, 0);
914}
915
901/* 916/*
902 * For all update functions, key should be a valid pointer because these 917 * For all update functions, key should be a valid pointer because these
903 * update functions are called under blkcg_lock, that means, blkg is 918 * update functions are called under blkcg_lock, that means, blkg is
@@ -911,64 +926,43 @@ static void throtl_update_blkio_group_read_bps(void *key,
911 struct blkio_group *blkg, u64 read_bps) 926 struct blkio_group *blkg, u64 read_bps)
912{ 927{
913 struct throtl_data *td = key; 928 struct throtl_data *td = key;
929 struct throtl_grp *tg = tg_of_blkg(blkg);
914 930
915 tg_of_blkg(blkg)->bps[READ] = read_bps; 931 tg->bps[READ] = read_bps;
916 /* Make sure read_bps is updated before setting limits_changed */ 932 throtl_update_blkio_group_common(td, tg);
917 smp_wmb();
918 tg_of_blkg(blkg)->limits_changed = true;
919
920 /* Make sure tg->limits_changed is updated before td->limits_changed */
921 smp_mb__before_atomic_inc();
922 atomic_inc(&td->limits_changed);
923 smp_mb__after_atomic_inc();
924
925 /* Schedule a work now to process the limit change */
926 throtl_schedule_delayed_work(td, 0);
927} 933}
928 934
929static void throtl_update_blkio_group_write_bps(void *key, 935static void throtl_update_blkio_group_write_bps(void *key,
930 struct blkio_group *blkg, u64 write_bps) 936 struct blkio_group *blkg, u64 write_bps)
931{ 937{
932 struct throtl_data *td = key; 938 struct throtl_data *td = key;
939 struct throtl_grp *tg = tg_of_blkg(blkg);
933 940
934 tg_of_blkg(blkg)->bps[WRITE] = write_bps; 941 tg->bps[WRITE] = write_bps;
935 smp_wmb(); 942 throtl_update_blkio_group_common(td, tg);
936 tg_of_blkg(blkg)->limits_changed = true;
937 smp_mb__before_atomic_inc();
938 atomic_inc(&td->limits_changed);
939 smp_mb__after_atomic_inc();
940 throtl_schedule_delayed_work(td, 0);
941} 943}
942 944
943static void throtl_update_blkio_group_read_iops(void *key, 945static void throtl_update_blkio_group_read_iops(void *key,
944 struct blkio_group *blkg, unsigned int read_iops) 946 struct blkio_group *blkg, unsigned int read_iops)
945{ 947{
946 struct throtl_data *td = key; 948 struct throtl_data *td = key;
949 struct throtl_grp *tg = tg_of_blkg(blkg);
947 950
948 tg_of_blkg(blkg)->iops[READ] = read_iops; 951 tg->iops[READ] = read_iops;
949 smp_wmb(); 952 throtl_update_blkio_group_common(td, tg);
950 tg_of_blkg(blkg)->limits_changed = true;
951 smp_mb__before_atomic_inc();
952 atomic_inc(&td->limits_changed);
953 smp_mb__after_atomic_inc();
954 throtl_schedule_delayed_work(td, 0);
955} 953}
956 954
957static void throtl_update_blkio_group_write_iops(void *key, 955static void throtl_update_blkio_group_write_iops(void *key,
958 struct blkio_group *blkg, unsigned int write_iops) 956 struct blkio_group *blkg, unsigned int write_iops)
959{ 957{
960 struct throtl_data *td = key; 958 struct throtl_data *td = key;
959 struct throtl_grp *tg = tg_of_blkg(blkg);
961 960
962 tg_of_blkg(blkg)->iops[WRITE] = write_iops; 961 tg->iops[WRITE] = write_iops;
963 smp_wmb(); 962 throtl_update_blkio_group_common(td, tg);
964 tg_of_blkg(blkg)->limits_changed = true;
965 smp_mb__before_atomic_inc();
966 atomic_inc(&td->limits_changed);
967 smp_mb__after_atomic_inc();
968 throtl_schedule_delayed_work(td, 0);
969} 963}
970 964
971void throtl_shutdown_timer_wq(struct request_queue *q) 965static void throtl_shutdown_wq(struct request_queue *q)
972{ 966{
973 struct throtl_data *td = q->td; 967 struct throtl_data *td = q->td;
974 968
@@ -1009,20 +1003,28 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
1009 /* 1003 /*
1010 * There is already another bio queued in same dir. No 1004 * There is already another bio queued in same dir. No
1011 * need to update dispatch time. 1005 * need to update dispatch time.
1012 * Still update the disptime if rate limits on this group
1013 * were changed.
1014 */ 1006 */
1015 if (!tg->limits_changed) 1007 update_disptime = false;
1016 update_disptime = false;
1017 else
1018 tg->limits_changed = false;
1019
1020 goto queue_bio; 1008 goto queue_bio;
1009
1021 } 1010 }
1022 1011
1023 /* Bio is within rate limit of group */ 1012 /* Bio is within rate limit of group */
1024 if (tg_may_dispatch(td, tg, bio, NULL)) { 1013 if (tg_may_dispatch(td, tg, bio, NULL)) {
1025 throtl_charge_bio(tg, bio); 1014 throtl_charge_bio(tg, bio);
1015
1016 /*
1017 * We need to trim the slice even when bios are not being queued;
1018 * otherwise it might happen that a bio is not queued for
1019 * a long time and the slice keeps on extending and trim is not
1020 * called for a long time. Now if limits are reduced suddenly
1021 * we take into account all the IO dispatched so far at the
1022 * new low rate and newly queued IO gets a really long dispatch
1023 * time.
1024 *
1025 * So keep on trimming slice even if bio is not queued.
1026 */
1027 throtl_trim_slice(td, tg, rw);
1026 goto out; 1028 goto out;
1027 } 1029 }
1028 1030
@@ -1058,7 +1060,7 @@ int blk_throtl_init(struct request_queue *q)
1058 1060
1059 INIT_HLIST_HEAD(&td->tg_list); 1061 INIT_HLIST_HEAD(&td->tg_list);
1060 td->tg_service_tree = THROTL_RB_ROOT; 1062 td->tg_service_tree = THROTL_RB_ROOT;
1061 atomic_set(&td->limits_changed, 0); 1063 td->limits_changed = false;
1062 1064
1063 /* Init root group */ 1065 /* Init root group */
1064 tg = &td->root_tg; 1066 tg = &td->root_tg;
@@ -1070,6 +1072,7 @@ int blk_throtl_init(struct request_queue *q)
1070 /* Practically unlimited BW */ 1072 /* Practically unlimited BW */
1071 tg->bps[0] = tg->bps[1] = -1; 1073 tg->bps[0] = tg->bps[1] = -1;
1072 tg->iops[0] = tg->iops[1] = -1; 1074 tg->iops[0] = tg->iops[1] = -1;
1075 td->limits_changed = false;
1073 1076
1074 /* 1077 /*
1075 * Set root group reference to 2. One reference will be dropped when 1078 * Set root group reference to 2. One reference will be dropped when
@@ -1102,7 +1105,7 @@ void blk_throtl_exit(struct request_queue *q)
1102 1105
1103 BUG_ON(!td); 1106 BUG_ON(!td);
1104 1107
1105 throtl_shutdown_timer_wq(q); 1108 throtl_shutdown_wq(q);
1106 1109
1107 spin_lock_irq(q->queue_lock); 1110 spin_lock_irq(q->queue_lock);
1108 throtl_release_tgs(td); 1111 throtl_release_tgs(td);
@@ -1132,7 +1135,7 @@ void blk_throtl_exit(struct request_queue *q)
1132 * update limits through cgroup and another work got queued, cancel 1135 * update limits through cgroup and another work got queued, cancel
1133 * it. 1136 * it.
1134 */ 1137 */
1135 throtl_shutdown_timer_wq(q); 1138 throtl_shutdown_wq(q);
1136 throtl_td_free(td); 1139 throtl_td_free(td);
1137} 1140}
1138 1141
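
A note on the limits_changed conversion above: the atomic_t plus explicit
barriers are replaced by a plain bool set and cleared with xchg(), which is a
full memory barrier and returns the old value to exactly one caller, so the
test-and-clear in throtl_process_limit_change() applies each change once. A
user-space model of the pattern, with the GCC/Clang __atomic_exchange_n
builtin standing in for the kernel's xchg():

    #include <stdbool.h>
    #include <stdio.h>

    static bool limits_changed;

    static void signal_limit_change(void)
    {
            /* Full barrier; prior writes to the limits are visible first. */
            __atomic_exchange_n(&limits_changed, true, __ATOMIC_SEQ_CST);
            /* ...then schedule the dispatch work, as
             * throtl_update_blkio_group_common() does. */
    }

    static void process_limit_change(void)
    {
            /* Atomic test-and-clear: at most one caller sees 'true' per
             * change, so the new limits are applied exactly once. */
            if (!__atomic_exchange_n(&limits_changed, false, __ATOMIC_SEQ_CST))
                    return;
            printf("applying new limits\n");
    }

    int main(void)
    {
            signal_limit_change();
            process_limit_change();    /* applies the change */
            process_limit_change();    /* no-op now          */
            return 0;
    }
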
diff --git a/block/blk.h b/block/blk.h
index 2db8f32838e7..c8db371a921d 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -18,8 +18,6 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq,
18void blk_dequeue_request(struct request *rq); 18void blk_dequeue_request(struct request *rq);
19void __blk_queue_free_tags(struct request_queue *q); 19void __blk_queue_free_tags(struct request_queue *q);
20 20
21void blk_unplug_work(struct work_struct *work);
22void blk_unplug_timeout(unsigned long data);
23void blk_rq_timed_out_timer(unsigned long data); 21void blk_rq_timed_out_timer(unsigned long data);
24void blk_delete_timer(struct request *); 22void blk_delete_timer(struct request *);
25void blk_add_timer(struct request *); 23void blk_add_timer(struct request *);
@@ -51,21 +49,17 @@ static inline void blk_clear_rq_complete(struct request *rq)
51 */ 49 */
52#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) 50#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash))
53 51
54struct request *blk_do_flush(struct request_queue *q, struct request *rq); 52void blk_insert_flush(struct request *rq);
53void blk_abort_flushes(struct request_queue *q);
55 54
56static inline struct request *__elv_next_request(struct request_queue *q) 55static inline struct request *__elv_next_request(struct request_queue *q)
57{ 56{
58 struct request *rq; 57 struct request *rq;
59 58
60 while (1) { 59 while (1) {
61 while (!list_empty(&q->queue_head)) { 60 if (!list_empty(&q->queue_head)) {
62 rq = list_entry_rq(q->queue_head.next); 61 rq = list_entry_rq(q->queue_head.next);
63 if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) || 62 return rq;
64 rq == &q->flush_rq)
65 return rq;
66 rq = blk_do_flush(q, rq);
67 if (rq)
68 return rq;
69 } 63 }
70 64
71 if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) 65 if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
@@ -109,6 +103,8 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
109 struct bio *bio); 103 struct bio *bio);
110int attempt_back_merge(struct request_queue *q, struct request *rq); 104int attempt_back_merge(struct request_queue *q, struct request *rq);
111int attempt_front_merge(struct request_queue *q, struct request *rq); 105int attempt_front_merge(struct request_queue *q, struct request *rq);
106int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
107 struct request *next);
112void blk_recalc_rq_segments(struct request *rq); 108void blk_recalc_rq_segments(struct request *rq);
113void blk_rq_set_mixed_merge(struct request *rq); 109void blk_rq_set_mixed_merge(struct request *rq);
114 110
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ea83a4f0c27d..7785169f3c8f 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -54,9 +54,9 @@ static const int cfq_hist_divisor = 4;
54#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) 54#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
55 55
56#define RQ_CIC(rq) \ 56#define RQ_CIC(rq) \
57 ((struct cfq_io_context *) (rq)->elevator_private) 57 ((struct cfq_io_context *) (rq)->elevator_private[0])
58#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) 58#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private[1])
59#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3) 59#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private[2])
60 60
61static struct kmem_cache *cfq_pool; 61static struct kmem_cache *cfq_pool;
62static struct kmem_cache *cfq_ioc_pool; 62static struct kmem_cache *cfq_ioc_pool;
@@ -146,7 +146,6 @@ struct cfq_queue {
146 struct cfq_rb_root *service_tree; 146 struct cfq_rb_root *service_tree;
147 struct cfq_queue *new_cfqq; 147 struct cfq_queue *new_cfqq;
148 struct cfq_group *cfqg; 148 struct cfq_group *cfqg;
149 struct cfq_group *orig_cfqg;
150 /* Number of sectors dispatched from queue in single dispatch round */ 149 /* Number of sectors dispatched from queue in single dispatch round */
151 unsigned long nr_sectors; 150 unsigned long nr_sectors;
152}; 151};
@@ -179,6 +178,8 @@ struct cfq_group {
179 /* group service_tree key */ 178 /* group service_tree key */
180 u64 vdisktime; 179 u64 vdisktime;
181 unsigned int weight; 180 unsigned int weight;
181 unsigned int new_weight;
182 bool needs_update;
182 183
183 /* number of cfqq currently on this group */ 184 /* number of cfqq currently on this group */
184 int nr_cfqq; 185 int nr_cfqq;
@@ -238,6 +239,7 @@ struct cfq_data {
238 struct rb_root prio_trees[CFQ_PRIO_LISTS]; 239 struct rb_root prio_trees[CFQ_PRIO_LISTS];
239 240
240 unsigned int busy_queues; 241 unsigned int busy_queues;
242 unsigned int busy_sync_queues;
241 243
242 int rq_in_driver; 244 int rq_in_driver;
243 int rq_in_flight[2]; 245 int rq_in_flight[2];
@@ -285,7 +287,6 @@ struct cfq_data {
285 unsigned int cfq_slice_idle; 287 unsigned int cfq_slice_idle;
286 unsigned int cfq_group_idle; 288 unsigned int cfq_group_idle;
287 unsigned int cfq_latency; 289 unsigned int cfq_latency;
288 unsigned int cfq_group_isolation;
289 290
290 unsigned int cic_index; 291 unsigned int cic_index;
291 struct list_head cic_list; 292 struct list_head cic_list;
@@ -501,13 +502,6 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
501 } 502 }
502} 503}
503 504
504static int cfq_queue_empty(struct request_queue *q)
505{
506 struct cfq_data *cfqd = q->elevator->elevator_data;
507
508 return !cfqd->rq_queued;
509}
510
511/* 505/*
512 * Scale schedule slice based on io priority. Use the sync time slice only 506 * Scale schedule slice based on io priority. Use the sync time slice only
513 * if a queue is marked sync and has sync io queued. A sync queue with async 507 * if a queue is marked sync and has sync io queued. A sync queue with async
@@ -558,15 +552,13 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
558 552
559static void update_min_vdisktime(struct cfq_rb_root *st) 553static void update_min_vdisktime(struct cfq_rb_root *st)
560{ 554{
561 u64 vdisktime = st->min_vdisktime;
562 struct cfq_group *cfqg; 555 struct cfq_group *cfqg;
563 556
564 if (st->left) { 557 if (st->left) {
565 cfqg = rb_entry_cfqg(st->left); 558 cfqg = rb_entry_cfqg(st->left);
566 vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); 559 st->min_vdisktime = max_vdisktime(st->min_vdisktime,
560 cfqg->vdisktime);
567 } 561 }
568
569 st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
570} 562}
571 563
572/* 564/*
@@ -863,7 +855,27 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
863} 855}
864 856
865static void 857static void
866cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) 858cfq_update_group_weight(struct cfq_group *cfqg)
859{
860 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
861 if (cfqg->needs_update) {
862 cfqg->weight = cfqg->new_weight;
863 cfqg->needs_update = false;
864 }
865}
866
867static void
868cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
869{
870 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
871
872 cfq_update_group_weight(cfqg);
873 __cfq_group_service_tree_add(st, cfqg);
874 st->total_weight += cfqg->weight;
875}
876
877static void
878cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
867{ 879{
868 struct cfq_rb_root *st = &cfqd->grp_service_tree; 880 struct cfq_rb_root *st = &cfqd->grp_service_tree;
869 struct cfq_group *__cfqg; 881 struct cfq_group *__cfqg;
@@ -884,13 +896,19 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
884 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; 896 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
885 } else 897 } else
886 cfqg->vdisktime = st->min_vdisktime; 898 cfqg->vdisktime = st->min_vdisktime;
899 cfq_group_service_tree_add(st, cfqg);
900}
887 901
888 __cfq_group_service_tree_add(st, cfqg); 902static void
889 st->total_weight += cfqg->weight; 903cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
904{
905 st->total_weight -= cfqg->weight;
906 if (!RB_EMPTY_NODE(&cfqg->rb_node))
907 cfq_rb_erase(&cfqg->rb_node, st);
890} 908}
891 909
892static void 910static void
893cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) 911cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
894{ 912{
895 struct cfq_rb_root *st = &cfqd->grp_service_tree; 913 struct cfq_rb_root *st = &cfqd->grp_service_tree;
896 914
@@ -902,14 +920,13 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
902 return; 920 return;
903 921
904 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); 922 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
905 st->total_weight -= cfqg->weight; 923 cfq_group_service_tree_del(st, cfqg);
906 if (!RB_EMPTY_NODE(&cfqg->rb_node))
907 cfq_rb_erase(&cfqg->rb_node, st);
908 cfqg->saved_workload_slice = 0; 924 cfqg->saved_workload_slice = 0;
909 cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); 925 cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
910} 926}
911 927
912static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) 928static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
929 unsigned int *unaccounted_time)
913{ 930{
914 unsigned int slice_used; 931 unsigned int slice_used;
915 932
@@ -928,8 +945,13 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
928 1); 945 1);
929 } else { 946 } else {
930 slice_used = jiffies - cfqq->slice_start; 947 slice_used = jiffies - cfqq->slice_start;
931 if (slice_used > cfqq->allocated_slice) 948 if (slice_used > cfqq->allocated_slice) {
949 *unaccounted_time = slice_used - cfqq->allocated_slice;
932 slice_used = cfqq->allocated_slice; 950 slice_used = cfqq->allocated_slice;
951 }
952 if (time_after(cfqq->slice_start, cfqq->dispatch_start))
953 *unaccounted_time += cfqq->slice_start -
954 cfqq->dispatch_start;
933 } 955 }
934 956
935 return slice_used; 957 return slice_used;
@@ -939,12 +961,12 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
939 struct cfq_queue *cfqq) 961 struct cfq_queue *cfqq)
940{ 962{
941 struct cfq_rb_root *st = &cfqd->grp_service_tree; 963 struct cfq_rb_root *st = &cfqd->grp_service_tree;
942 unsigned int used_sl, charge; 964 unsigned int used_sl, charge, unaccounted_sl = 0;
943 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) 965 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
944 - cfqg->service_tree_idle.count; 966 - cfqg->service_tree_idle.count;
945 967
946 BUG_ON(nr_sync < 0); 968 BUG_ON(nr_sync < 0);
947 used_sl = charge = cfq_cfqq_slice_usage(cfqq); 969 used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
948 970
949 if (iops_mode(cfqd)) 971 if (iops_mode(cfqd))
950 charge = cfqq->slice_dispatch; 972 charge = cfqq->slice_dispatch;
@@ -952,9 +974,10 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
952 charge = cfqq->allocated_slice; 974 charge = cfqq->allocated_slice;
953 975
954 /* Can't update vdisktime while group is on service tree */ 976 /* Can't update vdisktime while group is on service tree */
955 cfq_rb_erase(&cfqg->rb_node, st); 977 cfq_group_service_tree_del(st, cfqg);
956 cfqg->vdisktime += cfq_scale_slice(charge, cfqg); 978 cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
957 __cfq_group_service_tree_add(st, cfqg); 979 /* If a new weight was requested, update now, off tree */
980 cfq_group_service_tree_add(st, cfqg);
958 981
959 /* This group is being expired. Save the context */ 982 /* This group is being expired. Save the context */
960 if (time_after(cfqd->workload_expires, jiffies)) { 983 if (time_after(cfqd->workload_expires, jiffies)) {
@@ -970,7 +993,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
970 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" 993 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
971 " sect=%u", used_sl, cfqq->slice_dispatch, charge, 994 " sect=%u", used_sl, cfqq->slice_dispatch, charge,
972 iops_mode(cfqd), cfqq->nr_sectors); 995 iops_mode(cfqd), cfqq->nr_sectors);
973 cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); 996 cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl,
997 unaccounted_sl);
974 cfq_blkiocg_set_start_empty_time(&cfqg->blkg); 998 cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
975} 999}
976 1000
@@ -985,7 +1009,9 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
985void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, 1009void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
986 unsigned int weight) 1010 unsigned int weight)
987{ 1011{
988 cfqg_of_blkg(blkg)->weight = weight; 1012 struct cfq_group *cfqg = cfqg_of_blkg(blkg);
1013 cfqg->new_weight = weight;
1014 cfqg->needs_update = true;
989} 1015}
990 1016
991static struct cfq_group * 1017static struct cfq_group *
@@ -1187,32 +1213,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1187 int new_cfqq = 1; 1213 int new_cfqq = 1;
1188 int group_changed = 0; 1214 int group_changed = 0;
1189 1215
1190#ifdef CONFIG_CFQ_GROUP_IOSCHED
1191 if (!cfqd->cfq_group_isolation
1192 && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD
1193 && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
1194 /* Move this cfq to root group */
1195 cfq_log_cfqq(cfqd, cfqq, "moving to root group");
1196 if (!RB_EMPTY_NODE(&cfqq->rb_node))
1197 cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1198 cfqq->orig_cfqg = cfqq->cfqg;
1199 cfqq->cfqg = &cfqd->root_group;
1200 cfqd->root_group.ref++;
1201 group_changed = 1;
1202 } else if (!cfqd->cfq_group_isolation
1203 && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
1204 /* cfqq is sequential now needs to go to its original group */
1205 BUG_ON(cfqq->cfqg != &cfqd->root_group);
1206 if (!RB_EMPTY_NODE(&cfqq->rb_node))
1207 cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1208 cfq_put_cfqg(cfqq->cfqg);
1209 cfqq->cfqg = cfqq->orig_cfqg;
1210 cfqq->orig_cfqg = NULL;
1211 group_changed = 1;
1212 cfq_log_cfqq(cfqd, cfqq, "moved to origin group");
1213 }
1214#endif
1215
1216 service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), 1216 service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
1217 cfqq_type(cfqq)); 1217 cfqq_type(cfqq));
1218 if (cfq_class_idle(cfqq)) { 1218 if (cfq_class_idle(cfqq)) {
@@ -1284,7 +1284,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1284 service_tree->count++; 1284 service_tree->count++;
1285 if ((add_front || !new_cfqq) && !group_changed) 1285 if ((add_front || !new_cfqq) && !group_changed)
1286 return; 1286 return;
1287 cfq_group_service_tree_add(cfqd, cfqq->cfqg); 1287 cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
1288} 1288}
1289 1289
1290static struct cfq_queue * 1290static struct cfq_queue *
@@ -1372,6 +1372,8 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1372 BUG_ON(cfq_cfqq_on_rr(cfqq)); 1372 BUG_ON(cfq_cfqq_on_rr(cfqq));
1373 cfq_mark_cfqq_on_rr(cfqq); 1373 cfq_mark_cfqq_on_rr(cfqq);
1374 cfqd->busy_queues++; 1374 cfqd->busy_queues++;
1375 if (cfq_cfqq_sync(cfqq))
1376 cfqd->busy_sync_queues++;
1375 1377
1376 cfq_resort_rr_list(cfqd, cfqq); 1378 cfq_resort_rr_list(cfqd, cfqq);
1377} 1379}
@@ -1395,9 +1397,11 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1395 cfqq->p_root = NULL; 1397 cfqq->p_root = NULL;
1396 } 1398 }
1397 1399
1398 cfq_group_service_tree_del(cfqd, cfqq->cfqg); 1400 cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
1399 BUG_ON(!cfqd->busy_queues); 1401 BUG_ON(!cfqd->busy_queues);
1400 cfqd->busy_queues--; 1402 cfqd->busy_queues--;
1403 if (cfq_cfqq_sync(cfqq))
1404 cfqd->busy_sync_queues--;
1401} 1405}
1402 1406
1403/* 1407/*
@@ -2405,6 +2409,7 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2405 * Does this cfqq already have too much IO in flight? 2409 * Does this cfqq already have too much IO in flight?
2406 */ 2410 */
2407 if (cfqq->dispatched >= max_dispatch) { 2411 if (cfqq->dispatched >= max_dispatch) {
2412 bool promote_sync = false;
2408 /* 2413 /*
2409 * idle queue must always only have a single IO in flight 2414 * idle queue must always only have a single IO in flight
2410 */ 2415 */
@@ -2412,15 +2417,26 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2412 return false; 2417 return false;
2413 2418
2414 /* 2419 /*
2420 * If there is only one sync queue,
2421 * we can ignore the async queue here and give the sync
2422 * queue no dispatch limit. The reason is that a sync queue can
2423 * preempt an async queue, so limiting the sync queue doesn't
2424 * make sense. This is useful for the aiostress test.
2425 */
2426 if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
2427 promote_sync = true;
2428
2429 /*
2415 * We have other queues, don't allow more IO from this one 2430 * We have other queues, don't allow more IO from this one
2416 */ 2431 */
2417 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq)) 2432 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
2433 !promote_sync)
2418 return false; 2434 return false;
2419 2435
2420 /* 2436 /*
2421 * Sole queue user, no limit 2437 * Sole queue user, no limit
2422 */ 2438 */
2423 if (cfqd->busy_queues == 1) 2439 if (cfqd->busy_queues == 1 || promote_sync)
2424 max_dispatch = -1; 2440 max_dispatch = -1;
2425 else 2441 else
2426 /* 2442 /*
@@ -2542,7 +2558,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
2542static void cfq_put_queue(struct cfq_queue *cfqq) 2558static void cfq_put_queue(struct cfq_queue *cfqq)
2543{ 2559{
2544 struct cfq_data *cfqd = cfqq->cfqd; 2560 struct cfq_data *cfqd = cfqq->cfqd;
2545 struct cfq_group *cfqg, *orig_cfqg; 2561 struct cfq_group *cfqg;
2546 2562
2547 BUG_ON(cfqq->ref <= 0); 2563 BUG_ON(cfqq->ref <= 0);
2548 2564
@@ -2554,7 +2570,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
2554 BUG_ON(rb_first(&cfqq->sort_list)); 2570 BUG_ON(rb_first(&cfqq->sort_list));
2555 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); 2571 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
2556 cfqg = cfqq->cfqg; 2572 cfqg = cfqq->cfqg;
2557 orig_cfqg = cfqq->orig_cfqg;
2558 2573
2559 if (unlikely(cfqd->active_queue == cfqq)) { 2574 if (unlikely(cfqd->active_queue == cfqq)) {
2560 __cfq_slice_expired(cfqd, cfqq, 0); 2575 __cfq_slice_expired(cfqd, cfqq, 0);
@@ -2564,8 +2579,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
2564 BUG_ON(cfq_cfqq_on_rr(cfqq)); 2579 BUG_ON(cfq_cfqq_on_rr(cfqq));
2565 kmem_cache_free(cfq_pool, cfqq); 2580 kmem_cache_free(cfq_pool, cfqq);
2566 cfq_put_cfqg(cfqg); 2581 cfq_put_cfqg(cfqg);
2567 if (orig_cfqg)
2568 cfq_put_cfqg(orig_cfqg);
2569} 2582}
2570 2583
2571/* 2584/*
@@ -3613,12 +3626,12 @@ static void cfq_put_request(struct request *rq)
3613 3626
3614 put_io_context(RQ_CIC(rq)->ioc); 3627 put_io_context(RQ_CIC(rq)->ioc);
3615 3628
3616 rq->elevator_private = NULL; 3629 rq->elevator_private[0] = NULL;
3617 rq->elevator_private2 = NULL; 3630 rq->elevator_private[1] = NULL;
3618 3631
3619 /* Put down rq reference on cfqg */ 3632 /* Put down rq reference on cfqg */
3620 cfq_put_cfqg(RQ_CFQG(rq)); 3633 cfq_put_cfqg(RQ_CFQG(rq));
3621 rq->elevator_private3 = NULL; 3634 rq->elevator_private[2] = NULL;
3622 3635
3623 cfq_put_queue(cfqq); 3636 cfq_put_queue(cfqq);
3624 } 3637 }
@@ -3705,13 +3718,12 @@ new_queue:
3705 } 3718 }
3706 3719
3707 cfqq->allocated[rw]++; 3720 cfqq->allocated[rw]++;
3708 cfqq->ref++;
3709 rq->elevator_private = cic;
3710 rq->elevator_private2 = cfqq;
3711 rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
3712 3721
3722 cfqq->ref++;
3723 rq->elevator_private[0] = cic;
3724 rq->elevator_private[1] = cfqq;
3725 rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg);
3713 spin_unlock_irqrestore(q->queue_lock, flags); 3726 spin_unlock_irqrestore(q->queue_lock, flags);
3714
3715 return 0; 3727 return 0;
3716 3728
3717queue_fail: 3729queue_fail:
@@ -3953,7 +3965,6 @@ static void *cfq_init_queue(struct request_queue *q)
3953 cfqd->cfq_slice_idle = cfq_slice_idle; 3965 cfqd->cfq_slice_idle = cfq_slice_idle;
3954 cfqd->cfq_group_idle = cfq_group_idle; 3966 cfqd->cfq_group_idle = cfq_group_idle;
3955 cfqd->cfq_latency = 1; 3967 cfqd->cfq_latency = 1;
3956 cfqd->cfq_group_isolation = 0;
3957 cfqd->hw_tag = -1; 3968 cfqd->hw_tag = -1;
3958 /* 3969 /*
3959 * we optimistically start assuming sync ops weren't delayed in last 3970 * we optimistically start assuming sync ops weren't delayed in last
@@ -4029,7 +4040,6 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
4029SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); 4040SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
4030SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); 4041SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
4031SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); 4042SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
4032SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0);
4033#undef SHOW_FUNCTION 4043#undef SHOW_FUNCTION
4034 4044
4035#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ 4045#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
@@ -4063,7 +4073,6 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
4063STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, 4073STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
4064 UINT_MAX, 0); 4074 UINT_MAX, 0);
4065STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); 4075STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
4066STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0);
4067#undef STORE_FUNCTION 4076#undef STORE_FUNCTION
4068 4077
4069#define CFQ_ATTR(name) \ 4078#define CFQ_ATTR(name) \
@@ -4081,7 +4090,6 @@ static struct elv_fs_entry cfq_attrs[] = {
4081 CFQ_ATTR(slice_idle), 4090 CFQ_ATTR(slice_idle),
4082 CFQ_ATTR(group_idle), 4091 CFQ_ATTR(group_idle),
4083 CFQ_ATTR(low_latency), 4092 CFQ_ATTR(low_latency),
4084 CFQ_ATTR(group_isolation),
4085 __ATTR_NULL 4093 __ATTR_NULL
4086}; 4094};
4087 4095
@@ -4096,7 +4104,6 @@ static struct elevator_type iosched_cfq = {
4096 .elevator_add_req_fn = cfq_insert_request, 4104 .elevator_add_req_fn = cfq_insert_request,
4097 .elevator_activate_req_fn = cfq_activate_request, 4105 .elevator_activate_req_fn = cfq_activate_request,
4098 .elevator_deactivate_req_fn = cfq_deactivate_request, 4106 .elevator_deactivate_req_fn = cfq_deactivate_request,
4099 .elevator_queue_empty_fn = cfq_queue_empty,
4100 .elevator_completed_req_fn = cfq_completed_request, 4107 .elevator_completed_req_fn = cfq_completed_request,
4101 .elevator_former_req_fn = elv_rb_former_request, 4108 .elevator_former_req_fn = elv_rb_former_request,
4102 .elevator_latter_req_fn = elv_rb_latter_request, 4109 .elevator_latter_req_fn = elv_rb_latter_request,
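
On the weight handling above: cfq_update_blkio_group_weight() no longer writes
cfqg->weight directly; it records new_weight and sets needs_update, and
cfq_update_group_weight() folds the value in only while the group is off the
service tree, so st->total_weight never drifts from the sum of enqueued
weights. A minimal user-space model of that pattern (the field names mirror
the patch; everything else is illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    struct group {
            unsigned int weight;      /* value the scheduler accounts with */
            unsigned int new_weight;  /* value requested via the cgroup    */
            bool needs_update;
    };

    static void request_weight(struct group *g, unsigned int w)
    {
            g->new_weight = w;        /* cheap and safe at any time */
            g->needs_update = true;
    }

    static unsigned int tree_add(struct group *g, unsigned int total_weight)
    {
            /* Only here, while g is off the tree, may g->weight change. */
            if (g->needs_update) {
                    g->weight = g->new_weight;
                    g->needs_update = false;
            }
            return total_weight + g->weight;  /* st->total_weight += ... */
    }

    int main(void)
    {
            struct group g = { .weight = 500 };
            unsigned int total = 0;

            request_weight(&g, 1000);     /* arrives while g is enqueued */
            total = tree_add(&g, total);  /* applied at the next (re)add */
            printf("weight=%u total=%u\n", g.weight, total);
            return 0;
    }
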
diff --git a/block/cfq.h b/block/cfq.h
index 54a6d90f8e8c..2a155927e37c 100644
--- a/block/cfq.h
+++ b/block/cfq.h
@@ -16,9 +16,9 @@ static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
16} 16}
17 17
18static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, 18static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
19 unsigned long time) 19 unsigned long time, unsigned long unaccounted_time)
20{ 20{
21 blkiocg_update_timeslice_used(blkg, time); 21 blkiocg_update_timeslice_used(blkg, time, unaccounted_time);
22} 22}
23 23
24static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) 24static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg)
@@ -85,7 +85,7 @@ static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
85 unsigned long dequeue) {} 85 unsigned long dequeue) {}
86 86
87static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, 87static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
88 unsigned long time) {} 88 unsigned long time, unsigned long unaccounted_time) {}
89static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {} 89static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
90static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, 90static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg,
91 bool direction, bool sync) {} 91 bool direction, bool sync) {}
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index b547cbca7b23..5139c0ea1864 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -326,14 +326,6 @@ dispatch_request:
326 return 1; 326 return 1;
327} 327}
328 328
329static int deadline_queue_empty(struct request_queue *q)
330{
331 struct deadline_data *dd = q->elevator->elevator_data;
332
333 return list_empty(&dd->fifo_list[WRITE])
334 && list_empty(&dd->fifo_list[READ]);
335}
336
337static void deadline_exit_queue(struct elevator_queue *e) 329static void deadline_exit_queue(struct elevator_queue *e)
338{ 330{
339 struct deadline_data *dd = e->elevator_data; 331 struct deadline_data *dd = e->elevator_data;
@@ -445,7 +437,6 @@ static struct elevator_type iosched_deadline = {
445 .elevator_merge_req_fn = deadline_merged_requests, 437 .elevator_merge_req_fn = deadline_merged_requests,
446 .elevator_dispatch_fn = deadline_dispatch_requests, 438 .elevator_dispatch_fn = deadline_dispatch_requests,
447 .elevator_add_req_fn = deadline_add_request, 439 .elevator_add_req_fn = deadline_add_request,
448 .elevator_queue_empty_fn = deadline_queue_empty,
449 .elevator_former_req_fn = elv_rb_former_request, 440 .elevator_former_req_fn = elv_rb_former_request,
450 .elevator_latter_req_fn = elv_rb_latter_request, 441 .elevator_latter_req_fn = elv_rb_latter_request,
451 .elevator_init_fn = deadline_init_queue, 442 .elevator_init_fn = deadline_init_queue,
diff --git a/block/elevator.c b/block/elevator.c
index 236e93c1f46c..c387d3168734 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -113,7 +113,7 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
 }
 EXPORT_SYMBOL(elv_rq_merge_ok);
 
-static inline int elv_try_merge(struct request *__rq, struct bio *bio)
+int elv_try_merge(struct request *__rq, struct bio *bio)
 {
 	int ret = ELEVATOR_NO_MERGE;
 
@@ -421,6 +421,8 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
 	struct list_head *entry;
 	int stop_flags;
 
+	BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
+
 	if (q->last_merge == rq)
 		q->last_merge = NULL;
 
@@ -519,6 +521,40 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
 	return ELEVATOR_NO_MERGE;
 }
 
+/*
+ * Attempt to do an insertion back merge. Only check for the case where
+ * we can append 'rq' to an existing request, so we can throw 'rq' away
+ * afterwards.
+ *
+ * Returns true if we merged, false otherwise
+ */
+static bool elv_attempt_insert_merge(struct request_queue *q,
+				     struct request *rq)
+{
+	struct request *__rq;
+
+	if (blk_queue_nomerges(q))
+		return false;
+
+	/*
+	 * First try one-hit cache.
+	 */
+	if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq))
+		return true;
+
+	if (blk_queue_noxmerges(q))
+		return false;
+
+	/*
+	 * See if our hash lookup can find a potential backmerge.
+	 */
+	__rq = elv_rqhash_find(q, blk_rq_pos(rq));
+	if (__rq && blk_attempt_req_merge(q, __rq, rq))
+		return true;
+
+	return false;
+}
+
 void elv_merged_request(struct request_queue *q, struct request *rq, int type)
 {
 	struct elevator_queue *e = q->elevator;
@@ -536,14 +572,18 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
 			struct request *next)
 {
 	struct elevator_queue *e = q->elevator;
+	const int next_sorted = next->cmd_flags & REQ_SORTED;
 
-	if (e->ops->elevator_merge_req_fn)
+	if (next_sorted && e->ops->elevator_merge_req_fn)
 		e->ops->elevator_merge_req_fn(q, rq, next);
 
 	elv_rqhash_reposition(q, rq);
-	elv_rqhash_del(q, next);
 
-	q->nr_sorted--;
+	if (next_sorted) {
+		elv_rqhash_del(q, next);
+		q->nr_sorted--;
+	}
+
 	q->last_merge = rq;
 }
 
@@ -617,21 +657,12 @@ void elv_quiesce_end(struct request_queue *q)
 
 void elv_insert(struct request_queue *q, struct request *rq, int where)
 {
-	int unplug_it = 1;
-
 	trace_block_rq_insert(q, rq);
 
 	rq->q = q;
 
 	switch (where) {
 	case ELEVATOR_INSERT_REQUEUE:
-		/*
-		 * Most requeues happen because of a busy condition,
-		 * don't force unplug of the queue for that case.
-		 * Clear unplug_it and fall through.
-		 */
-		unplug_it = 0;
-
 	case ELEVATOR_INSERT_FRONT:
 		rq->cmd_flags |= REQ_SOFTBARRIER;
 		list_add(&rq->queuelist, &q->queue_head);
@@ -654,6 +685,14 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
 		__blk_run_queue(q, false);
 		break;
 
+	case ELEVATOR_INSERT_SORT_MERGE:
+		/*
+		 * If we succeed in merging this request with one in the
+		 * queue already, we are done - rq has now been freed,
+		 * so no need to do anything further.
+		 */
+		if (elv_attempt_insert_merge(q, rq))
+			break;
 	case ELEVATOR_INSERT_SORT:
 		BUG_ON(rq->cmd_type != REQ_TYPE_FS &&
 		       !(rq->cmd_flags & REQ_DISCARD));
@@ -673,24 +712,21 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
 		q->elevator->ops->elevator_add_req_fn(q, rq);
 		break;
 
+	case ELEVATOR_INSERT_FLUSH:
+		rq->cmd_flags |= REQ_SOFTBARRIER;
+		blk_insert_flush(rq);
+		break;
 	default:
 		printk(KERN_ERR "%s: bad insertion point %d\n",
 		       __func__, where);
 		BUG();
 	}
-
-	if (unplug_it && blk_queue_plugged(q)) {
-		int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC]
-				- queue_in_flight(q);
-
-		if (nrq >= q->unplug_thresh)
-			__generic_unplug_device(q);
-	}
 }
 
-void __elv_add_request(struct request_queue *q, struct request *rq, int where,
-		       int plug)
+void __elv_add_request(struct request_queue *q, struct request *rq, int where)
 {
+	BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
+
 	if (rq->cmd_flags & REQ_SOFTBARRIER) {
 		/* barriers are scheduling boundary, update end_sector */
 		if (rq->cmd_type == REQ_TYPE_FS ||
@@ -702,38 +738,20 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where,
 	    where == ELEVATOR_INSERT_SORT)
 		where = ELEVATOR_INSERT_BACK;
 
-	if (plug)
-		blk_plug_device(q);
-
 	elv_insert(q, rq, where);
 }
 EXPORT_SYMBOL(__elv_add_request);
 
-void elv_add_request(struct request_queue *q, struct request *rq, int where,
-		     int plug)
+void elv_add_request(struct request_queue *q, struct request *rq, int where)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(q->queue_lock, flags);
-	__elv_add_request(q, rq, where, plug);
+	__elv_add_request(q, rq, where);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 EXPORT_SYMBOL(elv_add_request);
 
-int elv_queue_empty(struct request_queue *q)
-{
-	struct elevator_queue *e = q->elevator;
-
-	if (!list_empty(&q->queue_head))
-		return 0;
-
-	if (e->ops->elevator_queue_empty_fn)
-		return e->ops->elevator_queue_empty_fn(q);
-
-	return 1;
-}
-EXPORT_SYMBOL(elv_queue_empty);
-
 struct request *elv_latter_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
@@ -759,7 +777,7 @@ int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
 	if (e->ops->elevator_set_req_fn)
 		return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
 
-	rq->elevator_private = NULL;
+	rq->elevator_private[0] = NULL;
	return 0;
 }
 
@@ -785,6 +803,8 @@ void elv_abort_queue(struct request_queue *q)
 {
 	struct request *rq;
 
+	blk_abort_flushes(q);
+
 	while (!list_empty(&q->queue_head)) {
 		rq = list_entry_rq(q->queue_head.next);
 		rq->cmd_flags |= REQ_QUIET;
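
The caller-visible effect of the elevator.c changes is that request insertion loses its plug argument: a submitter that wants batching now opens an on-stack plug itself instead of asking the core to plug the queue. A minimal before/after sketch (the call site is illustrative, not taken from this patch):

	/* before: the last argument asked the core to plug the queue */
	__elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);

	/* after: plugging is the submitter's business (blk_start_plug) */
	__elv_add_request(q, rq, ELEVATOR_INSERT_BACK);
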
diff --git a/block/genhd.c b/block/genhd.c
index cbf1112a885c..c91a2dac6b6b 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1158,14 +1158,14 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 			   "%u %lu %lu %llu %u %u %u %u\n",
 			   MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
 			   disk_name(gp, hd->partno, buf),
-			   part_stat_read(hd, ios[0]),
-			   part_stat_read(hd, merges[0]),
-			   (unsigned long long)part_stat_read(hd, sectors[0]),
-			   jiffies_to_msecs(part_stat_read(hd, ticks[0])),
-			   part_stat_read(hd, ios[1]),
-			   part_stat_read(hd, merges[1]),
-			   (unsigned long long)part_stat_read(hd, sectors[1]),
-			   jiffies_to_msecs(part_stat_read(hd, ticks[1])),
+			   part_stat_read(hd, ios[READ]),
+			   part_stat_read(hd, merges[READ]),
+			   (unsigned long long)part_stat_read(hd, sectors[READ]),
+			   jiffies_to_msecs(part_stat_read(hd, ticks[READ])),
+			   part_stat_read(hd, ios[WRITE]),
+			   part_stat_read(hd, merges[WRITE]),
+			   (unsigned long long)part_stat_read(hd, sectors[WRITE]),
+			   jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
 			   part_in_flight(hd),
 			   jiffies_to_msecs(part_stat_read(hd, io_ticks)),
 			   jiffies_to_msecs(part_stat_read(hd, time_in_queue))
@@ -1494,7 +1494,7 @@ void disk_block_events(struct gendisk *disk)
 void disk_unblock_events(struct gendisk *disk)
 {
 	if (disk->ev)
-		__disk_unblock_events(disk, true);
+		__disk_unblock_events(disk, false);
 }
 
 /**
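
The diskstats change is behavior-neutral: READ and WRITE are the canonical 0 and 1 indices into the per-direction stat arrays, so each rewritten line reads the same slot as before while documenting which direction the column reports, e.g.:

	part_stat_read(hd, ios[READ]);	/* same slot as the old ios[0] */
	part_stat_read(hd, ios[WRITE]);	/* same slot as the old ios[1] */
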
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index 232c4b38cd37..06389e9ef96d 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -39,13 +39,6 @@ static void noop_add_request(struct request_queue *q, struct request *rq)
 	list_add_tail(&rq->queuelist, &nd->queue);
 }
 
-static int noop_queue_empty(struct request_queue *q)
-{
-	struct noop_data *nd = q->elevator->elevator_data;
-
-	return list_empty(&nd->queue);
-}
-
 static struct request *
 noop_former_request(struct request_queue *q, struct request *rq)
 {
@@ -90,7 +83,6 @@ static struct elevator_type elevator_noop = {
 	.elevator_merge_req_fn = noop_merged_requests,
 	.elevator_dispatch_fn = noop_dispatch,
 	.elevator_add_req_fn = noop_add_request,
-	.elevator_queue_empty_fn = noop_queue_empty,
 	.elevator_former_req_fn = noop_former_request,
 	.elevator_latter_req_fn = noop_latter_request,
 	.elevator_init_fn = noop_init_queue,
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 1f286ab461d3..79882104e431 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -140,13 +140,14 @@ static int DAC960_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-static int DAC960_media_changed(struct gendisk *disk)
+static unsigned int DAC960_check_events(struct gendisk *disk,
+					unsigned int clearing)
 {
 	DAC960_Controller_T *p = disk->queue->queuedata;
 	int drive_nr = (long)disk->private_data;
 
 	if (!p->LogicalDriveInitiallyAccessible[drive_nr])
-		return 1;
+		return DISK_EVENT_MEDIA_CHANGE;
 	return 0;
 }
 
@@ -163,7 +164,7 @@ static const struct block_device_operations DAC960_BlockDeviceOperations = {
 	.owner = THIS_MODULE,
 	.open = DAC960_open,
 	.getgeo = DAC960_getgeo,
-	.media_changed = DAC960_media_changed,
+	.check_events = DAC960_check_events,
 	.revalidate_disk = DAC960_revalidate_disk,
 };
 
@@ -2546,6 +2547,7 @@ static bool DAC960_RegisterBlockDevice(DAC960_Controller_T *Controller)
 	disk->major = MajorNumber;
 	disk->first_minor = n << DAC960_MaxPartitionsBits;
 	disk->fops = &DAC960_BlockDeviceOperations;
+	disk->events = DISK_EVENT_MEDIA_CHANGE;
 	}
 	/*
 	  Indicate the Block Device Registration completed successfully,
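
DAC960 is the first of many drivers in this series converted from ->media_changed to ->check_events. The recurring recipe: return DISK_EVENT_MEDIA_CHANGE instead of 1, take (and usually ignore) the clearing mask, register the new hook in block_device_operations, and advertise the supported events on the gendisk at probe time. A condensed sketch with illustrative names:

	static unsigned int mydrv_check_events(struct gendisk *disk,
					       unsigned int clearing)
	{
		struct mydrv_device *dev = disk->private_data;

		/* an event mask now, not a 0/1 "changed" flag */
		return dev->media_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
	}

	static const struct block_device_operations mydrv_fops = {
		.owner = THIS_MODULE,
		.check_events = mydrv_check_events,	/* was .media_changed */
	};

	/* at probe time, so the event core knows what to poll for: */
	disk->events = DISK_EVENT_MEDIA_CHANGE;
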
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 363855ca376e..456c0cc90dcf 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1658,12 +1658,12 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
 }
 
 /*
- * floppy-change is never called from an interrupt, so we can relax a bit
+ * check_events is never called from an interrupt, so we can relax a bit
  * here, sleep etc. Note that floppy-on tries to set current_DOR to point
  * to the desired drive, but it will probably not survive the sleep if
  * several floppies are used at the same time: thus the loop.
  */
-static int amiga_floppy_change(struct gendisk *disk)
+static unsigned amiga_check_events(struct gendisk *disk, unsigned int clearing)
 {
 	struct amiga_floppy_struct *p = disk->private_data;
 	int drive = p - unit;
@@ -1686,7 +1686,7 @@ static int amiga_floppy_change(struct gendisk *disk)
 		p->dirty = 0;
 		writepending = 0; /* if this was true before, too bad! */
 		writefromint = 0;
-		return 1;
+		return DISK_EVENT_MEDIA_CHANGE;
 	}
 	return 0;
 }
@@ -1697,7 +1697,7 @@ static const struct block_device_operations floppy_fops = {
 	.release = floppy_release,
 	.ioctl = fd_ioctl,
 	.getgeo = fd_getgeo,
-	.media_changed = amiga_floppy_change,
+	.check_events = amiga_check_events,
 };
 
 static int __init fd_probe_drives(void)
@@ -1736,6 +1736,7 @@ static int __init fd_probe_drives(void)
 	disk->major = FLOPPY_MAJOR;
 	disk->first_minor = drive;
 	disk->fops = &floppy_fops;
+	disk->events = DISK_EVENT_MEDIA_CHANGE;
 	sprintf(disk->disk_name, "fd%d", drive);
 	disk->private_data = &unit[drive];
 	set_capacity(disk, 880*2);
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 605a67e40bbf..c871eae14120 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -1324,23 +1324,24 @@ static void finish_fdc_done( int dummy )
  * due to unrecognised disk changes.
  */
 
-static int check_floppy_change(struct gendisk *disk)
+static unsigned int floppy_check_events(struct gendisk *disk,
+					unsigned int clearing)
 {
 	struct atari_floppy_struct *p = disk->private_data;
 	unsigned int drive = p - unit;
 	if (test_bit (drive, &fake_change)) {
 		/* simulated change (e.g. after formatting) */
-		return 1;
+		return DISK_EVENT_MEDIA_CHANGE;
 	}
 	if (test_bit (drive, &changed_floppies)) {
 		/* surely changed (the WP signal changed at least once) */
-		return 1;
+		return DISK_EVENT_MEDIA_CHANGE;
 	}
 	if (UD.wpstat) {
 		/* WP is on -> could be changed: to be sure, buffers should be
 		 * invalidated...
 		 */
-		return 1;
+		return DISK_EVENT_MEDIA_CHANGE;
 	}
 
 	return 0;
@@ -1570,7 +1571,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode,
 	 * or the next access will revalidate - and clear UDT :-(
 	 */
 
-	if (check_floppy_change(disk))
+	if (floppy_check_events(disk, 0))
 		floppy_revalidate(disk);
 
 	if (UD.flags & FTD_MSG)
@@ -1904,7 +1905,7 @@ static const struct block_device_operations floppy_fops = {
 	.open = floppy_unlocked_open,
 	.release = floppy_release,
 	.ioctl = fd_ioctl,
-	.media_changed = check_floppy_change,
+	.check_events = floppy_check_events,
 	.revalidate_disk= floppy_revalidate,
 };
 
@@ -1963,6 +1964,7 @@ static int __init atari_floppy_init (void)
 	unit[i].disk->first_minor = i;
 	sprintf(unit[i].disk->disk_name, "fd%d", i);
 	unit[i].disk->fops = &floppy_fops;
+	unit[i].disk->events = DISK_EVENT_MEDIA_CHANGE;
 	unit[i].disk->private_data = &unit[i];
 	unit[i].disk->queue = blk_init_queue(do_fd_request,
 					&ataflop_lock);
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 9279272b3732..35658f445fca 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -3170,12 +3170,6 @@ static void do_cciss_request(struct request_queue *q)
 	int sg_index = 0;
 	int chained = 0;
 
-	/* We call start_io here in case there is a command waiting on the
-	 * queue that has not been sent.
-	 */
-	if (blk_queue_plugged(q))
-		goto startio;
-
 queue:
 	creq = blk_peek_request(q);
 	if (!creq)
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index 946dad4caef3..b2fceb53e809 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -911,9 +911,6 @@ static void do_ida_request(struct request_queue *q)
 	struct scatterlist tmp_sg[SG_MAX];
 	int i, dir, seg;
 
-	if (blk_queue_plugged(q))
-		goto startio;
-
 queue_next:
 	creq = blk_peek_request(q);
 	if (!creq)
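
cciss and cpqarray opened their request functions with the same test; with per-task plugging there is no queue-level plugged state left to observe, and blk_queue_plugged() disappears along with it, so the early bail-out is simply deleted. The removed shape, for reference:

	static void old_request_fn(struct request_queue *q)
	{
		/* gone: a queue can no longer be "plugged", so there is
		 * nothing to wait for before dispatching */
		if (blk_queue_plugged(q))
			goto startio;
		/* ... dispatch requests ... */
	}
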
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index ba95cba192be..aca302492ff2 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -80,7 +80,7 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
 
 	if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags))
 		rw |= REQ_FUA;
-	rw |= REQ_UNPLUG | REQ_SYNC;
+	rw |= REQ_SYNC;
 
 	bio = bio_alloc(GFP_NOIO, 1);
 	bio->bi_bdev = bdev->md_bdev;
@@ -689,8 +689,6 @@ void drbd_al_to_on_disk_bm(struct drbd_conf *mdev)
 		}
 	}
 
-	drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
-
 	/* always (try to) flush bitmap to stable storage */
 	drbd_md_flush(mdev);
 
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index fd42832f785b..0645ca829a94 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -840,7 +840,6 @@ static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
 	for (i = 0; i < num_pages; i++)
 		bm_page_io_async(mdev, b, i, rw);
 
-	drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
 	wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0);
 
 	if (test_bit(BM_MD_IO_ERROR, &b->bm_flags)) {
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 3803a0348937..b0bd27dfc1e8 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -377,7 +377,7 @@ union p_header {
 #define DP_HARDBARRIER	      1 /* depricated */
 #define DP_RW_SYNC	      2 /* equals REQ_SYNC */
 #define DP_MAY_SET_IN_SYNC    4
-#define DP_UNPLUG             8 /* equals REQ_UNPLUG  */
+#define DP_UNPLUG             8 /* not used anymore   */
 #define DP_FUA               16 /* equals REQ_FUA     */
 #define DP_FLUSH             32 /* equals REQ_FLUSH   */
 #define DP_DISCARD           64 /* equals REQ_DISCARD */
@@ -2382,20 +2382,6 @@ static inline int drbd_queue_order_type(struct drbd_conf *mdev)
 	return QUEUE_ORDERED_NONE;
 }
 
-static inline void drbd_blk_run_queue(struct request_queue *q)
-{
-	if (q && q->unplug_fn)
-		q->unplug_fn(q);
-}
-
-static inline void drbd_kick_lo(struct drbd_conf *mdev)
-{
-	if (get_ldev(mdev)) {
-		drbd_blk_run_queue(bdev_get_queue(mdev->ldev->backing_bdev));
-		put_ldev(mdev);
-	}
-}
-
 static inline void drbd_md_flush(struct drbd_conf *mdev)
 {
 	int r;
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 29cd0dc9fe4f..8a43ce0edeed 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2477,12 +2477,11 @@ static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
 {
 	if (mdev->agreed_pro_version >= 95)
 		return  (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
-			(bi_rw & REQ_UNPLUG ? DP_UNPLUG : 0) |
 			(bi_rw & REQ_FUA ? DP_FUA : 0) |
 			(bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
 			(bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
 	else
-		return bi_rw & (REQ_SYNC | REQ_UNPLUG) ? DP_RW_SYNC : 0;
+		return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
 }
 
 /* Used to send write requests
@@ -2719,35 +2718,6 @@ static int drbd_release(struct gendisk *gd, fmode_t mode)
 	return 0;
 }
 
-static void drbd_unplug_fn(struct request_queue *q)
-{
-	struct drbd_conf *mdev = q->queuedata;
-
-	/* unplug FIRST */
-	spin_lock_irq(q->queue_lock);
-	blk_remove_plug(q);
-	spin_unlock_irq(q->queue_lock);
-
-	/* only if connected */
-	spin_lock_irq(&mdev->req_lock);
-	if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) {
-		D_ASSERT(mdev->state.role == R_PRIMARY);
-		if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) {
-			/* add to the data.work queue,
-			 * unless already queued.
-			 * XXX this might be a good addition to drbd_queue_work
-			 * anyways, to detect "double queuing" ... */
-			if (list_empty(&mdev->unplug_work.list))
-				drbd_queue_work(&mdev->data.work,
-						&mdev->unplug_work);
-		}
-	}
-	spin_unlock_irq(&mdev->req_lock);
-
-	if (mdev->state.disk >= D_INCONSISTENT)
-		drbd_kick_lo(mdev);
-}
-
 static void drbd_set_defaults(struct drbd_conf *mdev)
 {
 	/* This way we get a compile error when sync_conf grows,
@@ -3222,9 +3192,7 @@ struct drbd_conf *drbd_new_device(unsigned int minor)
 	blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE);
 	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
 	blk_queue_merge_bvec(q, drbd_merge_bvec);
-	q->queue_lock = &mdev->req_lock; /* needed since we use */
-		/* plugging on a queue, that actually has no requests! */
-	q->unplug_fn = drbd_unplug_fn;
+	q->queue_lock = &mdev->req_lock;
 
 	mdev->md_io_page = alloc_page(GFP_KERNEL);
 	if (!mdev->md_io_page)
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 24487d4fb202..8e68be939deb 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -187,15 +187,6 @@ static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int
 	return NULL;
 }
 
-/* kick lower level device, if we have more than (arbitrary number)
- * reference counts on it, which typically are locally submitted io
- * requests. don't use unacked_cnt, so we speed up proto A and B, too. */
-static void maybe_kick_lo(struct drbd_conf *mdev)
-{
-	if (atomic_read(&mdev->local_cnt) >= mdev->net_conf->unplug_watermark)
-		drbd_kick_lo(mdev);
-}
-
 static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed)
 {
 	struct drbd_epoch_entry *e;
@@ -219,7 +210,6 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
 	LIST_HEAD(reclaimed);
 	struct drbd_epoch_entry *e, *t;
 
-	maybe_kick_lo(mdev);
 	spin_lock_irq(&mdev->req_lock);
 	reclaim_net_ee(mdev, &reclaimed);
 	spin_unlock_irq(&mdev->req_lock);
@@ -436,8 +426,7 @@ void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
 	while (!list_empty(head)) {
 		prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
 		spin_unlock_irq(&mdev->req_lock);
-		drbd_kick_lo(mdev);
-		schedule();
+		io_schedule();
 		finish_wait(&mdev->ee_wait, &wait);
 		spin_lock_irq(&mdev->req_lock);
 	}
@@ -1111,8 +1100,6 @@ next_bio:
 	/* > e->sector, unless this is the first bio */
 	bio->bi_sector = sector;
 	bio->bi_bdev = mdev->ldev->backing_bdev;
-	/* we special case some flags in the multi-bio case, see below
-	 * (REQ_UNPLUG) */
 	bio->bi_rw = rw;
 	bio->bi_private = e;
 	bio->bi_end_io = drbd_endio_sec;
@@ -1141,13 +1128,8 @@ next_bio:
 		bios = bios->bi_next;
 		bio->bi_next = NULL;
 
-		/* strip off REQ_UNPLUG unless it is the last bio */
-		if (bios)
-			bio->bi_rw &= ~REQ_UNPLUG;
-
 		drbd_generic_make_request(mdev, fault_type, bio);
 	} while (bios);
-	maybe_kick_lo(mdev);
 	return 0;
 
 fail:
@@ -1167,9 +1149,6 @@ static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsign
 
 	inc_unacked(mdev);
 
-	if (mdev->net_conf->wire_protocol != DRBD_PROT_C)
-		drbd_kick_lo(mdev);
-
 	mdev->current_epoch->barrier_nr = p->barrier;
 	rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR);
 
@@ -1636,12 +1615,11 @@ static unsigned long write_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
 {
 	if (mdev->agreed_pro_version >= 95)
 		return  (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
-			(dpf & DP_UNPLUG ? REQ_UNPLUG : 0) |
 			(dpf & DP_FUA ? REQ_FUA : 0) |
 			(dpf & DP_FLUSH ? REQ_FUA : 0) |
 			(dpf & DP_DISCARD ? REQ_DISCARD : 0);
 	else
-		return dpf & DP_RW_SYNC ? (REQ_SYNC | REQ_UNPLUG) : 0;
+		return dpf & DP_RW_SYNC ? REQ_SYNC : 0;
 }
 
 /* mirrored write */
@@ -3556,9 +3534,6 @@ static int receive_skip(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 
 static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
 {
-	if (mdev->state.disk >= D_INCONSISTENT)
-		drbd_kick_lo(mdev);
-
 	/* Make sure we've acked all the TCP data associated
 	 * with the data requests being unplugged */
 	drbd_tcp_quickack(mdev->data.socket);
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 11a75d32a2e2..ad3fc6228f27 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -960,10 +960,6 @@ allocate_barrier:
 		bio_endio(req->private_bio, -EIO);
 	}
 
-	/* we need to plug ALWAYS since we possibly need to kick lo_dev.
-	 * we plug after submit, so we won't miss an unplug event */
-	drbd_plug_device(mdev);
-
 	return 0;
 
 fail_conflicting:
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 34f224b018b3..e027446590d3 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -792,7 +792,6 @@ int drbd_resync_finished(struct drbd_conf *mdev)
 		 * queue (or even the read operations for those packets
 		 * is not finished by now). Retry in 100ms. */
 
-		drbd_kick_lo(mdev);
 		__set_current_state(TASK_INTERRUPTIBLE);
 		schedule_timeout(HZ / 10);
 		w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h
index defdb5013ea3..53586fa5ae1b 100644
--- a/drivers/block/drbd/drbd_wrappers.h
+++ b/drivers/block/drbd/drbd_wrappers.h
@@ -45,24 +45,6 @@ static inline void drbd_generic_make_request(struct drbd_conf *mdev,
 		generic_make_request(bio);
 }
 
-static inline void drbd_plug_device(struct drbd_conf *mdev)
-{
-	struct request_queue *q;
-	q = bdev_get_queue(mdev->this_bdev);
-
-	spin_lock_irq(q->queue_lock);
-
-/* XXX the check on !blk_queue_plugged is redundant,
- * implicitly checked in blk_plug_device */
-
-	if (!blk_queue_plugged(q)) {
-		blk_plug_device(q);
-		del_timer(&q->unplug_timer);
-		/* unplugging should not happen automatically... */
-	}
-	spin_unlock_irq(q->queue_lock);
-}
-
 static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm)
 {
 	return (crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK)
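
All of the drbd kick/unplug machinery above existed to force out requests on a queue that drbd itself had plugged. Under the new scheme the submitting task owns the plug, which is flushed explicitly or whenever the task goes to sleep; that is also why the drbd_kick_lo(); schedule(); pair collapses into a single io_schedule(). A minimal sketch of the replacement pattern, assuming the on-stack plugging API this series introduces:

	struct blk_plug plug;
	struct bio *bio;

	blk_start_plug(&plug);
	while ((bio = next_bio(ctx)) != NULL)	/* next_bio() is illustrative */
		submit_bio(bio->bi_rw, bio);
	blk_finish_plug(&plug);		/* flushes anything still batched */
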
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 77fc76f8aea9..301d7a9a41a6 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3770,13 +3770,14 @@ out2:
 /*
  * Check if the disk has been changed or if a change has been faked.
  */
-static int check_floppy_change(struct gendisk *disk)
+static unsigned int floppy_check_events(struct gendisk *disk,
+					unsigned int clearing)
 {
 	int drive = (long)disk->private_data;
 
 	if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) ||
 	    test_bit(FD_VERIFY_BIT, &UDRS->flags))
-		return 1;
+		return DISK_EVENT_MEDIA_CHANGE;
 
 	if (time_after(jiffies, UDRS->last_checked + UDP->checkfreq)) {
 		lock_fdc(drive, false);
@@ -3788,7 +3789,7 @@ static int check_floppy_change(struct gendisk *disk)
 	    test_bit(FD_VERIFY_BIT, &UDRS->flags) ||
 	    test_bit(drive, &fake_change) ||
 	    drive_no_geom(drive))
-		return 1;
+		return DISK_EVENT_MEDIA_CHANGE;
 	return 0;
 }
 
@@ -3837,7 +3838,6 @@ static int __floppy_read_block_0(struct block_device *bdev)
 	bio.bi_end_io = floppy_rb0_complete;
 
 	submit_bio(READ, &bio);
-	generic_unplug_device(bdev_get_queue(bdev));
 	process_fd_request();
 	wait_for_completion(&complete);
 
@@ -3898,7 +3898,7 @@ static const struct block_device_operations floppy_fops = {
 	.release = floppy_release,
 	.ioctl = fd_ioctl,
 	.getgeo = fd_getgeo,
-	.media_changed = check_floppy_change,
+	.check_events = floppy_check_events,
 	.revalidate_disk = floppy_revalidate,
 };
 
@@ -4205,6 +4205,7 @@ static int __init floppy_init(void)
 		disks[dr]->major = FLOPPY_MAJOR;
 		disks[dr]->first_minor = TOMINOR(dr);
 		disks[dr]->fops = &floppy_fops;
+		disks[dr]->events = DISK_EVENT_MEDIA_CHANGE;
 		sprintf(disks[dr]->disk_name, "fd%d", dr);
 
 		init_timer(&motor_off_timer[dr]);
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index dbf31ec9114d..a076a14ca72d 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -540,17 +540,6 @@ out:
 	return 0;
 }
 
-/*
- * kick off io on the underlying address space
- */
-static void loop_unplug(struct request_queue *q)
-{
-	struct loop_device *lo = q->queuedata;
-
-	queue_flag_clear_unlocked(QUEUE_FLAG_PLUGGED, q);
-	blk_run_address_space(lo->lo_backing_file->f_mapping);
-}
-
 struct switch_request {
 	struct file *file;
 	struct completion wait;
@@ -917,7 +906,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	 */
 	blk_queue_make_request(lo->lo_queue, loop_make_request);
 	lo->lo_queue->queuedata = lo;
-	lo->lo_queue->unplug_fn = loop_unplug;
 
 	if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
 		blk_queue_flush(lo->lo_queue, REQ_FLUSH);
@@ -1019,7 +1007,6 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
 
 	kthread_stop(lo->lo_thread);
 
-	lo->lo_queue->unplug_fn = NULL;
 	lo->lo_backing_file = NULL;
 
 	loop_release_xfer(lo);
@@ -1636,9 +1623,6 @@ out:
 
 static void loop_free(struct loop_device *lo)
 {
-	if (!lo->lo_queue->queue_lock)
-		lo->lo_queue->queue_lock = &lo->lo_queue->__queue_lock;
-
 	blk_cleanup_queue(lo->lo_queue);
 	put_disk(lo->lo_disk);
 	list_del(&lo->lo_list);
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 62cec6afd7ad..2f2ccf686251 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -172,7 +172,8 @@ module_param_array(drive3, int, NULL, 0);
 static int pcd_open(struct cdrom_device_info *cdi, int purpose);
 static void pcd_release(struct cdrom_device_info *cdi);
 static int pcd_drive_status(struct cdrom_device_info *cdi, int slot_nr);
-static int pcd_media_changed(struct cdrom_device_info *cdi, int slot_nr);
+static unsigned int pcd_check_events(struct cdrom_device_info *cdi,
+				     unsigned int clearing, int slot_nr);
 static int pcd_tray_move(struct cdrom_device_info *cdi, int position);
 static int pcd_lock_door(struct cdrom_device_info *cdi, int lock);
 static int pcd_drive_reset(struct cdrom_device_info *cdi);
@@ -257,10 +258,11 @@ static int pcd_block_ioctl(struct block_device *bdev, fmode_t mode,
 	return ret;
 }
 
-static int pcd_block_media_changed(struct gendisk *disk)
+static unsigned int pcd_block_check_events(struct gendisk *disk,
+					   unsigned int clearing)
 {
 	struct pcd_unit *cd = disk->private_data;
-	return cdrom_media_changed(&cd->info);
+	return cdrom_check_events(&cd->info, clearing);
 }
 
 static const struct block_device_operations pcd_bdops = {
@@ -268,14 +270,14 @@ static const struct block_device_operations pcd_bdops = {
 	.open = pcd_block_open,
 	.release = pcd_block_release,
 	.ioctl = pcd_block_ioctl,
-	.media_changed = pcd_block_media_changed,
+	.check_events = pcd_block_check_events,
 };
 
 static struct cdrom_device_ops pcd_dops = {
 	.open = pcd_open,
 	.release = pcd_release,
 	.drive_status = pcd_drive_status,
-	.media_changed = pcd_media_changed,
+	.check_events = pcd_check_events,
 	.tray_move = pcd_tray_move,
 	.lock_door = pcd_lock_door,
 	.get_mcn = pcd_get_mcn,
@@ -318,6 +320,7 @@ static void pcd_init_units(void)
 		disk->first_minor = unit;
 		strcpy(disk->disk_name, cd->name);	/* umm... */
 		disk->fops = &pcd_bdops;
+		disk->events = DISK_EVENT_MEDIA_CHANGE;
 	}
 }
 
@@ -502,13 +505,14 @@ static int pcd_packet(struct cdrom_device_info *cdi, struct packet_command *cgc)
 
 #define DBMSG(msg)	((verbose>1)?(msg):NULL)
 
-static int pcd_media_changed(struct cdrom_device_info *cdi, int slot_nr)
+static unsigned int pcd_check_events(struct cdrom_device_info *cdi,
+				     unsigned int clearing, int slot_nr)
 {
 	struct pcd_unit *cd = cdi->handle;
 	int res = cd->changed;
 	if (res)
 		cd->changed = 0;
-	return res;
+	return res ? DISK_EVENT_MEDIA_CHANGE : 0;
 }
 
 static int pcd_lock_door(struct cdrom_device_info *cdi, int lock)
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index c0ee1558b9bb..21dfdb776869 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -794,7 +794,7 @@ static int pd_release(struct gendisk *p, fmode_t mode)
 	return 0;
 }
 
-static int pd_check_media(struct gendisk *p)
+static unsigned int pd_check_events(struct gendisk *p, unsigned int clearing)
 {
 	struct pd_unit *disk = p->private_data;
 	int r;
@@ -803,7 +803,7 @@ static int pd_check_media(struct gendisk *p)
 	pd_special_command(disk, pd_media_check);
 	r = disk->changed;
 	disk->changed = 0;
-	return r;
+	return r ? DISK_EVENT_MEDIA_CHANGE : 0;
 }
 
 static int pd_revalidate(struct gendisk *p)
@@ -822,7 +822,7 @@ static const struct block_device_operations pd_fops = {
 	.release = pd_release,
 	.ioctl = pd_ioctl,
 	.getgeo = pd_getgeo,
-	.media_changed = pd_check_media,
+	.check_events = pd_check_events,
 	.revalidate_disk= pd_revalidate
 };
 
@@ -837,6 +837,7 @@ static void pd_probe_drive(struct pd_unit *disk)
 	p->fops = &pd_fops;
 	p->major = major;
 	p->first_minor = (disk - pd) << PD_BITS;
+	p->events = DISK_EVENT_MEDIA_CHANGE;
 	disk->gd = p;
 	p->private_data = disk;
 	p->queue = pd_queue;
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index 635f25dd9e10..7adeb1edbf43 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -243,7 +243,8 @@ static struct pf_unit units[PF_UNITS];
 static int pf_identify(struct pf_unit *pf);
 static void pf_lock(struct pf_unit *pf, int func);
 static void pf_eject(struct pf_unit *pf);
-static int pf_check_media(struct gendisk *disk);
+static unsigned int pf_check_events(struct gendisk *disk,
+				    unsigned int clearing);
 
 static char pf_scratch[512];	/* scratch block buffer */
 
@@ -270,7 +271,7 @@ static const struct block_device_operations pf_fops = {
 	.release = pf_release,
 	.ioctl = pf_ioctl,
 	.getgeo = pf_getgeo,
-	.media_changed = pf_check_media,
+	.check_events = pf_check_events,
 };
 
 static void __init pf_init_units(void)
@@ -293,6 +294,7 @@ static void __init pf_init_units(void)
 		disk->first_minor = unit;
 		strcpy(disk->disk_name, pf->name);
 		disk->fops = &pf_fops;
+		disk->events = DISK_EVENT_MEDIA_CHANGE;
 		if (!(*drives[unit])[D_PRT])
 			pf_drive_count++;
 	}
@@ -377,9 +379,9 @@ static int pf_release(struct gendisk *disk, fmode_t mode)
 
 }
 
-static int pf_check_media(struct gendisk *disk)
+static unsigned int pf_check_events(struct gendisk *disk, unsigned int clearing)
 {
-	return 1;
+	return DISK_EVENT_MEDIA_CHANGE;
 }
 
 static inline int status_reg(struct pf_unit *pf)
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 77d70eebb6b2..07a382eaf0a8 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -1606,8 +1606,6 @@ static int kcdrwd(void *foobar)
 				min_sleep_time = pkt->sleep_time;
 		}
 
-		generic_unplug_device(bdev_get_queue(pd->bdev));
-
 		VPRINTK("kcdrwd: sleeping\n");
 		residue = schedule_timeout(min_sleep_time);
 		VPRINTK("kcdrwd: wake up\n");
@@ -2796,7 +2794,8 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
 	return ret;
 }
 
-static int pkt_media_changed(struct gendisk *disk)
+static unsigned int pkt_check_events(struct gendisk *disk,
+				     unsigned int clearing)
 {
 	struct pktcdvd_device *pd = disk->private_data;
 	struct gendisk *attached_disk;
@@ -2806,9 +2805,9 @@ static int pkt_media_changed(struct gendisk *disk)
 	if (!pd->bdev)
 		return 0;
 	attached_disk = pd->bdev->bd_disk;
-	if (!attached_disk)
+	if (!attached_disk || !attached_disk->fops->check_events)
 		return 0;
-	return attached_disk->fops->media_changed(attached_disk);
+	return attached_disk->fops->check_events(attached_disk, clearing);
 }
 
 static const struct block_device_operations pktcdvd_ops = {
@@ -2816,7 +2815,7 @@ static const struct block_device_operations pktcdvd_ops = {
 	.open = pkt_open,
 	.release = pkt_close,
 	.ioctl = pkt_ioctl,
-	.media_changed = pkt_media_changed,
+	.check_events = pkt_check_events,
 };
 
 static char *pktcdvd_devnode(struct gendisk *gd, mode_t *mode)
@@ -2889,6 +2888,10 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
 	if (ret)
 		goto out_new_dev;
 
+	/* inherit events of the host device */
+	disk->events = pd->bdev->bd_disk->events;
+	disk->async_events = pd->bdev->bd_disk->async_events;
+
 	add_disk(disk);
 
 	pkt_sysfs_dev_new(pd);
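
pktcdvd is a stacking driver, so it cannot detect media events itself: it relays check_events to the CD-ROM device underneath (now guarding against hosts without the hook) and copies the host disk's advertised event masks at setup time, so the polling core treats the packet device and its host consistently. In sketch form, with host standing in for pd->bdev->bd_disk:

	/* forward the query... */
	return host->fops->check_events(host, clearing);

	/* ...and inherit what the host can report */
	disk->events = host->events;
	disk->async_events = host->async_events;
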
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index 75333d0a3327..24a482f2fbd6 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -741,11 +741,12 @@ static int floppy_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-static int floppy_check_change(struct gendisk *disk)
+static unsigned int floppy_check_events(struct gendisk *disk,
+					unsigned int clearing)
 {
 	struct floppy_state *fs = disk->private_data;
 
-	return fs->ejected;
+	return fs->ejected ? DISK_EVENT_MEDIA_CHANGE : 0;
 }
 
 static int floppy_revalidate(struct gendisk *disk)
@@ -772,7 +773,7 @@ static const struct block_device_operations floppy_fops = {
 	.release = floppy_release,
 	.ioctl = floppy_ioctl,
 	.getgeo = floppy_getgeo,
-	.media_changed = floppy_check_change,
+	.check_events = floppy_check_events,
 	.revalidate_disk = floppy_revalidate,
 };
 
@@ -857,6 +858,7 @@ static int __devinit swim_floppy_init(struct swim_priv *swd)
 		swd->unit[drive].disk->first_minor = drive;
 		sprintf(swd->unit[drive].disk->disk_name, "fd%d", drive);
 		swd->unit[drive].disk->fops = &floppy_fops;
+		swd->unit[drive].disk->events = DISK_EVENT_MEDIA_CHANGE;
 		swd->unit[drive].disk->private_data = &swd->unit[drive];
 		swd->unit[drive].disk->queue = swd->queue;
 		set_capacity(swd->unit[drive].disk, 2880);
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index bf3a5b859299..4c10f56facbf 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -250,7 +250,8 @@ static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
 			unsigned int cmd, unsigned long param);
 static int floppy_open(struct block_device *bdev, fmode_t mode);
 static int floppy_release(struct gendisk *disk, fmode_t mode);
-static int floppy_check_change(struct gendisk *disk);
+static unsigned int floppy_check_events(struct gendisk *disk,
+					unsigned int clearing);
 static int floppy_revalidate(struct gendisk *disk);
 
 static bool swim3_end_request(int err, unsigned int nr_bytes)
@@ -975,10 +976,11 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
 	return 0;
 }
 
-static int floppy_check_change(struct gendisk *disk)
+static unsigned int floppy_check_events(struct gendisk *disk,
+					unsigned int clearing)
 {
 	struct floppy_state *fs = disk->private_data;
-	return fs->ejected;
+	return fs->ejected ? DISK_EVENT_MEDIA_CHANGE : 0;
 }
 
 static int floppy_revalidate(struct gendisk *disk)
@@ -1025,7 +1027,7 @@ static const struct block_device_operations floppy_fops = {
 	.open = floppy_unlocked_open,
 	.release = floppy_release,
 	.ioctl = floppy_ioctl,
-	.media_changed = floppy_check_change,
+	.check_events = floppy_check_events,
 	.revalidate_disk= floppy_revalidate,
 };
 
@@ -1161,6 +1163,7 @@ static int __devinit swim3_attach(struct macio_dev *mdev, const struct of_device
 	disk->major = FLOPPY_MAJOR;
 	disk->first_minor = i;
 	disk->fops = &floppy_fops;
+	disk->events = DISK_EVENT_MEDIA_CHANGE;
 	disk->private_data = &floppy_states[i];
 	disk->queue = swim3_queue;
 	disk->flags |= GENHD_FL_REMOVABLE;
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 9ae3bb713286..68b9430c7cfe 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -1788,7 +1788,8 @@ static int ub_bd_revalidate(struct gendisk *disk)
  *
  * The return code is bool!
  */
-static int ub_bd_media_changed(struct gendisk *disk)
+static unsigned int ub_bd_check_events(struct gendisk *disk,
+				       unsigned int clearing)
 {
 	struct ub_lun *lun = disk->private_data;
 
@@ -1806,10 +1807,10 @@ static int ub_bd_media_changed(struct gendisk *disk)
 	 */
 	if (ub_sync_tur(lun->udev, lun) != 0) {
 		lun->changed = 1;
-		return 1;
+		return DISK_EVENT_MEDIA_CHANGE;
 	}
 
-	return lun->changed;
+	return lun->changed ? DISK_EVENT_MEDIA_CHANGE : 0;
 }
 
 static const struct block_device_operations ub_bd_fops = {
@@ -1817,7 +1818,7 @@ static const struct block_device_operations ub_bd_fops = {
 	.open = ub_bd_unlocked_open,
 	.release = ub_bd_release,
 	.ioctl = ub_bd_ioctl,
-	.media_changed = ub_bd_media_changed,
+	.check_events = ub_bd_check_events,
 	.revalidate_disk = ub_bd_revalidate,
 };
 
@@ -2333,6 +2334,7 @@ static int ub_probe_lun(struct ub_dev *sc, int lnum)
 	disk->major = UB_MAJOR;
 	disk->first_minor = lun->id * UB_PARTS_PER_LUN;
 	disk->fops = &ub_bd_fops;
+	disk->events = DISK_EVENT_MEDIA_CHANGE;
 	disk->private_data = lun;
 	disk->driverfs_dev = &sc->intf->dev;
 
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 8be57151f5d6..031ca720d926 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -241,8 +241,7 @@ static void dump_dmastat(struct cardinfo *card, unsigned int dmastat)
  *
  * Whenever IO on the active page completes, the Ready page is activated
  * and the ex-Active page is clean out and made Ready.
- * Otherwise the Ready page is only activated when it becomes full, or
- * when mm_unplug_device is called via the unplug_io_fn.
+ * Otherwise the Ready page is only activated when it becomes full.
  *
  * If a request arrives while both pages a full, it is queued, and b_rdev is
  * overloaded to record whether it was a read or a write.
@@ -333,17 +332,6 @@ static inline void reset_page(struct mm_page *page)
 	page->biotail = &page->bio;
 }
 
-static void mm_unplug_device(struct request_queue *q)
-{
-	struct cardinfo *card = q->queuedata;
-	unsigned long flags;
-
-	spin_lock_irqsave(&card->lock, flags);
-	if (blk_remove_plug(q))
-		activate(card);
-	spin_unlock_irqrestore(&card->lock, flags);
-}
-
 /*
  * If there is room on Ready page, take
  * one bh off list and add it.
@@ -535,7 +523,6 @@ static int mm_make_request(struct request_queue *q, struct bio *bio)
 	*card->biotail = bio;
 	bio->bi_next = NULL;
 	card->biotail = &bio->bi_next;
-	blk_plug_device(q);
 	spin_unlock_irq(&card->lock);
 
 	return 0;
@@ -779,20 +766,10 @@ static int mm_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-/*
- * Future support for removable devices
- */
-static int mm_check_change(struct gendisk *disk)
-{
-/* struct cardinfo *dev = disk->private_data; */
-	return 0;
-}
-
 static const struct block_device_operations mm_fops = {
 	.owner = THIS_MODULE,
 	.getgeo = mm_getgeo,
 	.revalidate_disk = mm_revalidate,
-	.media_changed = mm_check_change,
 };
 
 static int __devinit mm_pci_probe(struct pci_dev *dev,
@@ -907,7 +884,6 @@ static int __devinit mm_pci_probe(struct pci_dev *dev,
 	blk_queue_make_request(card->queue, mm_make_request);
 	card->queue->queue_lock = &card->lock;
 	card->queue->queuedata = card;
-	card->queue->unplug_fn = mm_unplug_device;
 
 	tasklet_init(&card->tasklet, process_page, (unsigned long)card);
 
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index 2c590a796aa1..73354b081ed3 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -867,12 +867,12 @@ static void ace_request(struct request_queue * q)
867 } 867 }
868} 868}
869 869
870static int ace_media_changed(struct gendisk *gd) 870static unsigned int ace_check_events(struct gendisk *gd, unsigned int clearing)
871{ 871{
872 struct ace_device *ace = gd->private_data; 872 struct ace_device *ace = gd->private_data;
873 dev_dbg(ace->dev, "ace_media_changed(): %i\n", ace->media_change); 873 dev_dbg(ace->dev, "ace_check_events(): %i\n", ace->media_change);
874 874
875 return ace->media_change; 875 return ace->media_change ? DISK_EVENT_MEDIA_CHANGE : 0;
876} 876}
877 877
878static int ace_revalidate_disk(struct gendisk *gd) 878static int ace_revalidate_disk(struct gendisk *gd)
@@ -953,7 +953,7 @@ static const struct block_device_operations ace_fops = {
953 .owner = THIS_MODULE, 953 .owner = THIS_MODULE,
954 .open = ace_open, 954 .open = ace_open,
955 .release = ace_release, 955 .release = ace_release,
956 .media_changed = ace_media_changed, 956 .check_events = ace_check_events,
957 .revalidate_disk = ace_revalidate_disk, 957 .revalidate_disk = ace_revalidate_disk,
958 .getgeo = ace_getgeo, 958 .getgeo = ace_getgeo,
959}; 959};
@@ -1005,6 +1005,7 @@ static int __devinit ace_setup(struct ace_device *ace)
1005 ace->gd->major = ace_major; 1005 ace->gd->major = ace_major;
1006 ace->gd->first_minor = ace->id * ACE_NUM_MINORS; 1006 ace->gd->first_minor = ace->id * ACE_NUM_MINORS;
1007 ace->gd->fops = &ace_fops; 1007 ace->gd->fops = &ace_fops;
1008 ace->gd->events = DISK_EVENT_MEDIA_CHANGE;
1008 ace->gd->queue = ace->queue; 1009 ace->gd->queue = ace->queue;
1009 ace->gd->private_data = ace; 1010 ace->gd->private_data = ace;
1010 snprintf(ace->gd->disk_name, 32, "xs%c", ace->id + 'a'); 1011 snprintf(ace->gd->disk_name, 32, "xs%c", ace->id + 'a');
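The xsysace hunks show the other recurring conversion in this merge: ->media_changed, which returned a boolean, becomes ->check_events, which returns a mask of DISK_EVENT_* bits, and the driver additionally advertises the events it can raise in gendisk->events. A generic sketch of the pattern follows; struct my_dev and its media_change field are placeholders, not xsysace names.

#include <linux/genhd.h>

struct my_dev {
	int media_change;		/* set by the driver's detect path */
};

static unsigned int my_check_events(struct gendisk *gd,
				    unsigned int clearing)
{
	struct my_dev *dev = gd->private_data;

	return dev->media_change ? DISK_EVENT_MEDIA_CHANGE : 0;
}

/* at probe time, next to the fops assignment:
 *	gd->events = DISK_EVENT_MEDIA_CHANGE;
 */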
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index 64a21461c408..b2b034fea34e 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -395,10 +395,12 @@ static int gdrom_drivestatus(struct cdrom_device_info *cd_info, int ignore)
395 return CDS_NO_INFO; 395 return CDS_NO_INFO;
396} 396}
397 397
398static int gdrom_mediachanged(struct cdrom_device_info *cd_info, int ignore) 398static unsigned int gdrom_check_events(struct cdrom_device_info *cd_info,
399 unsigned int clearing, int ignore)
399{ 400{
400 /* check the sense key */ 401 /* check the sense key */
401 return (__raw_readb(GDROM_ERROR_REG) & 0xF0) == 0x60; 402 return (__raw_readb(GDROM_ERROR_REG) & 0xF0) == 0x60 ?
403 DISK_EVENT_MEDIA_CHANGE : 0;
402} 404}
403 405
404/* reset the G1 bus */ 406/* reset the G1 bus */
@@ -483,7 +485,7 @@ static struct cdrom_device_ops gdrom_ops = {
483 .open = gdrom_open, 485 .open = gdrom_open,
484 .release = gdrom_release, 486 .release = gdrom_release,
485 .drive_status = gdrom_drivestatus, 487 .drive_status = gdrom_drivestatus,
486 .media_changed = gdrom_mediachanged, 488 .check_events = gdrom_check_events,
487 .get_last_session = gdrom_get_last_session, 489 .get_last_session = gdrom_get_last_session,
488 .reset = gdrom_hardreset, 490 .reset = gdrom_hardreset,
489 .audio_ioctl = gdrom_audio_ioctl, 491 .audio_ioctl = gdrom_audio_ioctl,
@@ -509,9 +511,10 @@ static int gdrom_bdops_release(struct gendisk *disk, fmode_t mode)
509 return 0; 511 return 0;
510} 512}
511 513
512static int gdrom_bdops_mediachanged(struct gendisk *disk) 514static unsigned int gdrom_bdops_check_events(struct gendisk *disk,
515 unsigned int clearing)
513{ 516{
514 return cdrom_media_changed(gd.cd_info); 517 return cdrom_check_events(gd.cd_info, clearing);
515} 518}
516 519
517static int gdrom_bdops_ioctl(struct block_device *bdev, fmode_t mode, 520static int gdrom_bdops_ioctl(struct block_device *bdev, fmode_t mode,
@@ -530,7 +533,7 @@ static const struct block_device_operations gdrom_bdops = {
530 .owner = THIS_MODULE, 533 .owner = THIS_MODULE,
531 .open = gdrom_bdops_open, 534 .open = gdrom_bdops_open,
532 .release = gdrom_bdops_release, 535 .release = gdrom_bdops_release,
533 .media_changed = gdrom_bdops_mediachanged, 536 .check_events = gdrom_bdops_check_events,
534 .ioctl = gdrom_bdops_ioctl, 537 .ioctl = gdrom_bdops_ioctl,
535}; 538};
536 539
@@ -800,6 +803,7 @@ static int __devinit probe_gdrom(struct platform_device *devptr)
800 goto probe_fail_cdrom_register; 803 goto probe_fail_cdrom_register;
801 } 804 }
802 gd.disk->fops = &gdrom_bdops; 805 gd.disk->fops = &gdrom_bdops;
806 gd.disk->events = DISK_EVENT_MEDIA_CHANGE;
803 /* latch on to the interrupt */ 807 /* latch on to the interrupt */
804 err = gdrom_set_interrupt_handlers(); 808 err = gdrom_set_interrupt_handlers();
805 if (err) 809 if (err)
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index be73a9b493a6..4e874c5fa605 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -186,10 +186,11 @@ static int viocd_blk_ioctl(struct block_device *bdev, fmode_t mode,
186 return ret; 186 return ret;
187} 187}
188 188
189static int viocd_blk_media_changed(struct gendisk *disk) 189static unsigned int viocd_blk_check_events(struct gendisk *disk,
190 unsigned int clearing)
190{ 191{
191 struct disk_info *di = disk->private_data; 192 struct disk_info *di = disk->private_data;
192 return cdrom_media_changed(&di->viocd_info); 193 return cdrom_check_events(&di->viocd_info, clearing);
193} 194}
194 195
195static const struct block_device_operations viocd_fops = { 196static const struct block_device_operations viocd_fops = {
@@ -197,7 +198,7 @@ static const struct block_device_operations viocd_fops = {
197 .open = viocd_blk_open, 198 .open = viocd_blk_open,
198 .release = viocd_blk_release, 199 .release = viocd_blk_release,
199 .ioctl = viocd_blk_ioctl, 200 .ioctl = viocd_blk_ioctl,
200 .media_changed = viocd_blk_media_changed, 201 .check_events = viocd_blk_check_events,
201}; 202};
202 203
203static int viocd_open(struct cdrom_device_info *cdi, int purpose) 204static int viocd_open(struct cdrom_device_info *cdi, int purpose)
@@ -320,7 +321,8 @@ static void do_viocd_request(struct request_queue *q)
320 } 321 }
321} 322}
322 323
323static int viocd_media_changed(struct cdrom_device_info *cdi, int disc_nr) 324static unsigned int viocd_check_events(struct cdrom_device_info *cdi,
325 unsigned int clearing, int disc_nr)
324{ 326{
325 struct viocd_waitevent we; 327 struct viocd_waitevent we;
326 HvLpEvent_Rc hvrc; 328 HvLpEvent_Rc hvrc;
@@ -340,7 +342,7 @@ static int viocd_media_changed(struct cdrom_device_info *cdi, int disc_nr)
340 if (hvrc != 0) { 342 if (hvrc != 0) {
341 pr_warning("bad rc on HvCallEvent_signalLpEventFast %d\n", 343 pr_warning("bad rc on HvCallEvent_signalLpEventFast %d\n",
342 (int)hvrc); 344 (int)hvrc);
343 return -EIO; 345 return 0;
344 } 346 }
345 347
346 wait_for_completion(&we.com); 348 wait_for_completion(&we.com);
@@ -354,7 +356,7 @@ static int viocd_media_changed(struct cdrom_device_info *cdi, int disc_nr)
354 return 0; 356 return 0;
355 } 357 }
356 358
357 return we.changed; 359 return we.changed ? DISK_EVENT_MEDIA_CHANGE : 0;
358} 360}
359 361
360static int viocd_lock_door(struct cdrom_device_info *cdi, int locking) 362static int viocd_lock_door(struct cdrom_device_info *cdi, int locking)
@@ -550,7 +552,7 @@ static int viocd_audio_ioctl(struct cdrom_device_info *cdi, unsigned int cmd,
550static struct cdrom_device_ops viocd_dops = { 552static struct cdrom_device_ops viocd_dops = {
551 .open = viocd_open, 553 .open = viocd_open,
552 .release = viocd_release, 554 .release = viocd_release,
553 .media_changed = viocd_media_changed, 555 .check_events = viocd_check_events,
554 .lock_door = viocd_lock_door, 556 .lock_door = viocd_lock_door,
555 .generic_packet = viocd_packet, 557 .generic_packet = viocd_packet,
556 .audio_ioctl = viocd_audio_ioctl, 558 .audio_ioctl = viocd_audio_ioctl,
@@ -624,6 +626,7 @@ static int viocd_probe(struct vio_dev *vdev, const struct vio_device_id *id)
624 gendisk->queue = q; 626 gendisk->queue = q;
625 gendisk->fops = &viocd_fops; 627 gendisk->fops = &viocd_fops;
626 gendisk->flags = GENHD_FL_CD|GENHD_FL_REMOVABLE; 628 gendisk->flags = GENHD_FL_CD|GENHD_FL_REMOVABLE;
629 gendisk->events = DISK_EVENT_MEDIA_CHANGE;
627 set_capacity(gendisk, 0); 630 set_capacity(gendisk, 0);
628 gendisk->private_data = d; 631 gendisk->private_data = d;
629 d->viocd_disk = gendisk; 632 d->viocd_disk = gendisk;
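For cdrom drivers the conversion is two-layered: the cdrom_device_ops method gains the clearing argument and returns an event mask (so error paths now return 0, meaning no events, instead of -EIO as viocd did above), while the block_device_operations hook just forwards to cdrom_check_events(). A sketch of the forwarding glue, assuming the cdrom_device_info is reachable from the gendisk's private_data:

#include <linux/cdrom.h>
#include <linux/genhd.h>

static unsigned int my_bdops_check_events(struct gendisk *disk,
					  unsigned int clearing)
{
	struct cdrom_device_info *cdi = disk->private_data;

	/* cdrom_check_events() invokes cdrom_device_ops->check_events */
	return cdrom_check_events(cdi, clearing);
}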
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index e88a2cf17711..6f218e014e99 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -233,8 +233,7 @@ int ide_queue_sense_rq(ide_drive_t *drive, void *special)
233 233
234 drive->hwif->rq = NULL; 234 drive->hwif->rq = NULL;
235 235
236 elv_add_request(drive->queue, &drive->sense_rq, 236 elv_add_request(drive->queue, &drive->sense_rq, ELEVATOR_INSERT_FRONT);
237 ELEVATOR_INSERT_FRONT, 0);
238 return 0; 237 return 0;
239} 238}
240EXPORT_SYMBOL_GPL(ide_queue_sense_rq); 239EXPORT_SYMBOL_GPL(ide_queue_sense_rq);
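elv_add_request() also loses its trailing plug argument in this series; callers no longer make plugging decisions at insertion time. The new call shape, exactly as the hunk above uses it:

	/* before: elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 0); */
	elv_add_request(q, rq, ELEVATOR_INSERT_FRONT);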
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 0c73fe39a236..fd1e11799137 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -258,17 +258,10 @@ static int ide_cd_breathe(ide_drive_t *drive, struct request *rq)
258 if (time_after(jiffies, info->write_timeout)) 258 if (time_after(jiffies, info->write_timeout))
259 return 0; 259 return 0;
260 else { 260 else {
261 struct request_queue *q = drive->queue;
262 unsigned long flags;
263
264 /* 261 /*
265 * take a breather relying on the unplug timer to kick us again 262 * take a breather
266 */ 263 */
267 264 blk_delay_queue(drive->queue, 1);
268 spin_lock_irqsave(q->queue_lock, flags);
269 blk_plug_device(q);
270 spin_unlock_irqrestore(q->queue_lock, flags);
271
272 return 1; 265 return 1;
273 } 266 }
274} 267}
@@ -1177,7 +1170,7 @@ static struct cdrom_device_ops ide_cdrom_dops = {
1177 .open = ide_cdrom_open_real, 1170 .open = ide_cdrom_open_real,
1178 .release = ide_cdrom_release_real, 1171 .release = ide_cdrom_release_real,
1179 .drive_status = ide_cdrom_drive_status, 1172 .drive_status = ide_cdrom_drive_status,
1180 .media_changed = ide_cdrom_check_media_change_real, 1173 .check_events = ide_cdrom_check_events_real,
1181 .tray_move = ide_cdrom_tray_move, 1174 .tray_move = ide_cdrom_tray_move,
1182 .lock_door = ide_cdrom_lock_door, 1175 .lock_door = ide_cdrom_lock_door,
1183 .select_speed = ide_cdrom_select_speed, 1176 .select_speed = ide_cdrom_select_speed,
@@ -1514,8 +1507,6 @@ static int ide_cdrom_setup(ide_drive_t *drive)
1514 blk_queue_dma_alignment(q, 31); 1507 blk_queue_dma_alignment(q, 31);
1515 blk_queue_update_dma_pad(q, 15); 1508 blk_queue_update_dma_pad(q, 15);
1516 1509
1517 q->unplug_delay = max((1 * HZ) / 1000, 1);
1518
1519 drive->dev_flags |= IDE_DFLAG_MEDIA_CHANGED; 1510 drive->dev_flags |= IDE_DFLAG_MEDIA_CHANGED;
1520 drive->atapi_flags = IDE_AFLAG_NO_EJECT | ide_cd_flags(id); 1511 drive->atapi_flags = IDE_AFLAG_NO_EJECT | ide_cd_flags(id);
1521 1512
@@ -1702,10 +1693,11 @@ static int idecd_ioctl(struct block_device *bdev, fmode_t mode,
1702} 1693}
1703 1694
1704 1695
1705static int idecd_media_changed(struct gendisk *disk) 1696static unsigned int idecd_check_events(struct gendisk *disk,
1697 unsigned int clearing)
1706{ 1698{
1707 struct cdrom_info *info = ide_drv_g(disk, cdrom_info); 1699 struct cdrom_info *info = ide_drv_g(disk, cdrom_info);
1708 return cdrom_media_changed(&info->devinfo); 1700 return cdrom_check_events(&info->devinfo, clearing);
1709} 1701}
1710 1702
1711static int idecd_revalidate_disk(struct gendisk *disk) 1703static int idecd_revalidate_disk(struct gendisk *disk)
@@ -1723,7 +1715,7 @@ static const struct block_device_operations idecd_ops = {
1723 .open = idecd_open, 1715 .open = idecd_open,
1724 .release = idecd_release, 1716 .release = idecd_release,
1725 .ioctl = idecd_ioctl, 1717 .ioctl = idecd_ioctl,
1726 .media_changed = idecd_media_changed, 1718 .check_events = idecd_check_events,
1727 .revalidate_disk = idecd_revalidate_disk 1719 .revalidate_disk = idecd_revalidate_disk
1728}; 1720};
1729 1721
@@ -1790,6 +1782,7 @@ static int ide_cd_probe(ide_drive_t *drive)
1790 ide_cd_read_toc(drive, &sense); 1782 ide_cd_read_toc(drive, &sense);
1791 g->fops = &idecd_ops; 1783 g->fops = &idecd_ops;
1792 g->flags |= GENHD_FL_REMOVABLE; 1784 g->flags |= GENHD_FL_REMOVABLE;
1785 g->events = DISK_EVENT_MEDIA_CHANGE;
1793 add_disk(g); 1786 add_disk(g);
1794 return 0; 1787 return 0;
1795 1788
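ide-cd used to take its breather by plugging the queue and letting the unplug timer restart it; with that timer gone, blk_delay_queue() expresses the intent directly. A minimal sketch of the replacement idiom, with my_breathe() as a hypothetical name:

#include <linux/blkdev.h>

static int my_breathe(struct request_queue *q)
{
	blk_delay_queue(q, 1);	/* re-run the queue after a short delay */
	return 1;		/* tell the caller to retry later */
}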
diff --git a/drivers/ide/ide-cd.h b/drivers/ide/ide-cd.h
index 93a3cf1b0f3f..1efc936f5b66 100644
--- a/drivers/ide/ide-cd.h
+++ b/drivers/ide/ide-cd.h
@@ -111,7 +111,8 @@ int cdrom_check_status(ide_drive_t *, struct request_sense *);
111int ide_cdrom_open_real(struct cdrom_device_info *, int); 111int ide_cdrom_open_real(struct cdrom_device_info *, int);
112void ide_cdrom_release_real(struct cdrom_device_info *); 112void ide_cdrom_release_real(struct cdrom_device_info *);
113int ide_cdrom_drive_status(struct cdrom_device_info *, int); 113int ide_cdrom_drive_status(struct cdrom_device_info *, int);
114int ide_cdrom_check_media_change_real(struct cdrom_device_info *, int); 114unsigned int ide_cdrom_check_events_real(struct cdrom_device_info *,
115 unsigned int clearing, int slot_nr);
115int ide_cdrom_tray_move(struct cdrom_device_info *, int); 116int ide_cdrom_tray_move(struct cdrom_device_info *, int);
116int ide_cdrom_lock_door(struct cdrom_device_info *, int); 117int ide_cdrom_lock_door(struct cdrom_device_info *, int);
117int ide_cdrom_select_speed(struct cdrom_device_info *, int); 118int ide_cdrom_select_speed(struct cdrom_device_info *, int);
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
index 766b3deeb23c..2a6bc50e8a41 100644
--- a/drivers/ide/ide-cd_ioctl.c
+++ b/drivers/ide/ide-cd_ioctl.c
@@ -79,8 +79,8 @@ int ide_cdrom_drive_status(struct cdrom_device_info *cdi, int slot_nr)
79 return CDS_DRIVE_NOT_READY; 79 return CDS_DRIVE_NOT_READY;
80} 80}
81 81
82int ide_cdrom_check_media_change_real(struct cdrom_device_info *cdi, 82unsigned int ide_cdrom_check_events_real(struct cdrom_device_info *cdi,
83 int slot_nr) 83 unsigned int clearing, int slot_nr)
84{ 84{
85 ide_drive_t *drive = cdi->handle; 85 ide_drive_t *drive = cdi->handle;
86 int retval; 86 int retval;
@@ -89,9 +89,9 @@ int ide_cdrom_check_media_change_real(struct cdrom_device_info *cdi,
89 (void) cdrom_check_status(drive, NULL); 89 (void) cdrom_check_status(drive, NULL);
90 retval = (drive->dev_flags & IDE_DFLAG_MEDIA_CHANGED) ? 1 : 0; 90 retval = (drive->dev_flags & IDE_DFLAG_MEDIA_CHANGED) ? 1 : 0;
91 drive->dev_flags &= ~IDE_DFLAG_MEDIA_CHANGED; 91 drive->dev_flags &= ~IDE_DFLAG_MEDIA_CHANGED;
92 return retval; 92 return retval ? DISK_EVENT_MEDIA_CHANGE : 0;
93 } else { 93 } else {
94 return -EINVAL; 94 return 0;
95 } 95 }
96} 96}
97 97
diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c
index 35c4b43585e3..c4ffd4888939 100644
--- a/drivers/ide/ide-gd.c
+++ b/drivers/ide/ide-gd.c
@@ -285,11 +285,12 @@ static int ide_gd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
285 return 0; 285 return 0;
286} 286}
287 287
288static int ide_gd_media_changed(struct gendisk *disk) 288static unsigned int ide_gd_check_events(struct gendisk *disk,
289 unsigned int clearing)
289{ 290{
290 struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj); 291 struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj);
291 ide_drive_t *drive = idkp->drive; 292 ide_drive_t *drive = idkp->drive;
292 int ret; 293 bool ret;
293 294
294 /* do not scan partitions twice if this is a removable device */ 295 /* do not scan partitions twice if this is a removable device */
295 if (drive->dev_flags & IDE_DFLAG_ATTACH) { 296 if (drive->dev_flags & IDE_DFLAG_ATTACH) {
@@ -297,10 +298,10 @@ static int ide_gd_media_changed(struct gendisk *disk)
297 return 0; 298 return 0;
298 } 299 }
299 300
300 ret = !!(drive->dev_flags & IDE_DFLAG_MEDIA_CHANGED); 301 ret = drive->dev_flags & IDE_DFLAG_MEDIA_CHANGED;
301 drive->dev_flags &= ~IDE_DFLAG_MEDIA_CHANGED; 302 drive->dev_flags &= ~IDE_DFLAG_MEDIA_CHANGED;
302 303
303 return ret; 304 return ret ? DISK_EVENT_MEDIA_CHANGE : 0;
304} 305}
305 306
306static void ide_gd_unlock_native_capacity(struct gendisk *disk) 307static void ide_gd_unlock_native_capacity(struct gendisk *disk)
@@ -318,7 +319,7 @@ static int ide_gd_revalidate_disk(struct gendisk *disk)
318 struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj); 319 struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj);
319 ide_drive_t *drive = idkp->drive; 320 ide_drive_t *drive = idkp->drive;
320 321
321 if (ide_gd_media_changed(disk)) 322 if (ide_gd_check_events(disk, 0))
322 drive->disk_ops->get_capacity(drive); 323 drive->disk_ops->get_capacity(drive);
323 324
324 set_capacity(disk, ide_gd_capacity(drive)); 325 set_capacity(disk, ide_gd_capacity(drive));
@@ -340,7 +341,7 @@ static const struct block_device_operations ide_gd_ops = {
340 .release = ide_gd_release, 341 .release = ide_gd_release,
341 .ioctl = ide_gd_ioctl, 342 .ioctl = ide_gd_ioctl,
342 .getgeo = ide_gd_getgeo, 343 .getgeo = ide_gd_getgeo,
343 .media_changed = ide_gd_media_changed, 344 .check_events = ide_gd_check_events,
344 .unlock_native_capacity = ide_gd_unlock_native_capacity, 345 .unlock_native_capacity = ide_gd_unlock_native_capacity,
345 .revalidate_disk = ide_gd_revalidate_disk 346 .revalidate_disk = ide_gd_revalidate_disk
346}; 347};
@@ -412,6 +413,7 @@ static int ide_gd_probe(ide_drive_t *drive)
412 if (drive->dev_flags & IDE_DFLAG_REMOVABLE) 413 if (drive->dev_flags & IDE_DFLAG_REMOVABLE)
413 g->flags = GENHD_FL_REMOVABLE; 414 g->flags = GENHD_FL_REMOVABLE;
414 g->fops = &ide_gd_ops; 415 g->fops = &ide_gd_ops;
416 g->events = DISK_EVENT_MEDIA_CHANGE;
415 add_disk(g); 417 add_disk(g);
416 return 0; 418 return 0;
417 419
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 999dac054bcc..f4077840d3ab 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -549,8 +549,6 @@ plug_device_2:
549 549
550 if (rq) 550 if (rq)
551 blk_requeue_request(q, rq); 551 blk_requeue_request(q, rq);
552 if (!elv_queue_empty(q))
553 blk_plug_device(q);
554} 552}
555 553
556void ide_requeue_and_plug(ide_drive_t *drive, struct request *rq) 554void ide_requeue_and_plug(ide_drive_t *drive, struct request *rq)
@@ -562,8 +560,6 @@ void ide_requeue_and_plug(ide_drive_t *drive, struct request *rq)
562 560
563 if (rq) 561 if (rq)
564 blk_requeue_request(q, rq); 562 blk_requeue_request(q, rq);
565 if (!elv_queue_empty(q))
566 blk_plug_device(q);
567 563
568 spin_unlock_irqrestore(q->queue_lock, flags); 564 spin_unlock_irqrestore(q->queue_lock, flags);
569} 565}
diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index 88a380c5a470..6ab9ab2a5081 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -52,7 +52,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
52 rq->cmd[0] = REQ_UNPARK_HEADS; 52 rq->cmd[0] = REQ_UNPARK_HEADS;
53 rq->cmd_len = 1; 53 rq->cmd_len = 1;
54 rq->cmd_type = REQ_TYPE_SPECIAL; 54 rq->cmd_type = REQ_TYPE_SPECIAL;
55 elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 1); 55 elv_add_request(q, rq, ELEVATOR_INSERT_FRONT);
56 56
57out: 57out:
58 return; 58 return;
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index a2ce0b2da281..5c9362792f1d 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -347,7 +347,7 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait)
347 atomic_inc(&bitmap->pending_writes); 347 atomic_inc(&bitmap->pending_writes);
348 set_buffer_locked(bh); 348 set_buffer_locked(bh);
349 set_buffer_mapped(bh); 349 set_buffer_mapped(bh);
350 submit_bh(WRITE | REQ_UNPLUG | REQ_SYNC, bh); 350 submit_bh(WRITE | REQ_SYNC, bh);
351 bh = bh->b_this_page; 351 bh = bh->b_this_page;
352 } 352 }
353 353
@@ -1339,8 +1339,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
1339 prepare_to_wait(&bitmap->overflow_wait, &__wait, 1339 prepare_to_wait(&bitmap->overflow_wait, &__wait,
1340 TASK_UNINTERRUPTIBLE); 1340 TASK_UNINTERRUPTIBLE);
1341 spin_unlock_irq(&bitmap->lock); 1341 spin_unlock_irq(&bitmap->lock);
1342 md_unplug(bitmap->mddev); 1342 io_schedule();
1343 schedule();
1344 finish_wait(&bitmap->overflow_wait, &__wait); 1343 finish_wait(&bitmap->overflow_wait, &__wait);
1345 continue; 1344 continue;
1346 } 1345 }
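The bitmap hunk swaps md_unplug()+schedule() for io_schedule(): with on-stack plugging, a task's plugged I/O is flushed automatically when it goes to sleep, so an explicit unplug before waiting is redundant. The resulting wait shape, sketched with placeholder wait-queue and lock names:

#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/spinlock.h>

static void wait_for_room(wait_queue_head_t *wq, spinlock_t *lock)
{
	DEFINE_WAIT(wait);

	prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock_irq(lock);
	io_schedule();	/* plugged I/O this task holds is flushed as it sleeps */
	finish_wait(wq, &wait);
	spin_lock_irq(lock);	/* retake the lock, as bitmap's loop does */
}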
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 4e054bd91664..2c62c1169f78 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -991,11 +991,6 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
991 clone->bi_destructor = dm_crypt_bio_destructor; 991 clone->bi_destructor = dm_crypt_bio_destructor;
992} 992}
993 993
994static void kcryptd_unplug(struct crypt_config *cc)
995{
996 blk_unplug(bdev_get_queue(cc->dev->bdev));
997}
998
999static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) 994static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
1000{ 995{
1001 struct crypt_config *cc = io->target->private; 996 struct crypt_config *cc = io->target->private;
@@ -1008,10 +1003,8 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
1008 * one in order to decrypt the whole bio data *afterwards*. 1003 * one in order to decrypt the whole bio data *afterwards*.
1009 */ 1004 */
1010 clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs); 1005 clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs);
1011 if (!clone) { 1006 if (!clone)
1012 kcryptd_unplug(cc);
1013 return 1; 1007 return 1;
1014 }
1015 1008
1016 crypt_inc_pending(io); 1009 crypt_inc_pending(io);
1017 1010
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 136d4f71a116..76a5af00a26b 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -352,7 +352,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
352 BUG_ON(num_regions > DM_IO_MAX_REGIONS); 352 BUG_ON(num_regions > DM_IO_MAX_REGIONS);
353 353
354 if (sync) 354 if (sync)
355 rw |= REQ_SYNC | REQ_UNPLUG; 355 rw |= REQ_SYNC;
356 356
357 /* 357 /*
358 * For multiple regions we need to be careful to rewind 358 * For multiple regions we need to be careful to rewind
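With the unplug machinery gone, REQ_UNPLUG drops out of the flag set entirely; synchronous submitters now tag I/O with REQ_SYNC alone and leave dispatch timing to the plugging code, as the dm-io hunk above shows. A trivial sketch:

#include <linux/fs.h>
#include <linux/bio.h>

static void submit_sync_read(struct bio *bio)
{
	/* REQ_UNPLUG no longer exists; REQ_SYNC is sufficient */
	submit_bio(READ | REQ_SYNC, bio);
}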
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 924f5f0084c2..1bb73a13ca40 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -37,13 +37,6 @@ struct dm_kcopyd_client {
37 unsigned int nr_pages; 37 unsigned int nr_pages;
38 unsigned int nr_free_pages; 38 unsigned int nr_free_pages;
39 39
40 /*
41 * Block devices to unplug.
42 * Non-NULL pointer means that a block device has some pending requests
43 * and needs to be unplugged.
44 */
45 struct block_device *unplug[2];
46
47 struct dm_io_client *io_client; 40 struct dm_io_client *io_client;
48 41
49 wait_queue_head_t destroyq; 42 wait_queue_head_t destroyq;
@@ -315,31 +308,6 @@ static int run_complete_job(struct kcopyd_job *job)
315 return 0; 308 return 0;
316} 309}
317 310
318/*
319 * Unplug the block device at the specified index.
320 */
321static void unplug(struct dm_kcopyd_client *kc, int rw)
322{
323 if (kc->unplug[rw] != NULL) {
324 blk_unplug(bdev_get_queue(kc->unplug[rw]));
325 kc->unplug[rw] = NULL;
326 }
327}
328
329/*
330 * Prepare block device unplug. If there's another device
331 * to be unplugged at the same array index, we unplug that
332 * device first.
333 */
334static void prepare_unplug(struct dm_kcopyd_client *kc, int rw,
335 struct block_device *bdev)
336{
337 if (likely(kc->unplug[rw] == bdev))
338 return;
339 unplug(kc, rw);
340 kc->unplug[rw] = bdev;
341}
342
343static void complete_io(unsigned long error, void *context) 311static void complete_io(unsigned long error, void *context)
344{ 312{
345 struct kcopyd_job *job = (struct kcopyd_job *) context; 313 struct kcopyd_job *job = (struct kcopyd_job *) context;
@@ -386,16 +354,10 @@ static int run_io_job(struct kcopyd_job *job)
386 .client = job->kc->io_client, 354 .client = job->kc->io_client,
387 }; 355 };
388 356
389 if (job->rw == READ) { 357 if (job->rw == READ)
390 r = dm_io(&io_req, 1, &job->source, NULL); 358 r = dm_io(&io_req, 1, &job->source, NULL);
391 prepare_unplug(job->kc, READ, job->source.bdev); 359 else
392 } else {
393 if (job->num_dests > 1)
394 io_req.bi_rw |= REQ_UNPLUG;
395 r = dm_io(&io_req, job->num_dests, job->dests, NULL); 360 r = dm_io(&io_req, job->num_dests, job->dests, NULL);
396 if (!(io_req.bi_rw & REQ_UNPLUG))
397 prepare_unplug(job->kc, WRITE, job->dests[0].bdev);
398 }
399 361
400 return r; 362 return r;
401} 363}
@@ -466,6 +428,7 @@ static void do_work(struct work_struct *work)
466{ 428{
467 struct dm_kcopyd_client *kc = container_of(work, 429 struct dm_kcopyd_client *kc = container_of(work,
468 struct dm_kcopyd_client, kcopyd_work); 430 struct dm_kcopyd_client, kcopyd_work);
431 struct blk_plug plug;
469 432
470 /* 433 /*
471 * The order that these are called is *very* important. 434 * The order that these are called is *very* important.
@@ -473,18 +436,12 @@ static void do_work(struct work_struct *work)
473 * Pages jobs when successful will jump onto the io jobs 436 * Pages jobs when successful will jump onto the io jobs
474 * list. io jobs call wake when they complete and it all 437 * list. io jobs call wake when they complete and it all
475 * starts again. 438 * starts again.
476 *
477 * Note that io_jobs add block devices to the unplug array,
478 * this array is cleared with "unplug" calls. It is thus
479 * forbidden to run complete_jobs after io_jobs and before
480 * unplug because the block device could be destroyed in
481 * job completion callback.
482 */ 439 */
440 blk_start_plug(&plug);
483 process_jobs(&kc->complete_jobs, kc, run_complete_job); 441 process_jobs(&kc->complete_jobs, kc, run_complete_job);
484 process_jobs(&kc->pages_jobs, kc, run_pages_job); 442 process_jobs(&kc->pages_jobs, kc, run_pages_job);
485 process_jobs(&kc->io_jobs, kc, run_io_job); 443 process_jobs(&kc->io_jobs, kc, run_io_job);
486 unplug(kc, READ); 444 blk_finish_plug(&plug);
487 unplug(kc, WRITE);
488} 445}
489 446
490/* 447/*
@@ -665,8 +622,6 @@ int dm_kcopyd_client_create(unsigned int nr_pages,
665 INIT_LIST_HEAD(&kc->io_jobs); 622 INIT_LIST_HEAD(&kc->io_jobs);
666 INIT_LIST_HEAD(&kc->pages_jobs); 623 INIT_LIST_HEAD(&kc->pages_jobs);
667 624
668 memset(kc->unplug, 0, sizeof(kc->unplug));
669
670 kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache); 625 kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
671 if (!kc->job_pool) 626 if (!kc->job_pool)
672 goto bad_slab; 627 goto bad_slab;
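Because the plug lives on the task, any I/O submitted anywhere below blk_start_plug(), here via dm_io() inside run_io_job(), is captured and batched; that is why kcopyd can drop its per-device unplug array and the ordering constraint it imposed. A reduced sketch of the worker shape, with process_all_jobs() as a placeholder:

#include <linux/blkdev.h>

static void worker(void (*process_all_jobs)(void))
{
	struct blk_plug plug;

	blk_start_plug(&plug);
	process_all_jobs();	/* I/O issued anywhere in here is plugged */
	blk_finish_plug(&plug);	/* one batched dispatch for all of it */
}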
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index b9e1e15ef11c..5ef136cdba91 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -394,7 +394,7 @@ static void raid_unplug(struct dm_target_callbacks *cb)
394{ 394{
395 struct raid_set *rs = container_of(cb, struct raid_set, callbacks); 395 struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
396 396
397 md_raid5_unplug_device(rs->md.private); 397 md_raid5_kick_device(rs->md.private);
398} 398}
399 399
400/* 400/*
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index dee326775c60..976ad4688afc 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -842,8 +842,6 @@ static void do_mirror(struct work_struct *work)
842 do_reads(ms, &reads); 842 do_reads(ms, &reads);
843 do_writes(ms, &writes); 843 do_writes(ms, &writes);
844 do_failures(ms, &failures); 844 do_failures(ms, &failures);
845
846 dm_table_unplug_all(ms->ti->table);
847} 845}
848 846
849/*----------------------------------------------------------------- 847/*-----------------------------------------------------------------
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 38e4eb1bb965..416d4e258df6 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -55,6 +55,7 @@ struct dm_table {
55 struct dm_target *targets; 55 struct dm_target *targets;
56 56
57 unsigned discards_supported:1; 57 unsigned discards_supported:1;
58 unsigned integrity_supported:1;
58 59
59 /* 60 /*
60 * Indicates the rw permissions for the new logical 61 * Indicates the rw permissions for the new logical
@@ -859,7 +860,7 @@ int dm_table_alloc_md_mempools(struct dm_table *t)
859 return -EINVAL; 860 return -EINVAL;
860 } 861 }
861 862
862 t->mempools = dm_alloc_md_mempools(type); 863 t->mempools = dm_alloc_md_mempools(type, t->integrity_supported);
863 if (!t->mempools) 864 if (!t->mempools)
864 return -ENOMEM; 865 return -ENOMEM;
865 866
@@ -935,8 +936,10 @@ static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device
935 struct dm_dev_internal *dd; 936 struct dm_dev_internal *dd;
936 937
937 list_for_each_entry(dd, devices, list) 938 list_for_each_entry(dd, devices, list)
938 if (bdev_get_integrity(dd->dm_dev.bdev)) 939 if (bdev_get_integrity(dd->dm_dev.bdev)) {
940 t->integrity_supported = 1;
939 return blk_integrity_register(dm_disk(md), NULL); 941 return blk_integrity_register(dm_disk(md), NULL);
942 }
940 943
941 return 0; 944 return 0;
942} 945}
@@ -1275,29 +1278,6 @@ int dm_table_any_busy_target(struct dm_table *t)
1275 return 0; 1278 return 0;
1276} 1279}
1277 1280
1278void dm_table_unplug_all(struct dm_table *t)
1279{
1280 struct dm_dev_internal *dd;
1281 struct list_head *devices = dm_table_get_devices(t);
1282 struct dm_target_callbacks *cb;
1283
1284 list_for_each_entry(dd, devices, list) {
1285 struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev);
1286 char b[BDEVNAME_SIZE];
1287
1288 if (likely(q))
1289 blk_unplug(q);
1290 else
1291 DMWARN_LIMIT("%s: Cannot unplug nonexistent device %s",
1292 dm_device_name(t->md),
1293 bdevname(dd->dm_dev.bdev, b));
1294 }
1295
1296 list_for_each_entry(cb, &t->target_callbacks, list)
1297 if (cb->unplug_fn)
1298 cb->unplug_fn(cb);
1299}
1300
1301struct mapped_device *dm_table_get_md(struct dm_table *t) 1281struct mapped_device *dm_table_get_md(struct dm_table *t)
1302{ 1282{
1303 return t->md; 1283 return t->md;
@@ -1345,4 +1325,3 @@ EXPORT_SYMBOL(dm_table_get_mode);
1345EXPORT_SYMBOL(dm_table_get_md); 1325EXPORT_SYMBOL(dm_table_get_md);
1346EXPORT_SYMBOL(dm_table_put); 1326EXPORT_SYMBOL(dm_table_put);
1347EXPORT_SYMBOL(dm_table_get); 1327EXPORT_SYMBOL(dm_table_get);
1348EXPORT_SYMBOL(dm_table_unplug_all);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index eaa3af0e0632..0cf68b478878 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -477,7 +477,8 @@ static void start_io_acct(struct dm_io *io)
477 cpu = part_stat_lock(); 477 cpu = part_stat_lock();
478 part_round_stats(cpu, &dm_disk(md)->part0); 478 part_round_stats(cpu, &dm_disk(md)->part0);
479 part_stat_unlock(); 479 part_stat_unlock();
480 dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]); 480 atomic_set(&dm_disk(md)->part0.in_flight[rw],
481 atomic_inc_return(&md->pending[rw]));
481} 482}
482 483
483static void end_io_acct(struct dm_io *io) 484static void end_io_acct(struct dm_io *io)
@@ -497,8 +498,8 @@ static void end_io_acct(struct dm_io *io)
497 * After this is decremented the bio must not be touched if it is 498 * After this is decremented the bio must not be touched if it is
498 * a flush. 499 * a flush.
499 */ 500 */
500 dm_disk(md)->part0.in_flight[rw] = pending = 501 pending = atomic_dec_return(&md->pending[rw]);
501 atomic_dec_return(&md->pending[rw]); 502 atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
502 pending += atomic_read(&md->pending[rw^0x1]); 503 pending += atomic_read(&md->pending[rw^0x1]);
503 504
504 /* nudge anyone waiting on suspend queue */ 505 /* nudge anyone waiting on suspend queue */
@@ -807,8 +808,6 @@ void dm_requeue_unmapped_request(struct request *clone)
807 dm_unprep_request(rq); 808 dm_unprep_request(rq);
808 809
809 spin_lock_irqsave(q->queue_lock, flags); 810 spin_lock_irqsave(q->queue_lock, flags);
810 if (elv_queue_empty(q))
811 blk_plug_device(q);
812 blk_requeue_request(q, rq); 811 blk_requeue_request(q, rq);
813 spin_unlock_irqrestore(q->queue_lock, flags); 812 spin_unlock_irqrestore(q->queue_lock, flags);
814 813
@@ -1613,10 +1612,10 @@ static void dm_request_fn(struct request_queue *q)
1613 * number of in-flight I/Os after the queue is stopped in 1612 * number of in-flight I/Os after the queue is stopped in
1614 * dm_suspend(). 1613 * dm_suspend().
1615 */ 1614 */
1616 while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { 1615 while (!blk_queue_stopped(q)) {
1617 rq = blk_peek_request(q); 1616 rq = blk_peek_request(q);
1618 if (!rq) 1617 if (!rq)
1619 goto plug_and_out; 1618 goto delay_and_out;
1620 1619
1621 /* always use block 0 to find the target for flushes for now */ 1620 /* always use block 0 to find the target for flushes for now */
1622 pos = 0; 1621 pos = 0;
@@ -1627,7 +1626,7 @@ static void dm_request_fn(struct request_queue *q)
1627 BUG_ON(!dm_target_is_valid(ti)); 1626 BUG_ON(!dm_target_is_valid(ti));
1628 1627
1629 if (ti->type->busy && ti->type->busy(ti)) 1628 if (ti->type->busy && ti->type->busy(ti))
1630 goto plug_and_out; 1629 goto delay_and_out;
1631 1630
1632 blk_start_request(rq); 1631 blk_start_request(rq);
1633 clone = rq->special; 1632 clone = rq->special;
@@ -1647,11 +1646,8 @@ requeued:
1647 BUG_ON(!irqs_disabled()); 1646 BUG_ON(!irqs_disabled());
1648 spin_lock(q->queue_lock); 1647 spin_lock(q->queue_lock);
1649 1648
1650plug_and_out: 1649delay_and_out:
1651 if (!elv_queue_empty(q)) 1650 blk_delay_queue(q, HZ / 10);
1652 /* Some requests still remain, retry later */
1653 blk_plug_device(q);
1654
1655out: 1651out:
1656 dm_table_put(map); 1652 dm_table_put(map);
1657 1653
@@ -1680,20 +1676,6 @@ static int dm_lld_busy(struct request_queue *q)
1680 return r; 1676 return r;
1681} 1677}
1682 1678
1683static void dm_unplug_all(struct request_queue *q)
1684{
1685 struct mapped_device *md = q->queuedata;
1686 struct dm_table *map = dm_get_live_table(md);
1687
1688 if (map) {
1689 if (dm_request_based(md))
1690 generic_unplug_device(q);
1691
1692 dm_table_unplug_all(map);
1693 dm_table_put(map);
1694 }
1695}
1696
1697static int dm_any_congested(void *congested_data, int bdi_bits) 1679static int dm_any_congested(void *congested_data, int bdi_bits)
1698{ 1680{
1699 int r = bdi_bits; 1681 int r = bdi_bits;
@@ -1817,7 +1799,6 @@ static void dm_init_md_queue(struct mapped_device *md)
1817 md->queue->backing_dev_info.congested_data = md; 1799 md->queue->backing_dev_info.congested_data = md;
1818 blk_queue_make_request(md->queue, dm_request); 1800 blk_queue_make_request(md->queue, dm_request);
1819 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 1801 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
1820 md->queue->unplug_fn = dm_unplug_all;
1821 blk_queue_merge_bvec(md->queue, dm_merge_bvec); 1802 blk_queue_merge_bvec(md->queue, dm_merge_bvec);
1822 blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA); 1803 blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
1823} 1804}
@@ -2263,8 +2244,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2263 int r = 0; 2244 int r = 0;
2264 DECLARE_WAITQUEUE(wait, current); 2245 DECLARE_WAITQUEUE(wait, current);
2265 2246
2266 dm_unplug_all(md->queue);
2267
2268 add_wait_queue(&md->wait, &wait); 2247 add_wait_queue(&md->wait, &wait);
2269 2248
2270 while (1) { 2249 while (1) {
@@ -2539,7 +2518,6 @@ int dm_resume(struct mapped_device *md)
2539 2518
2540 clear_bit(DMF_SUSPENDED, &md->flags); 2519 clear_bit(DMF_SUSPENDED, &md->flags);
2541 2520
2542 dm_table_unplug_all(map);
2543 r = 0; 2521 r = 0;
2544out: 2522out:
2545 dm_table_put(map); 2523 dm_table_put(map);
@@ -2643,9 +2621,10 @@ int dm_noflush_suspending(struct dm_target *ti)
2643} 2621}
2644EXPORT_SYMBOL_GPL(dm_noflush_suspending); 2622EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2645 2623
2646struct dm_md_mempools *dm_alloc_md_mempools(unsigned type) 2624struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
2647{ 2625{
2648 struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL); 2626 struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
2627 unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS;
2649 2628
2650 if (!pools) 2629 if (!pools)
2651 return NULL; 2630 return NULL;
@@ -2662,13 +2641,18 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
2662 if (!pools->tio_pool) 2641 if (!pools->tio_pool)
2663 goto free_io_pool_and_out; 2642 goto free_io_pool_and_out;
2664 2643
2665 pools->bs = (type == DM_TYPE_BIO_BASED) ? 2644 pools->bs = bioset_create(pool_size, 0);
2666 bioset_create(16, 0) : bioset_create(MIN_IOS, 0);
2667 if (!pools->bs) 2645 if (!pools->bs)
2668 goto free_tio_pool_and_out; 2646 goto free_tio_pool_and_out;
2669 2647
2648 if (integrity && bioset_integrity_create(pools->bs, pool_size))
2649 goto free_bioset_and_out;
2650
2670 return pools; 2651 return pools;
2671 2652
2653free_bioset_and_out:
2654 bioset_free(pools->bs);
2655
2672free_tio_pool_and_out: 2656free_tio_pool_and_out:
2673 mempool_destroy(pools->tio_pool); 2657 mempool_destroy(pools->tio_pool);
2674 2658
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 0c2dd5f4af76..1aaf16746da8 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -149,7 +149,7 @@ void dm_kcopyd_exit(void);
149/* 149/*
150 * Mempool operations 150 * Mempool operations
151 */ 151 */
152struct dm_md_mempools *dm_alloc_md_mempools(unsigned type); 152struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity);
153void dm_free_md_mempools(struct dm_md_mempools *pools); 153void dm_free_md_mempools(struct dm_md_mempools *pools);
154 154
155#endif 155#endif
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 0ed7f6bc2a7f..abfb59a61ede 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -87,22 +87,6 @@ static int linear_mergeable_bvec(struct request_queue *q,
87 return maxsectors << 9; 87 return maxsectors << 9;
88} 88}
89 89
90static void linear_unplug(struct request_queue *q)
91{
92 mddev_t *mddev = q->queuedata;
93 linear_conf_t *conf;
94 int i;
95
96 rcu_read_lock();
97 conf = rcu_dereference(mddev->private);
98
99 for (i=0; i < mddev->raid_disks; i++) {
100 struct request_queue *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev);
101 blk_unplug(r_queue);
102 }
103 rcu_read_unlock();
104}
105
106static int linear_congested(void *data, int bits) 90static int linear_congested(void *data, int bits)
107{ 91{
108 mddev_t *mddev = data; 92 mddev_t *mddev = data;
@@ -224,11 +208,9 @@ static int linear_run (mddev_t *mddev)
224 md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); 208 md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
225 209
226 blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); 210 blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
227 mddev->queue->unplug_fn = linear_unplug;
228 mddev->queue->backing_dev_info.congested_fn = linear_congested; 211 mddev->queue->backing_dev_info.congested_fn = linear_congested;
229 mddev->queue->backing_dev_info.congested_data = mddev; 212 mddev->queue->backing_dev_info.congested_data = mddev;
230 md_integrity_register(mddev); 213 return md_integrity_register(mddev);
231 return 0;
232} 214}
233 215
234static void free_conf(struct rcu_head *head) 216static void free_conf(struct rcu_head *head)
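md_integrity_register() now creates the integrity pool and can fail, so personalities stop ignoring its result; linear above simply returns it from ->run(). The caller-side contract, sketched for an arbitrary personality (assumes the drivers/md/md.h context; my_run() is a hypothetical name):

#include "md.h"

static int my_run(mddev_t *mddev)
{
	/* ... array setup ... */
	return md_integrity_register(mddev);	/* propagate failure */
}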
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d5ad7723b172..06ecea751a39 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -780,8 +780,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
780 bio->bi_end_io = super_written; 780 bio->bi_end_io = super_written;
781 781
782 atomic_inc(&mddev->pending_writes); 782 atomic_inc(&mddev->pending_writes);
783 submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA, 783 submit_bio(REQ_WRITE | REQ_SYNC | REQ_FLUSH | REQ_FUA, bio);
784 bio);
785} 784}
786 785
787void md_super_wait(mddev_t *mddev) 786void md_super_wait(mddev_t *mddev)
@@ -809,7 +808,7 @@ int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
809 struct completion event; 808 struct completion event;
810 int ret; 809 int ret;
811 810
812 rw |= REQ_SYNC | REQ_UNPLUG; 811 rw |= REQ_SYNC;
813 812
814 bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? 813 bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
815 rdev->meta_bdev : rdev->bdev; 814 rdev->meta_bdev : rdev->bdev;
@@ -1804,8 +1803,12 @@ int md_integrity_register(mddev_t *mddev)
1804 mdname(mddev)); 1803 mdname(mddev));
1805 return -EINVAL; 1804 return -EINVAL;
1806 } 1805 }
1807 printk(KERN_NOTICE "md: data integrity on %s enabled\n", 1806 printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev));
1808 mdname(mddev)); 1807 if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
1808 printk(KERN_ERR "md: failed to create integrity pool for %s\n",
1809 mdname(mddev));
1810 return -EINVAL;
1811 }
1809 return 0; 1812 return 0;
1810} 1813}
1811EXPORT_SYMBOL(md_integrity_register); 1814EXPORT_SYMBOL(md_integrity_register);
@@ -4817,7 +4820,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4817 __md_stop_writes(mddev); 4820 __md_stop_writes(mddev);
4818 md_stop(mddev); 4821 md_stop(mddev);
4819 mddev->queue->merge_bvec_fn = NULL; 4822 mddev->queue->merge_bvec_fn = NULL;
4820 mddev->queue->unplug_fn = NULL;
4821 mddev->queue->backing_dev_info.congested_fn = NULL; 4823 mddev->queue->backing_dev_info.congested_fn = NULL;
4822 4824
4823 /* tell userspace to handle 'inactive' */ 4825 /* tell userspace to handle 'inactive' */
@@ -6692,8 +6694,6 @@ EXPORT_SYMBOL_GPL(md_allow_write);
6692 6694
6693void md_unplug(mddev_t *mddev) 6695void md_unplug(mddev_t *mddev)
6694{ 6696{
6695 if (mddev->queue)
6696 blk_unplug(mddev->queue);
6697 if (mddev->plug) 6697 if (mddev->plug)
6698 mddev->plug->unplug_fn(mddev->plug); 6698 mddev->plug->unplug_fn(mddev->plug);
6699} 6699}
@@ -6876,7 +6876,6 @@ void md_do_sync(mddev_t *mddev)
6876 >= mddev->resync_max - mddev->curr_resync_completed 6876 >= mddev->resync_max - mddev->curr_resync_completed
6877 )) { 6877 )) {
6878 /* time to update curr_resync_completed */ 6878 /* time to update curr_resync_completed */
6879 md_unplug(mddev);
6880 wait_event(mddev->recovery_wait, 6879 wait_event(mddev->recovery_wait,
6881 atomic_read(&mddev->recovery_active) == 0); 6880 atomic_read(&mddev->recovery_active) == 0);
6882 mddev->curr_resync_completed = j; 6881 mddev->curr_resync_completed = j;
@@ -6952,7 +6951,6 @@ void md_do_sync(mddev_t *mddev)
6952 * about not overloading the IO subsystem. (things like an 6951 * about not overloading the IO subsystem. (things like an
6953 * e2fsck being done on the RAID array should execute fast) 6952 * e2fsck being done on the RAID array should execute fast)
6954 */ 6953 */
6955 md_unplug(mddev);
6956 cond_resched(); 6954 cond_resched();
6957 6955
6958 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 6956 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
@@ -6971,8 +6969,6 @@ void md_do_sync(mddev_t *mddev)
6971 * this also signals 'finished resyncing' to md_stop 6969 * this also signals 'finished resyncing' to md_stop
6972 */ 6970 */
6973 out: 6971 out:
6974 md_unplug(mddev);
6975
6976 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 6972 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
6977 6973
6978 /* tell personality that we are finished */ 6974 /* tell personality that we are finished */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 3a62d440e27b..c35890990985 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -106,36 +106,6 @@ static void multipath_end_request(struct bio *bio, int error)
106 rdev_dec_pending(rdev, conf->mddev); 106 rdev_dec_pending(rdev, conf->mddev);
107} 107}
108 108
109static void unplug_slaves(mddev_t *mddev)
110{
111 multipath_conf_t *conf = mddev->private;
112 int i;
113
114 rcu_read_lock();
115 for (i=0; i<mddev->raid_disks; i++) {
116 mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
117 if (rdev && !test_bit(Faulty, &rdev->flags)
118 && atomic_read(&rdev->nr_pending)) {
119 struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
120
121 atomic_inc(&rdev->nr_pending);
122 rcu_read_unlock();
123
124 blk_unplug(r_queue);
125
126 rdev_dec_pending(rdev, mddev);
127 rcu_read_lock();
128 }
129 }
130 rcu_read_unlock();
131}
132
133static void multipath_unplug(struct request_queue *q)
134{
135 unplug_slaves(q->queuedata);
136}
137
138
139static int multipath_make_request(mddev_t *mddev, struct bio * bio) 109static int multipath_make_request(mddev_t *mddev, struct bio * bio)
140{ 110{
141 multipath_conf_t *conf = mddev->private; 111 multipath_conf_t *conf = mddev->private;
@@ -345,7 +315,7 @@ static int multipath_remove_disk(mddev_t *mddev, int number)
345 p->rdev = rdev; 315 p->rdev = rdev;
346 goto abort; 316 goto abort;
347 } 317 }
348 md_integrity_register(mddev); 318 err = md_integrity_register(mddev);
349 } 319 }
350abort: 320abort:
351 321
@@ -517,10 +487,12 @@ static int multipath_run (mddev_t *mddev)
517 */ 487 */
518 md_set_array_sectors(mddev, multipath_size(mddev, 0, 0)); 488 md_set_array_sectors(mddev, multipath_size(mddev, 0, 0));
519 489
520 mddev->queue->unplug_fn = multipath_unplug;
521 mddev->queue->backing_dev_info.congested_fn = multipath_congested; 490 mddev->queue->backing_dev_info.congested_fn = multipath_congested;
522 mddev->queue->backing_dev_info.congested_data = mddev; 491 mddev->queue->backing_dev_info.congested_data = mddev;
523 md_integrity_register(mddev); 492
493 if (md_integrity_register(mddev))
494 goto out_free_conf;
495
524 return 0; 496 return 0;
525 497
526out_free_conf: 498out_free_conf:
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index c0ac457f1218..e86bf3682e1e 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -25,21 +25,6 @@
25#include "raid0.h" 25#include "raid0.h"
26#include "raid5.h" 26#include "raid5.h"
27 27
28static void raid0_unplug(struct request_queue *q)
29{
30 mddev_t *mddev = q->queuedata;
31 raid0_conf_t *conf = mddev->private;
32 mdk_rdev_t **devlist = conf->devlist;
33 int raid_disks = conf->strip_zone[0].nb_dev;
34 int i;
35
36 for (i=0; i < raid_disks; i++) {
37 struct request_queue *r_queue = bdev_get_queue(devlist[i]->bdev);
38
39 blk_unplug(r_queue);
40 }
41}
42
43static int raid0_congested(void *data, int bits) 28static int raid0_congested(void *data, int bits)
44{ 29{
45 mddev_t *mddev = data; 30 mddev_t *mddev = data;
@@ -272,7 +257,6 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
272 mdname(mddev), 257 mdname(mddev),
273 (unsigned long long)smallest->sectors); 258 (unsigned long long)smallest->sectors);
274 } 259 }
275 mddev->queue->unplug_fn = raid0_unplug;
276 mddev->queue->backing_dev_info.congested_fn = raid0_congested; 260 mddev->queue->backing_dev_info.congested_fn = raid0_congested;
277 mddev->queue->backing_dev_info.congested_data = mddev; 261 mddev->queue->backing_dev_info.congested_data = mddev;
278 262
@@ -395,8 +379,7 @@ static int raid0_run(mddev_t *mddev)
395 379
396 blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); 380 blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
397 dump_zones(mddev); 381 dump_zones(mddev);
398 md_integrity_register(mddev); 382 return md_integrity_register(mddev);
399 return 0;
400} 383}
401 384
402static int raid0_stop(mddev_t *mddev) 385static int raid0_stop(mddev_t *mddev)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 06cd712807d0..c2a21ae56d97 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -52,23 +52,16 @@
52#define NR_RAID1_BIOS 256 52#define NR_RAID1_BIOS 256
53 53
54 54
55static void unplug_slaves(mddev_t *mddev);
56
57static void allow_barrier(conf_t *conf); 55static void allow_barrier(conf_t *conf);
58static void lower_barrier(conf_t *conf); 56static void lower_barrier(conf_t *conf);
59 57
60static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) 58static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
61{ 59{
62 struct pool_info *pi = data; 60 struct pool_info *pi = data;
63 r1bio_t *r1_bio;
64 int size = offsetof(r1bio_t, bios[pi->raid_disks]); 61 int size = offsetof(r1bio_t, bios[pi->raid_disks]);
65 62
66 /* allocate a r1bio with room for raid_disks entries in the bios array */ 63 /* allocate a r1bio with room for raid_disks entries in the bios array */
67 r1_bio = kzalloc(size, gfp_flags); 64 return kzalloc(size, gfp_flags);
68 if (!r1_bio && pi->mddev)
69 unplug_slaves(pi->mddev);
70
71 return r1_bio;
72} 65}
73 66
74static void r1bio_pool_free(void *r1_bio, void *data) 67static void r1bio_pool_free(void *r1_bio, void *data)
@@ -91,10 +84,8 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
91 int i, j; 84 int i, j;
92 85
93 r1_bio = r1bio_pool_alloc(gfp_flags, pi); 86 r1_bio = r1bio_pool_alloc(gfp_flags, pi);
94 if (!r1_bio) { 87 if (!r1_bio)
95 unplug_slaves(pi->mddev);
96 return NULL; 88 return NULL;
97 }
98 89
99 /* 90 /*
100 * Allocate bios : 1 for reading, n-1 for writing 91 * Allocate bios : 1 for reading, n-1 for writing
@@ -520,37 +511,6 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
520 return new_disk; 511 return new_disk;
521} 512}
522 513
523static void unplug_slaves(mddev_t *mddev)
524{
525 conf_t *conf = mddev->private;
526 int i;
527
528 rcu_read_lock();
529 for (i=0; i<mddev->raid_disks; i++) {
530 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
531 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
532 struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
533
534 atomic_inc(&rdev->nr_pending);
535 rcu_read_unlock();
536
537 blk_unplug(r_queue);
538
539 rdev_dec_pending(rdev, mddev);
540 rcu_read_lock();
541 }
542 }
543 rcu_read_unlock();
544}
545
546static void raid1_unplug(struct request_queue *q)
547{
548 mddev_t *mddev = q->queuedata;
549
550 unplug_slaves(mddev);
551 md_wakeup_thread(mddev->thread);
552}
553
554static int raid1_congested(void *data, int bits) 514static int raid1_congested(void *data, int bits)
555{ 515{
556 mddev_t *mddev = data; 516 mddev_t *mddev = data;
@@ -580,23 +540,16 @@ static int raid1_congested(void *data, int bits)
580} 540}
581 541
582 542
583static int flush_pending_writes(conf_t *conf) 543static void flush_pending_writes(conf_t *conf)
584{ 544{
585 /* Any writes that have been queued but are awaiting 545 /* Any writes that have been queued but are awaiting
586 * bitmap updates get flushed here. 546 * bitmap updates get flushed here.
587 * We return 1 if any requests were actually submitted.
588 */ 547 */
589 int rv = 0;
590
591 spin_lock_irq(&conf->device_lock); 548 spin_lock_irq(&conf->device_lock);
592 549
593 if (conf->pending_bio_list.head) { 550 if (conf->pending_bio_list.head) {
594 struct bio *bio; 551 struct bio *bio;
595 bio = bio_list_get(&conf->pending_bio_list); 552 bio = bio_list_get(&conf->pending_bio_list);
596 /* Only take the spinlock to quiet a warning */
597 spin_lock(conf->mddev->queue->queue_lock);
598 blk_remove_plug(conf->mddev->queue);
599 spin_unlock(conf->mddev->queue->queue_lock);
600 spin_unlock_irq(&conf->device_lock); 553 spin_unlock_irq(&conf->device_lock);
601 /* flush any pending bitmap writes to 554 /* flush any pending bitmap writes to
602 * disk before proceeding w/ I/O */ 555 * disk before proceeding w/ I/O */
@@ -608,10 +561,14 @@ static int flush_pending_writes(conf_t *conf)
608 generic_make_request(bio); 561 generic_make_request(bio);
609 bio = next; 562 bio = next;
610 } 563 }
611 rv = 1;
612 } else 564 } else
613 spin_unlock_irq(&conf->device_lock); 565 spin_unlock_irq(&conf->device_lock);
614 return rv; 566}
567
568static void md_kick_device(mddev_t *mddev)
569{
570 blk_flush_plug(current);
571 md_wakeup_thread(mddev->thread);
615} 572}
616 573
617/* Barriers.... 574/* Barriers....
@@ -643,8 +600,7 @@ static void raise_barrier(conf_t *conf)
643 600
644 /* Wait until no block IO is waiting */ 601 /* Wait until no block IO is waiting */
645 wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, 602 wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
646 conf->resync_lock, 603 conf->resync_lock, md_kick_device(conf->mddev));
647 raid1_unplug(conf->mddev->queue));
648 604
649 /* block any new IO from starting */ 605 /* block any new IO from starting */
650 conf->barrier++; 606 conf->barrier++;
@@ -652,8 +608,7 @@ static void raise_barrier(conf_t *conf)
652 /* Now wait for all pending IO to complete */ 608 /* Now wait for all pending IO to complete */
653 wait_event_lock_irq(conf->wait_barrier, 609 wait_event_lock_irq(conf->wait_barrier,
654 !conf->nr_pending && conf->barrier < RESYNC_DEPTH, 610 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
655 conf->resync_lock, 611 conf->resync_lock, md_kick_device(conf->mddev));
656 raid1_unplug(conf->mddev->queue));
657 612
658 spin_unlock_irq(&conf->resync_lock); 613 spin_unlock_irq(&conf->resync_lock);
659} 614}
@@ -675,7 +630,7 @@ static void wait_barrier(conf_t *conf)
675 conf->nr_waiting++; 630 conf->nr_waiting++;
676 wait_event_lock_irq(conf->wait_barrier, !conf->barrier, 631 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
677 conf->resync_lock, 632 conf->resync_lock,
678 raid1_unplug(conf->mddev->queue)); 633 md_kick_device(conf->mddev));
679 conf->nr_waiting--; 634 conf->nr_waiting--;
680 } 635 }
681 conf->nr_pending++; 636 conf->nr_pending++;
@@ -712,7 +667,7 @@ static void freeze_array(conf_t *conf)
712 conf->nr_pending == conf->nr_queued+1, 667 conf->nr_pending == conf->nr_queued+1,
713 conf->resync_lock, 668 conf->resync_lock,
714 ({ flush_pending_writes(conf); 669 ({ flush_pending_writes(conf);
715 raid1_unplug(conf->mddev->queue); })); 670 md_kick_device(conf->mddev); }));
716 spin_unlock_irq(&conf->resync_lock); 671 spin_unlock_irq(&conf->resync_lock);
717} 672}
718static void unfreeze_array(conf_t *conf) 673static void unfreeze_array(conf_t *conf)
@@ -962,7 +917,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
962 atomic_inc(&r1_bio->remaining); 917 atomic_inc(&r1_bio->remaining);
963 spin_lock_irqsave(&conf->device_lock, flags); 918 spin_lock_irqsave(&conf->device_lock, flags);
964 bio_list_add(&conf->pending_bio_list, mbio); 919 bio_list_add(&conf->pending_bio_list, mbio);
965 blk_plug_device_unlocked(mddev->queue);
966 spin_unlock_irqrestore(&conf->device_lock, flags); 920 spin_unlock_irqrestore(&conf->device_lock, flags);
967 } 921 }
968 r1_bio_write_done(r1_bio, bio->bi_vcnt, behind_pages, behind_pages != NULL); 922 r1_bio_write_done(r1_bio, bio->bi_vcnt, behind_pages, behind_pages != NULL);
@@ -971,7 +925,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
971 /* In case raid1d snuck in to freeze_array */ 925 /* In case raid1d snuck in to freeze_array */
972 wake_up(&conf->wait_barrier); 926 wake_up(&conf->wait_barrier);
973 927
974 if (do_sync) 928 if (do_sync || !bitmap)
975 md_wakeup_thread(mddev->thread); 929 md_wakeup_thread(mddev->thread);
976 930
977 return 0; 931 return 0;
@@ -1178,7 +1132,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
1178 p->rdev = rdev; 1132 p->rdev = rdev;
1179 goto abort; 1133 goto abort;
1180 } 1134 }
1181 md_integrity_register(mddev); 1135 err = md_integrity_register(mddev);
1182 } 1136 }
1183abort: 1137abort:
1184 1138
@@ -1561,7 +1515,6 @@ static void raid1d(mddev_t *mddev)
1561 unsigned long flags; 1515 unsigned long flags;
1562 conf_t *conf = mddev->private; 1516 conf_t *conf = mddev->private;
1563 struct list_head *head = &conf->retry_list; 1517 struct list_head *head = &conf->retry_list;
1564 int unplug=0;
1565 mdk_rdev_t *rdev; 1518 mdk_rdev_t *rdev;
1566 1519
1567 md_check_recovery(mddev); 1520 md_check_recovery(mddev);
@@ -1569,7 +1522,7 @@ static void raid1d(mddev_t *mddev)
1569 for (;;) { 1522 for (;;) {
1570 char b[BDEVNAME_SIZE]; 1523 char b[BDEVNAME_SIZE];
1571 1524
1572 unplug += flush_pending_writes(conf); 1525 flush_pending_writes(conf);
1573 1526
1574 spin_lock_irqsave(&conf->device_lock, flags); 1527 spin_lock_irqsave(&conf->device_lock, flags);
1575 if (list_empty(head)) { 1528 if (list_empty(head)) {
@@ -1583,10 +1536,9 @@ static void raid1d(mddev_t *mddev)
1583 1536
1584 mddev = r1_bio->mddev; 1537 mddev = r1_bio->mddev;
1585 conf = mddev->private; 1538 conf = mddev->private;
1586 if (test_bit(R1BIO_IsSync, &r1_bio->state)) { 1539 if (test_bit(R1BIO_IsSync, &r1_bio->state))
1587 sync_request_write(mddev, r1_bio); 1540 sync_request_write(mddev, r1_bio);
1588 unplug = 1; 1541 else {
1589 } else {
1590 int disk; 1542 int disk;
1591 1543
1592 /* we got a read error. Maybe the drive is bad. Maybe just 1544 /* we got a read error. Maybe the drive is bad. Maybe just
@@ -1636,14 +1588,11 @@ static void raid1d(mddev_t *mddev)
1636 bio->bi_end_io = raid1_end_read_request; 1588 bio->bi_end_io = raid1_end_read_request;
1637 bio->bi_rw = READ | do_sync; 1589 bio->bi_rw = READ | do_sync;
1638 bio->bi_private = r1_bio; 1590 bio->bi_private = r1_bio;
1639 unplug = 1;
1640 generic_make_request(bio); 1591 generic_make_request(bio);
1641 } 1592 }
1642 } 1593 }
1643 cond_resched(); 1594 cond_resched();
1644 } 1595 }
1645 if (unplug)
1646 unplug_slaves(mddev);
1647} 1596}
1648 1597
1649 1598
@@ -2066,11 +2015,9 @@ static int run(mddev_t *mddev)
 
 	md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
 
-	mddev->queue->unplug_fn = raid1_unplug;
 	mddev->queue->backing_dev_info.congested_fn = raid1_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
-	md_integrity_register(mddev);
-	return 0;
+	return md_integrity_register(mddev);
 }
 
 static int stop(mddev_t *mddev)
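
The raid1 side of this conversion mirrors raid10 below: the per-queue raid1_unplug() callback is gone, and the barrier waits instead run md_kick_device(), which flushes the calling task's on-stack plug and wakes the md thread. For orientation, a simplified sketch of wait_event_lock_irq() (defined in drivers/md/md.h in this era; the real macro also does the waitqueue add/remove bookkeeping) shows where that 'cmd' hook executes:

	/* Simplified sketch, not the real macro: the lock is dropped and
	 * 'cmd' runs -- now md_kick_device(conf->mddev) -- before sleeping,
	 * so bios queued by the sleeping task cannot deadlock the barrier. */
	#define wait_event_lock_irq(wq, condition, lock, cmd)	\
	do {							\
		while (!(condition)) {				\
			spin_unlock_irq(&lock);			\
			cmd;					\
			schedule();				\
			spin_lock_irq(&lock);			\
		}						\
	} while (0)
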
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 747d061d8e05..f7b62370b374 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -57,23 +57,16 @@
  */
 #define NR_RAID10_BIOS 256
 
-static void unplug_slaves(mddev_t *mddev);
-
 static void allow_barrier(conf_t *conf);
 static void lower_barrier(conf_t *conf);
 
 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	conf_t *conf = data;
-	r10bio_t *r10_bio;
 	int size = offsetof(struct r10bio_s, devs[conf->copies]);
 
 	/* allocate a r10bio with room for raid_disks entries in the bios array */
-	r10_bio = kzalloc(size, gfp_flags);
-	if (!r10_bio && conf->mddev)
-		unplug_slaves(conf->mddev);
-
-	return r10_bio;
+	return kzalloc(size, gfp_flags);
 }
 
 static void r10bio_pool_free(void *r10_bio, void *data)
@@ -106,10 +99,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 	int nalloc;
 
 	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
-	if (!r10_bio) {
-		unplug_slaves(conf->mddev);
+	if (!r10_bio)
 		return NULL;
-	}
 
 	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
 		nalloc = conf->copies; /* resync */
@@ -597,37 +588,6 @@ rb_out:
 	return disk;
 }
 
-static void unplug_slaves(mddev_t *mddev)
-{
-	conf_t *conf = mddev->private;
-	int i;
-
-	rcu_read_lock();
-	for (i=0; i < conf->raid_disks; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
-		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
-			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
-			atomic_inc(&rdev->nr_pending);
-			rcu_read_unlock();
-
-			blk_unplug(r_queue);
-
-			rdev_dec_pending(rdev, mddev);
-			rcu_read_lock();
-		}
-	}
-	rcu_read_unlock();
-}
-
-static void raid10_unplug(struct request_queue *q)
-{
-	mddev_t *mddev = q->queuedata;
-
-	unplug_slaves(q->queuedata);
-	md_wakeup_thread(mddev->thread);
-}
-
 static int raid10_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
@@ -649,23 +609,16 @@ static int raid10_congested(void *data, int bits)
 	return ret;
 }
 
-static int flush_pending_writes(conf_t *conf)
+static void flush_pending_writes(conf_t *conf)
 {
 	/* Any writes that have been queued but are awaiting
 	 * bitmap updates get flushed here.
-	 * We return 1 if any requests were actually submitted.
 	 */
-	int rv = 0;
-
 	spin_lock_irq(&conf->device_lock);
 
 	if (conf->pending_bio_list.head) {
 		struct bio *bio;
 		bio = bio_list_get(&conf->pending_bio_list);
-		/* Spinlock only taken to quiet a warning */
-		spin_lock(conf->mddev->queue->queue_lock);
-		blk_remove_plug(conf->mddev->queue);
-		spin_unlock(conf->mddev->queue->queue_lock);
 		spin_unlock_irq(&conf->device_lock);
 		/* flush any pending bitmap writes to disk
 		 * before proceeding w/ I/O */
@@ -677,11 +630,16 @@ static int flush_pending_writes(conf_t *conf)
 			generic_make_request(bio);
 			bio = next;
 		}
-		rv = 1;
 	} else
 		spin_unlock_irq(&conf->device_lock);
-	return rv;
 }
+
+static void md_kick_device(mddev_t *mddev)
+{
+	blk_flush_plug(current);
+	md_wakeup_thread(mddev->thread);
+}
+
 /* Barriers....
  * Sometimes we need to suspend IO while we do something else,
  * either some resync/recovery, or reconfigure the array.
@@ -711,8 +669,7 @@ static void raise_barrier(conf_t *conf, int force)
 
 	/* Wait until no block IO is waiting (unless 'force') */
 	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
-			    conf->resync_lock,
-			    raid10_unplug(conf->mddev->queue));
+			    conf->resync_lock, md_kick_device(conf->mddev));
 
 	/* block any new IO from starting */
 	conf->barrier++;
@@ -720,8 +677,7 @@ static void raise_barrier(conf_t *conf, int force)
 	/* Now wait for all pending IO to complete */
 	wait_event_lock_irq(conf->wait_barrier,
 			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
-			    conf->resync_lock,
-			    raid10_unplug(conf->mddev->queue));
+			    conf->resync_lock, md_kick_device(conf->mddev));
 
 	spin_unlock_irq(&conf->resync_lock);
 }
@@ -742,7 +698,7 @@ static void wait_barrier(conf_t *conf)
 		conf->nr_waiting++;
 		wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
 				    conf->resync_lock,
-				    raid10_unplug(conf->mddev->queue));
+				    md_kick_device(conf->mddev));
 		conf->nr_waiting--;
 	}
 	conf->nr_pending++;
@@ -779,7 +735,7 @@ static void freeze_array(conf_t *conf)
 			    conf->nr_pending == conf->nr_queued+1,
 			    conf->resync_lock,
 			    ({ flush_pending_writes(conf);
-			       raid10_unplug(conf->mddev->queue); }));
+			       md_kick_device(conf->mddev); }));
 	spin_unlock_irq(&conf->resync_lock);
 }
 
@@ -974,7 +930,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		atomic_inc(&r10_bio->remaining);
 		spin_lock_irqsave(&conf->device_lock, flags);
 		bio_list_add(&conf->pending_bio_list, mbio);
-		blk_plug_device_unlocked(mddev->queue);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 
@@ -991,7 +946,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	/* In case raid10d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
 
-	if (do_sync)
+	if (do_sync || !mddev->bitmap)
 		md_wakeup_thread(mddev->thread);
 
 	return 0;
@@ -1233,7 +1188,7 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
 			p->rdev = rdev;
 			goto abort;
 		}
-		md_integrity_register(mddev);
+		err = md_integrity_register(mddev);
 	}
 abort:
 
@@ -1684,7 +1639,6 @@ static void raid10d(mddev_t *mddev)
 	unsigned long flags;
 	conf_t *conf = mddev->private;
 	struct list_head *head = &conf->retry_list;
-	int unplug=0;
 	mdk_rdev_t *rdev;
 
 	md_check_recovery(mddev);
@@ -1692,7 +1646,7 @@ static void raid10d(mddev_t *mddev)
 	for (;;) {
 		char b[BDEVNAME_SIZE];
 
-		unplug += flush_pending_writes(conf);
+		flush_pending_writes(conf);
 
 		spin_lock_irqsave(&conf->device_lock, flags);
 		if (list_empty(head)) {
@@ -1706,13 +1660,11 @@ static void raid10d(mddev_t *mddev)
 
 		mddev = r10_bio->mddev;
 		conf = mddev->private;
-		if (test_bit(R10BIO_IsSync, &r10_bio->state)) {
+		if (test_bit(R10BIO_IsSync, &r10_bio->state))
 			sync_request_write(mddev, r10_bio);
-			unplug = 1;
-		} else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) {
+		else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
 			recovery_request_write(mddev, r10_bio);
-			unplug = 1;
-		} else {
+		else {
 			int mirror;
 			/* we got a read error. Maybe the drive is bad. Maybe just
 			 * the block and we can fix it.
@@ -1759,14 +1711,11 @@ static void raid10d(mddev_t *mddev)
 			bio->bi_rw = READ | do_sync;
 			bio->bi_private = r10_bio;
 			bio->bi_end_io = raid10_end_read_request;
-			unplug = 1;
 			generic_make_request(bio);
 			}
 		}
 		cond_resched();
 	}
-	if (unplug)
-		unplug_slaves(mddev);
 }
 
 
@@ -2377,7 +2326,6 @@ static int run(mddev_t *mddev)
 	md_set_array_sectors(mddev, size);
 	mddev->resync_max_sectors = size;
 
-	mddev->queue->unplug_fn = raid10_unplug;
 	mddev->queue->backing_dev_info.congested_fn = raid10_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
 
@@ -2395,7 +2343,10 @@ static int run(mddev_t *mddev)
 
 	if (conf->near_copies < conf->raid_disks)
 		blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
-	md_integrity_register(mddev);
+
+	if (md_integrity_register(mddev))
+		goto out_free_conf;
+
 	return 0;
 
 out_free_conf:
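
With the plug state gone, flush_pending_writes() can no longer report whether it submitted anything, so it becomes void and the raid10d caller stops accumulating an 'unplug' count. Condensed from the hunks above (the middle of the function sits between hunks, so the bitmap_unplug() call and the loop header are filled in from the surrounding context lines):

	static void flush_pending_writes(conf_t *conf)
	{
		spin_lock_irq(&conf->device_lock);
		if (conf->pending_bio_list.head) {
			struct bio *bio = bio_list_get(&conf->pending_bio_list);
			spin_unlock_irq(&conf->device_lock);
			/* flush any pending bitmap writes to disk
			 * before proceeding w/ I/O */
			bitmap_unplug(conf->mddev->bitmap);
			while (bio) {	/* submit pending writes */
				struct bio *next = bio->bi_next;
				bio->bi_next = NULL;
				generic_make_request(bio);
				bio = next;
			}
		} else
			spin_unlock_irq(&conf->device_lock);
	}
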
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 78536fdbd87f..e867ee42b152 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -433,8 +433,6 @@ static int has_failed(raid5_conf_t *conf)
 	return 0;
 }
 
-static void unplug_slaves(mddev_t *mddev);
-
 static struct stripe_head *
 get_active_stripe(raid5_conf_t *conf, sector_t sector,
 		  int previous, int noblock, int noquiesce)
@@ -463,8 +461,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector,
 				     < (conf->max_nr_stripes *3/4)
 				    || !conf->inactive_blocked),
 				    conf->device_lock,
-				    md_raid5_unplug_device(conf)
-			);
+				    md_raid5_kick_device(conf));
 			conf->inactive_blocked = 0;
 		} else
 			init_stripe(sh, sector, previous);
@@ -1473,8 +1470,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 		wait_event_lock_irq(conf->wait_for_stripe,
 				    !list_empty(&conf->inactive_list),
 				    conf->device_lock,
-				    unplug_slaves(conf->mddev)
-			);
+				    blk_flush_plug(current));
 		osh = get_free_stripe(conf);
 		spin_unlock_irq(&conf->device_lock);
 		atomic_set(&nsh->count, 1);
@@ -3645,58 +3641,19 @@ static void activate_bit_delay(raid5_conf_t *conf)
 	}
 }
 
-static void unplug_slaves(mddev_t *mddev)
+void md_raid5_kick_device(raid5_conf_t *conf)
 {
-	raid5_conf_t *conf = mddev->private;
-	int i;
-	int devs = max(conf->raid_disks, conf->previous_raid_disks);
-
-	rcu_read_lock();
-	for (i = 0; i < devs; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
-		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
-			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
-			atomic_inc(&rdev->nr_pending);
-			rcu_read_unlock();
-
-			blk_unplug(r_queue);
-
-			rdev_dec_pending(rdev, mddev);
-			rcu_read_lock();
-		}
-	}
-	rcu_read_unlock();
-}
-
-void md_raid5_unplug_device(raid5_conf_t *conf)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&conf->device_lock, flags);
-
-	if (plugger_remove_plug(&conf->plug)) {
-		conf->seq_flush++;
-		raid5_activate_delayed(conf);
-	}
+	blk_flush_plug(current);
+	raid5_activate_delayed(conf);
 	md_wakeup_thread(conf->mddev->thread);
-
-	spin_unlock_irqrestore(&conf->device_lock, flags);
-
-	unplug_slaves(conf->mddev);
 }
-EXPORT_SYMBOL_GPL(md_raid5_unplug_device);
+EXPORT_SYMBOL_GPL(md_raid5_kick_device);
 
 static void raid5_unplug(struct plug_handle *plug)
 {
 	raid5_conf_t *conf = container_of(plug, raid5_conf_t, plug);
-	md_raid5_unplug_device(conf);
-}
 
-static void raid5_unplug_queue(struct request_queue *q)
-{
-	mddev_t *mddev = q->queuedata;
-	md_raid5_unplug_device(mddev->private);
+	md_raid5_kick_device(conf);
 }
 
 int md_raid5_congested(mddev_t *mddev, int bits)
@@ -4100,7 +4057,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 			 * add failed due to overlap. Flush everything
 			 * and wait a while
 			 */
-			md_raid5_unplug_device(conf);
+			md_raid5_kick_device(conf);
 			release_stripe(sh);
 			schedule();
 			goto retry;
@@ -4365,7 +4322,6 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 
 	if (sector_nr >= max_sector) {
 		/* just being told to finish up .. nothing much to do */
-		unplug_slaves(mddev);
 
 		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
 			end_reshape(conf);
@@ -4569,7 +4525,6 @@ static void raid5d(mddev_t *mddev)
 	spin_unlock_irq(&conf->device_lock);
 
 	async_tx_issue_pending_all();
-	unplug_slaves(mddev);
 
 	pr_debug("--- raid5d inactive\n");
 }
@@ -5204,7 +5159,7 @@ static int run(mddev_t *mddev)
 
 	mddev->queue->backing_dev_info.congested_data = mddev;
 	mddev->queue->backing_dev_info.congested_fn = raid5_congested;
-	mddev->queue->unplug_fn = raid5_unplug_queue;
+	mddev->queue->queue_lock = &conf->device_lock;
 
 	chunk_size = mddev->chunk_sectors << 9;
 	blk_queue_io_min(mddev->queue, chunk_size);
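
raid5's private plugging machinery (plugger_remove_plug(), the seq_flush counter, per-rdev blk_unplug() walks) collapses into one helper; restated from the hunks above with comments:

	void md_raid5_kick_device(raid5_conf_t *conf)
	{
		blk_flush_plug(current);	/* submit our own queued bios */
		raid5_activate_delayed(conf);	/* promote delayed stripes */
		md_wakeup_thread(conf->mddev->thread); /* raid5d does the work */
	}
	EXPORT_SYMBOL_GPL(md_raid5_kick_device);

Note also that run() now points mddev->queue->queue_lock at conf->device_lock instead of installing an unplug_fn.
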
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 2ace0582b409..8d563a4f022a 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -503,6 +503,6 @@ static inline int algorithm_is_DDF(int layout)
 }
 
 extern int md_raid5_congested(mddev_t *mddev, int bits);
-extern void md_raid5_unplug_device(raid5_conf_t *conf);
+extern void md_raid5_kick_device(raid5_conf_t *conf);
 extern int raid5_set_cache_size(mddev_t *mddev, int size);
 #endif
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index ae7cad185898..47ec5bc0ed21 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -695,20 +695,22 @@ static int i2o_block_ioctl(struct block_device *bdev, fmode_t mode,
 };
 
 /**
- * i2o_block_media_changed - Have we seen a media change?
+ * i2o_block_check_events - Have we seen a media change?
  * @disk: gendisk which should be verified
+ * @clearing: events being cleared
  *
  * Verifies if the media has changed.
  *
  * Returns 1 if the media was changed or 0 otherwise.
  */
-static int i2o_block_media_changed(struct gendisk *disk)
+static unsigned int i2o_block_check_events(struct gendisk *disk,
+					   unsigned int clearing)
 {
 	struct i2o_block_device *p = disk->private_data;
 
 	if (p->media_change_flag) {
 		p->media_change_flag = 0;
-		return 1;
+		return DISK_EVENT_MEDIA_CHANGE;
 	}
 	return 0;
 }
@@ -895,11 +897,7 @@ static void i2o_block_request_fn(struct request_queue *q)
 {
 	struct request *req;
 
-	while (!blk_queue_plugged(q)) {
-		req = blk_peek_request(q);
-		if (!req)
-			break;
-
+	while ((req = blk_peek_request(q)) != NULL) {
 		if (req->cmd_type == REQ_TYPE_FS) {
 			struct i2o_block_delayed_request *dreq;
 			struct i2o_block_request *ireq = req->special;
@@ -950,7 +948,7 @@ static const struct block_device_operations i2o_block_fops = {
 	.ioctl = i2o_block_ioctl,
 	.compat_ioctl = i2o_block_ioctl,
 	.getgeo = i2o_block_getgeo,
-	.media_changed = i2o_block_media_changed
+	.check_events = i2o_block_check_events,
 };
 
 /**
@@ -1002,6 +1000,7 @@ static struct i2o_block_device *i2o_block_device_alloc(void)
 	gd->major = I2O_MAJOR;
 	gd->queue = queue;
 	gd->fops = &i2o_block_fops;
+	gd->events = DISK_EVENT_MEDIA_CHANGE;
 	gd->private_data = dev;
 
 	dev->gd = gd;
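
The same three-step conversion repeats in the driver hunks below (s390 tape, hv blkvsc, westbridge): return an event mask instead of 0/1, accept the new 'clearing' argument, and set gendisk->events so the block core actually polls for the event. In outline, with 'foo' as a placeholder rather than a real driver:

	static unsigned int foo_check_events(struct gendisk *disk,
					     unsigned int clearing)
	{
		struct foo_device *p = disk->private_data;

		if (p->media_change_flag) {
			p->media_change_flag = 0;
			return DISK_EVENT_MEDIA_CHANGE;	/* was "return 1" */
		}
		return 0;
	}

	/* at probe time: */
	gd->fops   = &foo_fops;	/* .check_events = foo_check_events */
	gd->events = DISK_EVENT_MEDIA_CHANGE;
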
diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c
index 4e42d030e097..2ae727568df9 100644
--- a/drivers/mmc/card/queue.c
+++ b/drivers/mmc/card/queue.c
@@ -55,8 +55,7 @@ static int mmc_queue_thread(void *d)
 
 		spin_lock_irq(q->queue_lock);
 		set_current_state(TASK_INTERRUPTIBLE);
-		if (!blk_queue_plugged(q))
-			req = blk_fetch_request(q);
+		req = blk_fetch_request(q);
 		mq->req = req;
 		spin_unlock_irq(q->queue_lock);
 
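
blk_queue_plugged() no longer exists, so every fetch loop loses its extra guard. The common rewrite, here and in the s390 and scsi_transport hunks below, is simply:

	/* old: while (!blk_queue_plugged(q)) { req = blk_fetch_request(q); ... }
	 * new: just drain the queue until empty, queue_lock held as before. */
	while ((req = blk_fetch_request(q)) != NULL) {
		/* hand req to the hardware or a worker thread */
	}
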
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 794bfd962266..4d2df2f76ea0 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -1917,7 +1917,7 @@ static void __dasd_process_request_queue(struct dasd_block *block)
 		return;
 	}
 	/* Now we try to fetch requests from the request queue */
-	while (!blk_queue_plugged(queue) && (req = blk_peek_request(queue))) {
+	while ((req = blk_peek_request(queue))) {
 		if (basedev->features & DASD_FEATURE_READONLY &&
 		    rq_data_dir(req) == WRITE) {
 			DBF_DEV_EVENT(DBF_ERR, basedev,
diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c
index 55d2d0f4eabc..83cea9a55e2f 100644
--- a/drivers/s390/char/tape_block.c
+++ b/drivers/s390/char/tape_block.c
@@ -48,14 +48,14 @@
 static DEFINE_MUTEX(tape_block_mutex);
 static int tapeblock_open(struct block_device *, fmode_t);
 static int tapeblock_release(struct gendisk *, fmode_t);
-static int tapeblock_medium_changed(struct gendisk *);
+static unsigned int tapeblock_check_events(struct gendisk *, unsigned int);
 static int tapeblock_revalidate_disk(struct gendisk *);
 
 static const struct block_device_operations tapeblock_fops = {
 	.owner		 = THIS_MODULE,
 	.open		 = tapeblock_open,
 	.release	 = tapeblock_release,
-	.media_changed	 = tapeblock_medium_changed,
+	.check_events	 = tapeblock_check_events,
 	.revalidate_disk = tapeblock_revalidate_disk,
 };
 
@@ -161,7 +161,6 @@ tapeblock_requeue(struct work_struct *work) {
 
 	spin_lock_irq(&device->blk_data.request_queue_lock);
 	while (
-		!blk_queue_plugged(queue) &&
 		blk_peek_request(queue) &&
 		nr_queued < TAPEBLOCK_MIN_REQUEUE
 	) {
@@ -237,6 +236,7 @@ tapeblock_setup_device(struct tape_device * device)
 	disk->major = tapeblock_major;
 	disk->first_minor = device->first_minor;
 	disk->fops = &tapeblock_fops;
+	disk->events = DISK_EVENT_MEDIA_CHANGE;
 	disk->private_data = tape_get_device(device);
 	disk->queue = blkdat->request_queue;
 	set_capacity(disk, 0);
@@ -340,8 +340,8 @@ tapeblock_revalidate_disk(struct gendisk *disk)
 	return 0;
 }
 
-static int
-tapeblock_medium_changed(struct gendisk *disk)
+static unsigned int
+tapeblock_check_events(struct gendisk *disk, unsigned int clearing)
 {
 	struct tape_device *device;
 
@@ -349,7 +349,7 @@ tapeblock_medium_changed(struct gendisk *disk)
349 DBF_LH(6, "tapeblock_medium_changed(%p) = %d\n", 349 DBF_LH(6, "tapeblock_medium_changed(%p) = %d\n",
350 device, device->blk_data.medium_changed); 350 device, device->blk_data.medium_changed);
351 351
352 return device->blk_data.medium_changed; 352 return device->blk_data.medium_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
353} 353}
354 354
355/* 355/*
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 2d63c8ad1442..6d5c7ff43f5b 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -67,6 +67,13 @@ static struct scsi_host_sg_pool scsi_sg_pools[] = {
 
 struct kmem_cache *scsi_sdb_cache;
 
+/*
+ * When to reinvoke queueing after a resource shortage. It's 3 msecs to
+ * not change behaviour from the previous unplug mechanism, experimentation
+ * may prove this needs changing.
+ */
+#define SCSI_QUEUE_DELAY	3
+
 static void scsi_run_queue(struct request_queue *q);
 
 /*
@@ -149,14 +156,7 @@ static int __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, int unbusy)
 	/*
 	 * Requeue this command. It will go before all other commands
 	 * that are already in the queue.
-	 *
-	 * NOTE: there is magic here about the way the queue is plugged if
-	 * we have no outstanding commands.
-	 *
-	 * Although we *don't* plug the queue, we call the request
-	 * function. The SCSI request function detects the blocked condition
-	 * and plugs the queue appropriately.
-	 */
+	 */
 	spin_lock_irqsave(q->queue_lock, flags);
 	blk_requeue_request(q, cmd->request);
 	spin_unlock_irqrestore(q->queue_lock, flags);
@@ -1226,11 +1226,11 @@ int scsi_prep_return(struct request_queue *q, struct request *req, int ret)
 	case BLKPREP_DEFER:
 		/*
 		 * If we defer, the blk_peek_request() returns NULL, but the
-		 * queue must be restarted, so we plug here if no returning
-		 * command will automatically do that.
+		 * queue must be restarted, so we schedule a callback to happen
+		 * shortly.
 		 */
 		if (sdev->device_busy == 0)
-			blk_plug_device(q);
+			blk_delay_queue(q, SCSI_QUEUE_DELAY);
 		break;
 	default:
 		req->cmd_flags |= REQ_DONTPREP;
@@ -1269,7 +1269,7 @@ static inline int scsi_dev_queue_ready(struct request_queue *q,
 				   sdev_printk(KERN_INFO, sdev,
 				   "unblocking device at zero depth\n"));
 		} else {
-			blk_plug_device(q);
+			blk_delay_queue(q, SCSI_QUEUE_DELAY);
 			return 0;
 		}
 	}
@@ -1499,7 +1499,7 @@ static void scsi_request_fn(struct request_queue *q)
 	 * the host is no longer able to accept any more requests.
 	 */
 	shost = sdev->host;
-	while (!blk_queue_plugged(q)) {
+	for (;;) {
 		int rtn;
 		/*
 		 * get next queueable request. We do this early to make sure
@@ -1578,15 +1578,8 @@ static void scsi_request_fn(struct request_queue *q)
 		 */
 		rtn = scsi_dispatch_cmd(cmd);
 		spin_lock_irq(q->queue_lock);
-		if(rtn) {
-			/* we're refusing the command; because of
-			 * the way locks get dropped, we need to
-			 * check here if plugging is required */
-			if(sdev->device_busy == 0)
-				blk_plug_device(q);
-
-			break;
-		}
+		if (rtn)
+			goto out_delay;
 	}
 
 	goto out;
@@ -1605,9 +1598,10 @@ static void scsi_request_fn(struct request_queue *q)
 	spin_lock_irq(q->queue_lock);
 	blk_requeue_request(q, req);
 	sdev->device_busy--;
-	if(sdev->device_busy == 0)
-		blk_plug_device(q);
- out:
+out_delay:
+	if (sdev->device_busy == 0)
+		blk_delay_queue(q, SCSI_QUEUE_DELAY);
+out:
 	/* must be careful here...if we trigger the ->remove() function
 	 * we cannot be holding the q lock */
 	spin_unlock_irq(q->queue_lock);
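
Everywhere scsi_lib.c used to plug the queue to get a deferred restart, it now asks the block layer for a timed re-run; the SCSI_QUEUE_DELAY constant added above keeps the old 3 ms behaviour. The recurring idiom:

	/* Resource shortage and no command in flight whose completion
	 * would restart the queue: re-run the request_fn in 3 msecs. */
	if (sdev->device_busy == 0)
		blk_delay_queue(q, SCSI_QUEUE_DELAY);

The new out_delay label in scsi_request_fn() funnels the dispatch-refused case into this same path.
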
diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 5c3ccfc6b622..2941d2d92c94 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -3913,7 +3913,7 @@ fc_bsg_request_handler(struct request_queue *q, struct Scsi_Host *shost,
 	if (!get_device(dev))
 		return;
 
-	while (!blk_queue_plugged(q)) {
+	while (1) {
 		if (rport && (rport->port_state == FC_PORTSTATE_BLOCKED) &&
 		    !(rport->flags & FC_RPORT_FAST_FAIL_TIMEDOUT))
 			break;
diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c
index 927e99cb7225..c6fcf76cade5 100644
--- a/drivers/scsi/scsi_transport_sas.c
+++ b/drivers/scsi/scsi_transport_sas.c
@@ -173,11 +173,7 @@ static void sas_smp_request(struct request_queue *q, struct Scsi_Host *shost,
 	int ret;
 	int (*handler)(struct Scsi_Host *, struct sas_rphy *, struct request *);
 
-	while (!blk_queue_plugged(q)) {
-		req = blk_fetch_request(q);
-		if (!req)
-			break;
-
+	while ((req = blk_fetch_request(q)) != NULL) {
 		spin_unlock_irq(q->queue_lock);
 
 		handler = to_sas_internal(shost->transportt)->f->smp_handler;
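
sas_smp_request() gets the same loop rewrite with one wrinkle: the handler may sleep, so queue_lock is dropped inside the loop. Sketch (the completion details fall outside the hunk and are paraphrased):

	while ((req = blk_fetch_request(q)) != NULL) {
		spin_unlock_irq(q->queue_lock);		/* handler may sleep */

		handler = to_sas_internal(shost->transportt)->f->smp_handler;
		ret = handler(shost, rphy, req);
		/* ...record ret and complete the request... */

		spin_lock_irq(q->queue_lock);		/* retake before next fetch */
	}
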
diff --git a/drivers/staging/hv/blkvsc_drv.c b/drivers/staging/hv/blkvsc_drv.c
index 6e02f1b0c46f..af789937be4e 100644
--- a/drivers/staging/hv/blkvsc_drv.c
+++ b/drivers/staging/hv/blkvsc_drv.c
@@ -124,7 +124,8 @@ static void blkvsc_shutdown(struct device *device);
 
 static int blkvsc_open(struct block_device *bdev, fmode_t mode);
 static int blkvsc_release(struct gendisk *disk, fmode_t mode);
-static int blkvsc_media_changed(struct gendisk *gd);
+static unsigned int blkvsc_check_events(struct gendisk *gd,
+					unsigned int clearing);
 static int blkvsc_revalidate_disk(struct gendisk *gd);
 static int blkvsc_getgeo(struct block_device *bd, struct hd_geometry *hg);
 static int blkvsc_ioctl(struct block_device *bd, fmode_t mode,
@@ -155,7 +156,7 @@ static const struct block_device_operations block_ops = {
 	.owner = THIS_MODULE,
 	.open = blkvsc_open,
 	.release = blkvsc_release,
-	.media_changed = blkvsc_media_changed,
+	.check_events = blkvsc_check_events,
 	.revalidate_disk = blkvsc_revalidate_disk,
 	.getgeo = blkvsc_getgeo,
 	.ioctl = blkvsc_ioctl,
@@ -357,6 +358,7 @@ static int blkvsc_probe(struct device *device)
 	else
 		blkdev->gd->first_minor = 0;
 	blkdev->gd->fops = &block_ops;
+	blkdev->gd->events = DISK_EVENT_MEDIA_CHANGE;
 	blkdev->gd->private_data = blkdev;
 	blkdev->gd->driverfs_dev = &(blkdev->device_ctx->device);
 	sprintf(blkdev->gd->disk_name, "hd%c", 'a' + devnum);
@@ -1337,10 +1339,11 @@ static int blkvsc_release(struct gendisk *disk, fmode_t mode)
 	return 0;
 }
 
-static int blkvsc_media_changed(struct gendisk *gd)
+static unsigned int blkvsc_check_events(struct gendisk *gd,
+					unsigned int clearing)
 {
 	DPRINT_DBG(BLKVSC_DRV, "- enter\n");
-	return 1;
+	return DISK_EVENT_MEDIA_CHANGE;
 }
 
 static int blkvsc_revalidate_disk(struct gendisk *gd)
diff --git a/drivers/staging/westbridge/astoria/block/cyasblkdev_block.c b/drivers/staging/westbridge/astoria/block/cyasblkdev_block.c
index e1851f00be56..842cd9214a5e 100644
--- a/drivers/staging/westbridge/astoria/block/cyasblkdev_block.c
+++ b/drivers/staging/westbridge/astoria/block/cyasblkdev_block.c
@@ -381,10 +381,10 @@ static int cyasblkdev_blk_ioctl(
 	return -ENOTTY;
 }
 
-/* Media_changed block_device opp
+/* check_events block_device opp
  * this one is called by kernel to confirm if the media really changed
  * as we indicated by issuing check_disk_change() call */
-int cyasblkdev_media_changed(struct gendisk *gd)
+unsigned int cyasblkdev_check_events(struct gendisk *gd, unsigned int clearing)
 {
 	struct cyasblkdev_blk_data *bd;
 
@@ -402,7 +402,7 @@ int cyasblkdev_media_changed(struct gendisk *gd)
 #endif
 	}
 
-	/* return media change state "1" yes, 0 no */
+	/* return media change state - DISK_EVENT_MEDIA_CHANGE yes, 0 no */
 	return 0;
 }
 
@@ -432,7 +432,7 @@ static struct block_device_operations cyasblkdev_bdops = {
 	.ioctl = cyasblkdev_blk_ioctl,
 	/* .getgeo = cyasblkdev_blk_getgeo, */
 	/* added to support media removal( real and simulated) media */
-	.media_changed = cyasblkdev_media_changed,
+	.check_events = cyasblkdev_check_events,
 	/* added to support media removal( real and simulated) media */
 	.revalidate_disk = cyasblkdev_revalidate_disk,
 	.owner = THIS_MODULE,
@@ -1090,6 +1090,7 @@ static int cyasblkdev_add_disks(int bus_num,
 		bd->user_disk_0->first_minor = devidx << CYASBLKDEV_SHIFT;
 		bd->user_disk_0->minors = 8;
 		bd->user_disk_0->fops = &cyasblkdev_bdops;
+		bd->user_disk_0->events = DISK_EVENT_MEDIA_CHANGE;
 		bd->user_disk_0->private_data = bd;
 		bd->user_disk_0->queue = bd->queue.queue;
 		bd->dbgprn_flags = DBGPRN_RD_RQ;
@@ -1190,6 +1191,7 @@ static int cyasblkdev_add_disks(int bus_num,
 		bd->user_disk_1->first_minor = (devidx + 1) << CYASBLKDEV_SHIFT;
 		bd->user_disk_1->minors = 8;
 		bd->user_disk_1->fops = &cyasblkdev_bdops;
+		bd->user_disk_0->events = DISK_EVENT_MEDIA_CHANGE;
 		bd->user_disk_1->private_data = bd;
 		bd->user_disk_1->queue = bd->queue.queue;
 		bd->dbgprn_flags = DBGPRN_RD_RQ;
@@ -1278,6 +1280,7 @@ static int cyasblkdev_add_disks(int bus_num,
 			(devidx + 2) << CYASBLKDEV_SHIFT;
 		bd->system_disk->minors = 8;
 		bd->system_disk->fops = &cyasblkdev_bdops;
+		bd->system_disk->events = DISK_EVENT_MEDIA_CHANGE;
 		bd->system_disk->private_data = bd;
 		bd->system_disk->queue = bd->queue.queue;
 		/* don't search for vfat
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index 3df570db0e4f..eb0afec046e1 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -391,9 +391,8 @@ static int iblock_do_task(struct se_task *task)
 {
 	struct se_device *dev = task->task_se_cmd->se_dev;
 	struct iblock_req *req = IBLOCK_REQ(task);
-	struct iblock_dev *ibd = (struct iblock_dev *)req->ib_dev;
-	struct request_queue *q = bdev_get_queue(ibd->ibd_bd);
 	struct bio *bio = req->ib_bio, *nbio = NULL;
+	struct blk_plug plug;
 	int rw;
 
 	if (task->task_data_direction == DMA_TO_DEVICE) {
@@ -411,6 +410,7 @@ static int iblock_do_task(struct se_task *task)
 		rw = READ;
 	}
 
+	blk_start_plug(&plug);
 	while (bio) {
 		nbio = bio->bi_next;
 		bio->bi_next = NULL;
@@ -420,9 +420,8 @@ static int iblock_do_task(struct se_task *task)
 		submit_bio(rw, bio);
 		bio = nbio;
 	}
+	blk_finish_plug(&plug);
 
-	if (q->unplug_fn)
-		q->unplug_fn(q);
 	return PYX_TRANSPORT_SENT_TO_TRANSPORT;
 }
 
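
iblock_do_task() is the template for the replacement API: batch submissions inside an on-stack plug instead of poking the queue's unplug_fn afterwards. The pattern, lifted from the hunks above:

	struct blk_plug plug;

	blk_start_plug(&plug);		/* start batching on this task */
	while (bio) {
		nbio = bio->bi_next;
		bio->bi_next = NULL;
		submit_bio(rw, bio);	/* queued on the plug, not yet dispatched */
		bio = nbio;
	}
	blk_finish_plug(&plug);		/* flush everything queued above */

The old code needed the request_queue only to reach unplug_fn; with the plug living on the task, both of those locals disappear.
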
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 92444e94f842..d5250c5aae21 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -72,7 +72,6 @@ static sector_t _adfs_bmap(struct address_space *mapping, sector_t block)
 static const struct address_space_operations adfs_aops = {
 	.readpage	= adfs_readpage,
 	.writepage	= adfs_writepage,
-	.sync_page	= block_sync_page,
 	.write_begin	= adfs_write_begin,
 	.write_end	= generic_write_end,
 	.bmap		= _adfs_bmap
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 0a90dcd46de2..acf321b70fcd 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -429,7 +429,6 @@ static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
 const struct address_space_operations affs_aops = {
 	.readpage	= affs_readpage,
 	.writepage	= affs_writepage,
-	.sync_page	= block_sync_page,
 	.write_begin	= affs_write_begin,
 	.write_end	= generic_write_end,
 	.bmap		= _affs_bmap
@@ -786,7 +785,6 @@ out:
 const struct address_space_operations affs_aops_ofs = {
 	.readpage	= affs_readpage_ofs,
 	//.writepage	= affs_writepage_ofs,
-	//.sync_page	= affs_sync_page_ofs,
 	.write_begin	= affs_write_begin_ofs,
 	.write_end	= affs_write_end_ofs
 };
diff --git a/fs/aio.c b/fs/aio.c
index ebb6a22e4e1b..e29ec485af25 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -34,8 +34,6 @@
 #include <linux/security.h>
 #include <linux/eventfd.h>
 #include <linux/blkdev.h>
-#include <linux/mempool.h>
-#include <linux/hash.h>
 #include <linux/compat.h>
 
 #include <asm/kmap_types.h>
@@ -65,14 +63,6 @@ static DECLARE_WORK(fput_work, aio_fput_routine);
 static DEFINE_SPINLOCK(fput_lock);
 static LIST_HEAD(fput_head);
 
-#define AIO_BATCH_HASH_BITS	3 /* allocated on-stack, so don't go crazy */
-#define AIO_BATCH_HASH_SIZE	(1 << AIO_BATCH_HASH_BITS)
-struct aio_batch_entry {
-	struct hlist_node list;
-	struct address_space *mapping;
-};
-mempool_t *abe_pool;
-
 static void aio_kick_handler(struct work_struct *);
 static void aio_queue_work(struct kioctx *);
 
@@ -86,8 +76,7 @@ static int __init aio_setup(void)
 	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
 	aio_wq = alloc_workqueue("aio", 0, 1);	/* used to limit concurrency */
-	abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
-	BUG_ON(!aio_wq || !abe_pool);
+	BUG_ON(!aio_wq);
 
 	pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
 
@@ -1525,57 +1514,8 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
 	return 0;
 }
 
-static void aio_batch_add(struct address_space *mapping,
-			  struct hlist_head *batch_hash)
-{
-	struct aio_batch_entry *abe;
-	struct hlist_node *pos;
-	unsigned bucket;
-
-	bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS);
-	hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) {
-		if (abe->mapping == mapping)
-			return;
-	}
-
-	abe = mempool_alloc(abe_pool, GFP_KERNEL);
-
-	/*
-	 * we should be using igrab here, but
-	 * we don't want to hammer on the global
-	 * inode spinlock just to take an extra
-	 * reference on a file that we must already
-	 * have a reference to.
-	 *
-	 * When we're called, we always have a reference
-	 * on the file, so we must always have a reference
-	 * on the inode, so ihold() is safe here.
-	 */
-	ihold(mapping->host);
-	abe->mapping = mapping;
-	hlist_add_head(&abe->list, &batch_hash[bucket]);
-	return;
-}
-
-static void aio_batch_free(struct hlist_head *batch_hash)
-{
-	struct aio_batch_entry *abe;
-	struct hlist_node *pos, *n;
-	int i;
-
-	for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) {
-		hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) {
-			blk_run_address_space(abe->mapping);
-			iput(abe->mapping->host);
-			hlist_del(&abe->list);
-			mempool_free(abe, abe_pool);
-		}
-	}
-}
-
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-			 struct iocb *iocb, struct hlist_head *batch_hash,
-			 bool compat)
+			 struct iocb *iocb, bool compat)
 {
 	struct kiocb *req;
 	struct file *file;
@@ -1666,11 +1606,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		;
 	}
 	spin_unlock_irq(&ctx->ctx_lock);
-	if (req->ki_opcode == IOCB_CMD_PREAD ||
-	    req->ki_opcode == IOCB_CMD_PREADV ||
-	    req->ki_opcode == IOCB_CMD_PWRITE ||
-	    req->ki_opcode == IOCB_CMD_PWRITEV)
-		aio_batch_add(file->f_mapping, batch_hash);
 
 	aio_put_req(req);	/* drop extra ref to req */
 	return 0;
@@ -1687,7 +1622,7 @@ long do_io_submit(aio_context_t ctx_id, long nr,
 	struct kioctx *ctx;
 	long ret = 0;
 	int i;
-	struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, };
+	struct blk_plug plug;
 
 	if (unlikely(nr < 0))
 		return -EINVAL;
@@ -1704,6 +1639,8 @@ long do_io_submit(aio_context_t ctx_id, long nr,
 		return -EINVAL;
 	}
 
+	blk_start_plug(&plug);
+
 	/*
 	 * AKPM: should this return a partial result if some of the IOs were
 	 * successfully submitted?
@@ -1722,11 +1659,11 @@ long do_io_submit(aio_context_t ctx_id, long nr,
 			break;
 		}
 
-		ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat);
+		ret = io_submit_one(ctx, user_iocb, &tmp, compat);
 		if (ret)
 			break;
 	}
-	aio_batch_free(batch_hash);
+	blk_finish_plug(&plug);
 
 	put_ioctx(ctx);
 	return i ? i : ret;
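
The aio conversion deletes the whole per-mapping batch hash (aio_batch_add()/aio_batch_free() and their mempool) because an on-stack plug gives the same submission batching for free. Condensed, do_io_submit() now reads:

	struct blk_plug plug;

	blk_start_plug(&plug);
	for (i = 0; i < nr; i++) {
		/* copy in the user iocb, then: */
		ret = io_submit_one(ctx, user_iocb, &tmp, compat);
		if (ret)
			break;
	}
	blk_finish_plug(&plug);	/* one flush replaces aio_batch_free() */
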
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index b1d0c794747b..06457ed8f3e7 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -75,7 +75,6 @@ static const struct inode_operations befs_dir_inode_operations = {
 
 static const struct address_space_operations befs_aops = {
 	.readpage	= befs_readpage,
-	.sync_page	= block_sync_page,
 	.bmap		= befs_bmap,
 };
 
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index eb67edd0f8ea..f20e8a71062f 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -186,7 +186,6 @@ static sector_t bfs_bmap(struct address_space *mapping, sector_t block)
 const struct address_space_operations bfs_aops = {
 	.readpage	= bfs_readpage,
 	.writepage	= bfs_writepage,
-	.sync_page	= block_sync_page,
 	.write_begin	= bfs_write_begin,
 	.write_end	= generic_write_end,
 	.bmap		= bfs_bmap,
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index e49cce234c65..9c5e6b2cd11a 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -761,6 +761,9 @@ int bioset_integrity_create(struct bio_set *bs, int pool_size)
 {
 	unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES);
 
+	if (bs->bio_integrity_pool)
+		return 0;
+
 	bs->bio_integrity_pool =
 		mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab);
 
diff --git a/fs/bio.c b/fs/bio.c
index 4cf2a52fbc54..4d6d4b6c2bf1 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -43,7 +43,7 @@ static mempool_t *bio_split_pool __read_mostly;
  * unsigned short
  */
 #define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
-struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
+static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
 	BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
 };
 #undef BV
@@ -1636,9 +1636,6 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
 	if (!bs->bio_pool)
 		goto bad;
 
-	if (bioset_integrity_create(bs, pool_size))
-		goto bad;
-
 	if (!biovec_create_pools(bs, pool_size))
 		return bs;
 
@@ -1656,12 +1653,10 @@ static void __init biovec_init_slabs(void)
 		int size;
 		struct biovec_slab *bvs = bvec_slabs + i;
 
-#ifndef CONFIG_BLK_DEV_INTEGRITY
 		if (bvs->nr_vecs <= BIO_INLINE_VECS) {
 			bvs->slab = NULL;
 			continue;
 		}
-#endif
 
 		size = bvs->nr_vecs * sizeof(struct bio_vec);
 		bvs->slab = kmem_cache_create(bvs->name, size, 0,
@@ -1684,6 +1679,9 @@ static int __init init_bio(void)
 	if (!fs_bio_set)
 		panic("bio: can't allocate bios\n");
 
+	if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE))
+		panic("bio: can't create integrity pool\n");
+
 	bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES,
 						     sizeof(struct bio_pair));
 	if (!bio_split_pool)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 889287019599..7d02afb2b7f4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1087,6 +1087,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	if (!disk)
 		goto out;
 
+	disk_block_events(disk);
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
 	if (!bdev->bd_openers) {
 		bdev->bd_disk = disk;
@@ -1108,10 +1109,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				 */
 				disk_put_part(bdev->bd_part);
 				bdev->bd_part = NULL;
-				module_put(disk->fops->owner);
-				put_disk(disk);
 				bdev->bd_disk = NULL;
 				mutex_unlock(&bdev->bd_mutex);
+				disk_unblock_events(disk);
+				module_put(disk->fops->owner);
+				put_disk(disk);
 				goto restart;
 			}
 			if (ret)
@@ -1148,9 +1150,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 			bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
 		}
 	} else {
-		module_put(disk->fops->owner);
-		put_disk(disk);
-		disk = NULL;
 		if (bdev->bd_contains == bdev) {
 			if (bdev->bd_disk->fops->open) {
 				ret = bdev->bd_disk->fops->open(bdev, mode);
@@ -1160,11 +1159,15 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 			if (bdev->bd_invalidated)
 				rescan_partitions(bdev->bd_disk, bdev);
 		}
+		/* only one opener holds refs to the module and disk */
+		module_put(disk->fops->owner);
+		put_disk(disk);
 	}
 	bdev->bd_openers++;
 	if (for_part)
 		bdev->bd_part_count++;
 	mutex_unlock(&bdev->bd_mutex);
+	disk_unblock_events(disk);
 	return 0;
 
  out_clear:
@@ -1177,10 +1180,10 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	bdev->bd_contains = NULL;
  out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
- out:
-	if (disk)
-		module_put(disk->fops->owner);
+	disk_unblock_events(disk);
+	module_put(disk->fops->owner);
 	put_disk(disk);
+ out:
 	bdput(bdev);
 
 	return ret;
@@ -1446,14 +1449,13 @@ int blkdev_put(struct block_device *bdev, fmode_t mode)
 		if (bdev_free) {
 			if (bdev->bd_write_holder) {
 				disk_unblock_events(bdev->bd_disk);
-				bdev->bd_write_holder = false;
-			} else
 				disk_check_events(bdev->bd_disk);
+				bdev->bd_write_holder = false;
+			}
 		}
 
 		mutex_unlock(&bdev->bd_mutex);
-	} else
-		disk_check_events(bdev->bd_disk);
+	}
 
 	return __blkdev_put(bdev, mode, 0);
 }
@@ -1527,7 +1529,6 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
 static const struct address_space_operations def_blk_aops = {
 	.readpage	= blkdev_readpage,
 	.writepage	= blkdev_writepage,
-	.sync_page	= block_sync_page,
 	.write_begin	= blkdev_write_begin,
 	.write_end	= blkdev_write_end,
 	.writepages	= generic_writepages,
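
__blkdev_get() now brackets the whole open with disk_block_events()/disk_unblock_events(), so a media-change poll cannot race the partition rescan, and the module/disk references are dropped only after events are unblocked. Shape of the new path, condensed from the hunks above:

	disk_block_events(disk);	/* hold off the event workqueue */
	mutex_lock_nested(&bdev->bd_mutex, for_part);
	/* ... bind bd_disk, call fops->open(), rescan_partitions() ... */
	mutex_unlock(&bdev->bd_mutex);
	disk_unblock_events(disk);	/* a pending event may fire now */
	/* non-first openers then drop their extra refs: */
	module_put(disk->fops->owner);
	put_disk(disk);
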
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 100b07f021b4..830d261d0e6b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -847,7 +847,6 @@ static const struct address_space_operations btree_aops = {
 	.writepages	= btree_writepages,
 	.releasepage	= btree_releasepage,
 	.invalidatepage = btree_invalidatepage,
-	.sync_page	= block_sync_page,
 #ifdef CONFIG_MIGRATION
 	.migratepage	= btree_migratepage,
 #endif
@@ -1331,82 +1330,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 }
 
 /*
- * this unplugs every device on the box, and it is only used when page
- * is null
- */
-static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
-	struct btrfs_device *device;
-	struct btrfs_fs_info *info;
-
-	info = (struct btrfs_fs_info *)bdi->unplug_io_data;
-	list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
-		if (!device->bdev)
-			continue;
-
-		bdi = blk_get_backing_dev_info(device->bdev);
-		if (bdi->unplug_io_fn)
-			bdi->unplug_io_fn(bdi, page);
-	}
-}
-
-static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
-	struct inode *inode;
-	struct extent_map_tree *em_tree;
-	struct extent_map *em;
-	struct address_space *mapping;
-	u64 offset;
-
-	/* the generic O_DIRECT read code does this */
-	if (1 || !page) {
-		__unplug_io_fn(bdi, page);
-		return;
-	}
-
-	/*
-	 * page->mapping may change at any time. Get a consistent copy
-	 * and use that for everything below
-	 */
-	smp_mb();
-	mapping = page->mapping;
-	if (!mapping)
-		return;
-
-	inode = mapping->host;
-
-	/*
-	 * don't do the expensive searching for a small number of
-	 * devices
-	 */
-	if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
-		__unplug_io_fn(bdi, page);
-		return;
-	}
-
-	offset = page_offset(page);
-
-	em_tree = &BTRFS_I(inode)->extent_tree;
-	read_lock(&em_tree->lock);
-	em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
-	read_unlock(&em_tree->lock);
-	if (!em) {
-		__unplug_io_fn(bdi, page);
-		return;
-	}
-
-	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-		free_extent_map(em);
-		__unplug_io_fn(bdi, page);
-		return;
-	}
-	offset = offset - em->start;
-	btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
-			  em->block_start + offset, page);
-	free_extent_map(em);
-}
-
-/*
  * If this fails, caller must call bdi_destroy() to get rid of the
  * bdi again.
  */
@@ -1420,8 +1343,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1420 return err; 1343 return err;
1421 1344
1422 bdi->ra_pages = default_backing_dev_info.ra_pages; 1345 bdi->ra_pages = default_backing_dev_info.ra_pages;
1423 bdi->unplug_io_fn = btrfs_unplug_io_fn;
1424 bdi->unplug_io_data = info;
1425 bdi->congested_fn = btrfs_congested_fn; 1346 bdi->congested_fn = btrfs_congested_fn;
1426 bdi->congested_data = info; 1347 bdi->congested_data = info;
1427 return 0; 1348 return 0;
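The two btrfs unplug callbacks deleted above existed only to service bdi->unplug_io_fn, which has no place in the explicit-plugging model: instead of a reader kicking every device through a per-bdi callback, the submitter batches bios on its own stack and flushes them when done. A minimal sketch of that pattern, assuming only the 2.6.39 block API (the helper name is hypothetical, not btrfs code):

    #include <linux/bio.h>
    #include <linux/blkdev.h>

    /* Batch a set of already-built bios behind one on-stack plug. */
    static void submit_bio_batch(struct bio **bios, int nr)
    {
            struct blk_plug plug;
            int i;

            blk_start_plug(&plug);
            for (i = 0; i < nr; i++)
                    submit_bio(bios[i]->bi_rw, bios[i]);
            blk_finish_plug(&plug); /* hand the whole batch to the elevator */
    }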
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 714adc4ac4c2..b5b92824a271 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2188,7 +2188,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2188 unsigned long nr_written = 0; 2188 unsigned long nr_written = 0;
2189 2189
2190 if (wbc->sync_mode == WB_SYNC_ALL) 2190 if (wbc->sync_mode == WB_SYNC_ALL)
2191 write_flags = WRITE_SYNC_PLUG; 2191 write_flags = WRITE_SYNC;
2192 else 2192 else
2193 write_flags = WRITE; 2193 write_flags = WRITE;
2194 2194
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 512c3d1da083..119520bdb9a5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7340,7 +7340,6 @@ static const struct address_space_operations btrfs_aops = {
7340 .writepage = btrfs_writepage, 7340 .writepage = btrfs_writepage,
7341 .writepages = btrfs_writepages, 7341 .writepages = btrfs_writepages,
7342 .readpages = btrfs_readpages, 7342 .readpages = btrfs_readpages,
7343 .sync_page = block_sync_page,
7344 .direct_IO = btrfs_direct_IO, 7343 .direct_IO = btrfs_direct_IO,
7345 .invalidatepage = btrfs_invalidatepage, 7344 .invalidatepage = btrfs_invalidatepage,
7346 .releasepage = btrfs_releasepage, 7345 .releasepage = btrfs_releasepage,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dd13eb81ee40..9d554e8e6583 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -162,7 +162,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
162 struct bio *cur; 162 struct bio *cur;
163 int again = 0; 163 int again = 0;
164 unsigned long num_run; 164 unsigned long num_run;
165 unsigned long num_sync_run;
166 unsigned long batch_run = 0; 165 unsigned long batch_run = 0;
167 unsigned long limit; 166 unsigned long limit;
168 unsigned long last_waited = 0; 167 unsigned long last_waited = 0;
@@ -173,11 +172,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
173 limit = btrfs_async_submit_limit(fs_info); 172 limit = btrfs_async_submit_limit(fs_info);
174 limit = limit * 2 / 3; 173 limit = limit * 2 / 3;
175 174
176 /* we want to make sure that every time we switch from the sync
177 * list to the normal list, we unplug
178 */
179 num_sync_run = 0;
180
181loop: 175loop:
182 spin_lock(&device->io_lock); 176 spin_lock(&device->io_lock);
183 177
@@ -223,15 +217,6 @@ loop_lock:
223 217
224 spin_unlock(&device->io_lock); 218 spin_unlock(&device->io_lock);
225 219
226 /*
227 * if we're doing the regular priority list, make sure we unplug
228 * for any high prio bios we've sent down
229 */
230 if (pending_bios == &device->pending_bios && num_sync_run > 0) {
231 num_sync_run = 0;
232 blk_run_backing_dev(bdi, NULL);
233 }
234
235 while (pending) { 220 while (pending) {
236 221
237 rmb(); 222 rmb();
@@ -259,19 +244,11 @@ loop_lock:
259 244
260 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 245 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
261 246
262 if (cur->bi_rw & REQ_SYNC)
263 num_sync_run++;
264
265 submit_bio(cur->bi_rw, cur); 247 submit_bio(cur->bi_rw, cur);
266 num_run++; 248 num_run++;
267 batch_run++; 249 batch_run++;
268 if (need_resched()) { 250 if (need_resched())
269 if (num_sync_run) {
270 blk_run_backing_dev(bdi, NULL);
271 num_sync_run = 0;
272 }
273 cond_resched(); 251 cond_resched();
274 }
275 252
276 /* 253 /*
277 * we made progress, there is more work to do and the bdi 254 * we made progress, there is more work to do and the bdi
@@ -304,13 +281,8 @@ loop_lock:
304 * against it before looping 281 * against it before looping
305 */ 282 */
306 last_waited = ioc->last_waited; 283 last_waited = ioc->last_waited;
307 if (need_resched()) { 284 if (need_resched())
308 if (num_sync_run) {
309 blk_run_backing_dev(bdi, NULL);
310 num_sync_run = 0;
311 }
312 cond_resched(); 285 cond_resched();
313 }
314 continue; 286 continue;
315 } 287 }
316 spin_lock(&device->io_lock); 288 spin_lock(&device->io_lock);
@@ -323,22 +295,6 @@ loop_lock:
323 } 295 }
324 } 296 }
325 297
326 if (num_sync_run) {
327 num_sync_run = 0;
328 blk_run_backing_dev(bdi, NULL);
329 }
330 /*
331 * IO has already been through a long path to get here. Checksumming,
332 * async helper threads, perhaps compression. We've done a pretty
333 * good job of collecting a batch of IO and should just unplug
334 * the device right away.
335 *
336 * This will help anyone who is waiting on the IO, they might have
337 * already unplugged, but managed to do so before the bio they
338 * cared about found its way down here.
339 */
340 blk_run_backing_dev(bdi, NULL);
341
342 cond_resched(); 298 cond_resched();
343 if (again) 299 if (again)
344 goto loop; 300 goto loop;
@@ -2955,7 +2911,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
2955static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2911static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2956 u64 logical, u64 *length, 2912 u64 logical, u64 *length,
2957 struct btrfs_multi_bio **multi_ret, 2913 struct btrfs_multi_bio **multi_ret,
2958 int mirror_num, struct page *unplug_page) 2914 int mirror_num)
2959{ 2915{
2960 struct extent_map *em; 2916 struct extent_map *em;
2961 struct map_lookup *map; 2917 struct map_lookup *map;
@@ -2987,11 +2943,6 @@ again:
2987 em = lookup_extent_mapping(em_tree, logical, *length); 2943 em = lookup_extent_mapping(em_tree, logical, *length);
2988 read_unlock(&em_tree->lock); 2944 read_unlock(&em_tree->lock);
2989 2945
2990 if (!em && unplug_page) {
2991 kfree(multi);
2992 return 0;
2993 }
2994
2995 if (!em) { 2946 if (!em) {
2996 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 2947 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
2997 (unsigned long long)logical, 2948 (unsigned long long)logical,
@@ -3047,13 +2998,13 @@ again:
3047 *length = em->len - offset; 2998 *length = em->len - offset;
3048 } 2999 }
3049 3000
3050 if (!multi_ret && !unplug_page) 3001 if (!multi_ret)
3051 goto out; 3002 goto out;
3052 3003
3053 num_stripes = 1; 3004 num_stripes = 1;
3054 stripe_index = 0; 3005 stripe_index = 0;
3055 if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 3006 if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3056 if (unplug_page || (rw & REQ_WRITE)) 3007 if (rw & REQ_WRITE)
3057 num_stripes = map->num_stripes; 3008 num_stripes = map->num_stripes;
3058 else if (mirror_num) 3009 else if (mirror_num)
3059 stripe_index = mirror_num - 1; 3010 stripe_index = mirror_num - 1;
@@ -3075,7 +3026,7 @@ again:
3075 stripe_index = do_div(stripe_nr, factor); 3026 stripe_index = do_div(stripe_nr, factor);
3076 stripe_index *= map->sub_stripes; 3027 stripe_index *= map->sub_stripes;
3077 3028
3078 if (unplug_page || (rw & REQ_WRITE)) 3029 if (rw & REQ_WRITE)
3079 num_stripes = map->sub_stripes; 3030 num_stripes = map->sub_stripes;
3080 else if (mirror_num) 3031 else if (mirror_num)
3081 stripe_index += mirror_num - 1; 3032 stripe_index += mirror_num - 1;
@@ -3095,22 +3046,10 @@ again:
3095 BUG_ON(stripe_index >= map->num_stripes); 3046 BUG_ON(stripe_index >= map->num_stripes);
3096 3047
3097 for (i = 0; i < num_stripes; i++) { 3048 for (i = 0; i < num_stripes; i++) {
3098 if (unplug_page) { 3049 multi->stripes[i].physical =
3099 struct btrfs_device *device; 3050 map->stripes[stripe_index].physical +
3100 struct backing_dev_info *bdi; 3051 stripe_offset + stripe_nr * map->stripe_len;
3101 3052 multi->stripes[i].dev = map->stripes[stripe_index].dev;
3102 device = map->stripes[stripe_index].dev;
3103 if (device->bdev) {
3104 bdi = blk_get_backing_dev_info(device->bdev);
3105 if (bdi->unplug_io_fn)
3106 bdi->unplug_io_fn(bdi, unplug_page);
3107 }
3108 } else {
3109 multi->stripes[i].physical =
3110 map->stripes[stripe_index].physical +
3111 stripe_offset + stripe_nr * map->stripe_len;
3112 multi->stripes[i].dev = map->stripes[stripe_index].dev;
3113 }
3114 stripe_index++; 3053 stripe_index++;
3115 } 3054 }
3116 if (multi_ret) { 3055 if (multi_ret) {
@@ -3128,7 +3067,7 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3128 struct btrfs_multi_bio **multi_ret, int mirror_num) 3067 struct btrfs_multi_bio **multi_ret, int mirror_num)
3129{ 3068{
3130 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 3069 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
3131 mirror_num, NULL); 3070 mirror_num);
3132} 3071}
3133 3072
3134int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 3073int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -3196,14 +3135,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3196 return 0; 3135 return 0;
3197} 3136}
3198 3137
3199int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
3200 u64 logical, struct page *page)
3201{
3202 u64 length = PAGE_CACHE_SIZE;
3203 return __btrfs_map_block(map_tree, READ, logical, &length,
3204 NULL, 0, page);
3205}
3206
3207static void end_bio_multi_stripe(struct bio *bio, int err) 3138static void end_bio_multi_stripe(struct bio *bio, int err)
3208{ 3139{
3209 struct btrfs_multi_bio *multi = bio->bi_private; 3140 struct btrfs_multi_bio *multi = bio->bi_private;
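With explicit plugging, run_scheduled_bios() no longer has to count REQ_SYNC bios and kick the backing device by hand before yielding: a task that later blocks waiting on the I/O flushes its own plug, so submitters need not unplug on behalf of waiters, and the bookkeeping collapses to a bare cond_resched(). The companion change drops the unplug_page plumbing from __btrfs_map_block(), whose only consumer was the now-deleted btrfs_unplug_page(). The before/after shape, condensed from the hunks above:

    /* old: drain any pending sync bios before we might sleep */
    if (need_resched()) {
            if (num_sync_run) {
                    blk_run_backing_dev(bdi, NULL);
                    num_sync_run = 0;
            }
            cond_resched();
    }

    /* new: waiters flush their own plugs; no manual kick needed */
    if (need_resched())
            cond_resched();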
diff --git a/fs/buffer.c b/fs/buffer.c
index 2219a76e2caf..2e6b1a387b7e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -54,23 +54,15 @@ init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
54} 54}
55EXPORT_SYMBOL(init_buffer); 55EXPORT_SYMBOL(init_buffer);
56 56
57static int sync_buffer(void *word) 57static int sleep_on_buffer(void *word)
58{ 58{
59 struct block_device *bd;
60 struct buffer_head *bh
61 = container_of(word, struct buffer_head, b_state);
62
63 smp_mb();
64 bd = bh->b_bdev;
65 if (bd)
66 blk_run_address_space(bd->bd_inode->i_mapping);
67 io_schedule(); 59 io_schedule();
68 return 0; 60 return 0;
69} 61}
70 62
71void __lock_buffer(struct buffer_head *bh) 63void __lock_buffer(struct buffer_head *bh)
72{ 64{
73 wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer, 65 wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer,
74 TASK_UNINTERRUPTIBLE); 66 TASK_UNINTERRUPTIBLE);
75} 67}
76EXPORT_SYMBOL(__lock_buffer); 68EXPORT_SYMBOL(__lock_buffer);
@@ -90,7 +82,7 @@ EXPORT_SYMBOL(unlock_buffer);
90 */ 82 */
91void __wait_on_buffer(struct buffer_head * bh) 83void __wait_on_buffer(struct buffer_head * bh)
92{ 84{
93 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE); 85 wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE);
94} 86}
95EXPORT_SYMBOL(__wait_on_buffer); 87EXPORT_SYMBOL(__wait_on_buffer);
96 88
@@ -749,10 +741,12 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
749{ 741{
750 struct buffer_head *bh; 742 struct buffer_head *bh;
751 struct list_head tmp; 743 struct list_head tmp;
752 struct address_space *mapping, *prev_mapping = NULL; 744 struct address_space *mapping;
753 int err = 0, err2; 745 int err = 0, err2;
746 struct blk_plug plug;
754 747
755 INIT_LIST_HEAD(&tmp); 748 INIT_LIST_HEAD(&tmp);
749 blk_start_plug(&plug);
756 750
757 spin_lock(lock); 751 spin_lock(lock);
758 while (!list_empty(list)) { 752 while (!list_empty(list)) {
@@ -775,7 +769,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
775 * still in flight on potentially older 769 * still in flight on potentially older
776 * contents. 770 * contents.
777 */ 771 */
778 write_dirty_buffer(bh, WRITE_SYNC_PLUG); 772 write_dirty_buffer(bh, WRITE_SYNC);
779 773
780 /* 774 /*
781 * Kick off IO for the previous mapping. Note 775 * Kick off IO for the previous mapping. Note
@@ -783,16 +777,16 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
783 * wait_on_buffer() will do that for us 777 * wait_on_buffer() will do that for us
784 * through sync_buffer(). 778 * through sync_buffer().
785 */ 779 */
786 if (prev_mapping && prev_mapping != mapping)
787 blk_run_address_space(prev_mapping);
788 prev_mapping = mapping;
789
790 brelse(bh); 780 brelse(bh);
791 spin_lock(lock); 781 spin_lock(lock);
792 } 782 }
793 } 783 }
794 } 784 }
795 785
786 spin_unlock(lock);
787 blk_finish_plug(&plug);
788 spin_lock(lock);
789
796 while (!list_empty(&tmp)) { 790 while (!list_empty(&tmp)) {
797 bh = BH_ENTRY(tmp.prev); 791 bh = BH_ENTRY(tmp.prev);
798 get_bh(bh); 792 get_bh(bh);
@@ -1614,14 +1608,8 @@ EXPORT_SYMBOL(unmap_underlying_metadata);
1614 * prevents this contention from occurring. 1608 * prevents this contention from occurring.
1615 * 1609 *
1616 * If block_write_full_page() is called with wbc->sync_mode == 1610 * If block_write_full_page() is called with wbc->sync_mode ==
1617 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this 1611 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
1618 * causes the writes to be flagged as synchronous writes, but the 1612 * causes the writes to be flagged as synchronous writes.
1619 * block device queue will NOT be unplugged, since usually many pages
1620 * will be pushed to the out before the higher-level caller actually
1621 * waits for the writes to be completed. The various wait functions,
1622 * such as wait_on_writeback_range() will ultimately call sync_page()
1623 * which will ultimately call blk_run_backing_dev(), which will end up
1624 * unplugging the device queue.
1625 */ 1613 */
1626static int __block_write_full_page(struct inode *inode, struct page *page, 1614static int __block_write_full_page(struct inode *inode, struct page *page,
1627 get_block_t *get_block, struct writeback_control *wbc, 1615 get_block_t *get_block, struct writeback_control *wbc,
@@ -1634,7 +1622,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1634 const unsigned blocksize = 1 << inode->i_blkbits; 1622 const unsigned blocksize = 1 << inode->i_blkbits;
1635 int nr_underway = 0; 1623 int nr_underway = 0;
1636 int write_op = (wbc->sync_mode == WB_SYNC_ALL ? 1624 int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
1637 WRITE_SYNC_PLUG : WRITE); 1625 WRITE_SYNC : WRITE);
1638 1626
1639 BUG_ON(!PageLocked(page)); 1627 BUG_ON(!PageLocked(page));
1640 1628
@@ -3138,17 +3126,6 @@ out:
3138} 3126}
3139EXPORT_SYMBOL(try_to_free_buffers); 3127EXPORT_SYMBOL(try_to_free_buffers);
3140 3128
3141void block_sync_page(struct page *page)
3142{
3143 struct address_space *mapping;
3144
3145 smp_mb();
3146 mapping = page_mapping(page);
3147 if (mapping)
3148 blk_run_backing_dev(mapping->backing_dev_info, page);
3149}
3150EXPORT_SYMBOL(block_sync_page);
3151
3152/* 3129/*
3153 * There are no bdflush tunables left. But distributions are 3130 * There are no bdflush tunables left. But distributions are
3154 * still running obsolete flush daemons, so we terminate them here. 3131 * still running obsolete flush daemons, so we terminate them here.
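Two things worth noting in the fs/buffer.c hunks: sleep_on_buffer() is sync_buffer() minus the queue kick, since a task that sleeps in io_schedule() now flushes its own plug first; and fsync_buffers_list() drops the spinlock around blk_finish_plug() because the flush actually submits the batched writes and must not run under that lock. The resulting shape of the function, a condensed sketch (the helper name is hypothetical):

    #include <linux/blkdev.h>
    #include <linux/buffer_head.h>
    #include <linux/spinlock.h>

    static void fsync_list_shape(spinlock_t *lock)
    {
            struct blk_plug plug;

            blk_start_plug(&plug);
            spin_lock(lock);
            /* ... move buffers to tmp, write_dirty_buffer(bh, WRITE_SYNC) ... */
            spin_unlock(lock);
            blk_finish_plug(&plug); /* submit the batch outside the lock */
            spin_lock(lock);
            /* ... wait_on_buffer() for everything left on tmp ... */
            spin_unlock(lock);
    }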
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e964b1cd5dd0..c27d236738fc 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1569,34 +1569,6 @@ int cifs_fsync(struct file *file, int datasync)
1569 return rc; 1569 return rc;
1570} 1570}
1571 1571
1572/* static void cifs_sync_page(struct page *page)
1573{
1574 struct address_space *mapping;
1575 struct inode *inode;
1576 unsigned long index = page->index;
1577 unsigned int rpages = 0;
1578 int rc = 0;
1579
1580 cFYI(1, "sync page %p", page);
1581 mapping = page->mapping;
1582 if (!mapping)
1583 return 0;
1584 inode = mapping->host;
1585 if (!inode)
1586 return; */
1587
1588/* fill in rpages then
1589 result = cifs_pagein_inode(inode, index, rpages); */ /* BB finish */
1590
1591/* cFYI(1, "rpages is %d for sync page of Index %ld", rpages, index);
1592
1593#if 0
1594 if (rc < 0)
1595 return rc;
1596 return 0;
1597#endif
1598} */
1599
1600/* 1572/*
1601 * As file closes, flush all cached write data for this inode checking 1573 * As file closes, flush all cached write data for this inode checking
1602 * for write behind errors. 1574 * for write behind errors.
@@ -2510,7 +2482,6 @@ const struct address_space_operations cifs_addr_ops = {
2510 .set_page_dirty = __set_page_dirty_nobuffers, 2482 .set_page_dirty = __set_page_dirty_nobuffers,
2511 .releasepage = cifs_release_page, 2483 .releasepage = cifs_release_page,
2512 .invalidatepage = cifs_invalidate_page, 2484 .invalidatepage = cifs_invalidate_page,
2513 /* .sync_page = cifs_sync_page, */
2514 /* .direct_IO = */ 2485 /* .direct_IO = */
2515}; 2486};
2516 2487
@@ -2528,6 +2499,5 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
2528 .set_page_dirty = __set_page_dirty_nobuffers, 2499 .set_page_dirty = __set_page_dirty_nobuffers,
2529 .releasepage = cifs_release_page, 2500 .releasepage = cifs_release_page,
2530 .invalidatepage = cifs_invalidate_page, 2501 .invalidatepage = cifs_invalidate_page,
2531 /* .sync_page = cifs_sync_page, */
2532 /* .direct_IO = */ 2502 /* .direct_IO = */
2533}; 2503};
diff --git a/fs/direct-io.c b/fs/direct-io.c
index dcb5577cde1d..ac5f164170e3 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1110,11 +1110,8 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1110 ((rw & READ) || (dio->result == dio->size))) 1110 ((rw & READ) || (dio->result == dio->size)))
1111 ret = -EIOCBQUEUED; 1111 ret = -EIOCBQUEUED;
1112 1112
1113 if (ret != -EIOCBQUEUED) { 1113 if (ret != -EIOCBQUEUED)
1114 /* All IO is now issued, send it on its way */
1115 blk_run_address_space(inode->i_mapping);
1116 dio_await_completion(dio); 1114 dio_await_completion(dio);
1117 }
1118 1115
1119 /* 1116 /*
1120 * Sync will always be dropping the final ref and completing the 1117 * Sync will always be dropping the final ref and completing the
@@ -1176,7 +1173,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1176 struct dio *dio; 1173 struct dio *dio;
1177 1174
1178 if (rw & WRITE) 1175 if (rw & WRITE)
1179 rw = WRITE_ODIRECT_PLUG; 1176 rw = WRITE_ODIRECT;
1180 1177
1181 if (bdev) 1178 if (bdev)
1182 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); 1179 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
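The direct-I/O hunk is part of a broader flag cleanup visible throughout this merge: with no per-request unplug to encode, the *_PLUG write variants lose their reason to exist, and the explicit blk_run_address_space() before dio_await_completion() goes away because the wait itself flushes any pending plug. The renames as they appear in this diff:

    /*
     * WRITE_SYNC_PLUG       -> WRITE_SYNC    (buffer.c, ext4, gfs2, jbd, jbd2)
     * WRITE_ODIRECT_PLUG    -> WRITE_ODIRECT (direct-io.c)
     * REQ_SYNC | REQ_UNPLUG -> REQ_SYNC      (nilfs2 segbuf.c)
     */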
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index a8e7797b9477..9c13412e6c99 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -23,7 +23,6 @@ static sector_t _efs_bmap(struct address_space *mapping, sector_t block)
23} 23}
24static const struct address_space_operations efs_aops = { 24static const struct address_space_operations efs_aops = {
25 .readpage = efs_readpage, 25 .readpage = efs_readpage,
26 .sync_page = block_sync_page,
27 .bmap = _efs_bmap 26 .bmap = _efs_bmap
28}; 27};
29 28
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 0c713cfbebf0..8472c098445d 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -823,7 +823,6 @@ const struct address_space_operations exofs_aops = {
823 .direct_IO = NULL, /* TODO: Should be trivial to do */ 823 .direct_IO = NULL, /* TODO: Should be trivial to do */
824 824
825 /* With these NULL has special meaning or default is not exported */ 825 /* With these NULL has special meaning or default is not exported */
826 .sync_page = NULL,
827 .get_xip_mem = NULL, 826 .get_xip_mem = NULL,
828 .migratepage = NULL, 827 .migratepage = NULL,
829 .launder_page = NULL, 828 .launder_page = NULL,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 40ad210a5049..c47f706878b5 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -860,7 +860,6 @@ const struct address_space_operations ext2_aops = {
860 .readpage = ext2_readpage, 860 .readpage = ext2_readpage,
861 .readpages = ext2_readpages, 861 .readpages = ext2_readpages,
862 .writepage = ext2_writepage, 862 .writepage = ext2_writepage,
863 .sync_page = block_sync_page,
864 .write_begin = ext2_write_begin, 863 .write_begin = ext2_write_begin,
865 .write_end = ext2_write_end, 864 .write_end = ext2_write_end,
866 .bmap = ext2_bmap, 865 .bmap = ext2_bmap,
@@ -880,7 +879,6 @@ const struct address_space_operations ext2_nobh_aops = {
880 .readpage = ext2_readpage, 879 .readpage = ext2_readpage,
881 .readpages = ext2_readpages, 880 .readpages = ext2_readpages,
882 .writepage = ext2_nobh_writepage, 881 .writepage = ext2_nobh_writepage,
883 .sync_page = block_sync_page,
884 .write_begin = ext2_nobh_write_begin, 882 .write_begin = ext2_nobh_write_begin,
885 .write_end = nobh_write_end, 883 .write_end = nobh_write_end,
886 .bmap = ext2_bmap, 884 .bmap = ext2_bmap,
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index ae94f6d949f5..fe2541d250e4 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1894,7 +1894,6 @@ static const struct address_space_operations ext3_ordered_aops = {
1894 .readpage = ext3_readpage, 1894 .readpage = ext3_readpage,
1895 .readpages = ext3_readpages, 1895 .readpages = ext3_readpages,
1896 .writepage = ext3_ordered_writepage, 1896 .writepage = ext3_ordered_writepage,
1897 .sync_page = block_sync_page,
1898 .write_begin = ext3_write_begin, 1897 .write_begin = ext3_write_begin,
1899 .write_end = ext3_ordered_write_end, 1898 .write_end = ext3_ordered_write_end,
1900 .bmap = ext3_bmap, 1899 .bmap = ext3_bmap,
@@ -1910,7 +1909,6 @@ static const struct address_space_operations ext3_writeback_aops = {
1910 .readpage = ext3_readpage, 1909 .readpage = ext3_readpage,
1911 .readpages = ext3_readpages, 1910 .readpages = ext3_readpages,
1912 .writepage = ext3_writeback_writepage, 1911 .writepage = ext3_writeback_writepage,
1913 .sync_page = block_sync_page,
1914 .write_begin = ext3_write_begin, 1912 .write_begin = ext3_write_begin,
1915 .write_end = ext3_writeback_write_end, 1913 .write_end = ext3_writeback_write_end,
1916 .bmap = ext3_bmap, 1914 .bmap = ext3_bmap,
@@ -1926,7 +1924,6 @@ static const struct address_space_operations ext3_journalled_aops = {
1926 .readpage = ext3_readpage, 1924 .readpage = ext3_readpage,
1927 .readpages = ext3_readpages, 1925 .readpages = ext3_readpages,
1928 .writepage = ext3_journalled_writepage, 1926 .writepage = ext3_journalled_writepage,
1929 .sync_page = block_sync_page,
1930 .write_begin = ext3_write_begin, 1927 .write_begin = ext3_write_begin,
1931 .write_end = ext3_journalled_write_end, 1928 .write_end = ext3_journalled_write_end,
1932 .set_page_dirty = ext3_journalled_set_page_dirty, 1929 .set_page_dirty = ext3_journalled_set_page_dirty,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9f7f9e49914f..9297ad46c465 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3903,7 +3903,6 @@ static const struct address_space_operations ext4_ordered_aops = {
3903 .readpage = ext4_readpage, 3903 .readpage = ext4_readpage,
3904 .readpages = ext4_readpages, 3904 .readpages = ext4_readpages,
3905 .writepage = ext4_writepage, 3905 .writepage = ext4_writepage,
3906 .sync_page = block_sync_page,
3907 .write_begin = ext4_write_begin, 3906 .write_begin = ext4_write_begin,
3908 .write_end = ext4_ordered_write_end, 3907 .write_end = ext4_ordered_write_end,
3909 .bmap = ext4_bmap, 3908 .bmap = ext4_bmap,
@@ -3919,7 +3918,6 @@ static const struct address_space_operations ext4_writeback_aops = {
3919 .readpage = ext4_readpage, 3918 .readpage = ext4_readpage,
3920 .readpages = ext4_readpages, 3919 .readpages = ext4_readpages,
3921 .writepage = ext4_writepage, 3920 .writepage = ext4_writepage,
3922 .sync_page = block_sync_page,
3923 .write_begin = ext4_write_begin, 3921 .write_begin = ext4_write_begin,
3924 .write_end = ext4_writeback_write_end, 3922 .write_end = ext4_writeback_write_end,
3925 .bmap = ext4_bmap, 3923 .bmap = ext4_bmap,
@@ -3935,7 +3933,6 @@ static const struct address_space_operations ext4_journalled_aops = {
3935 .readpage = ext4_readpage, 3933 .readpage = ext4_readpage,
3936 .readpages = ext4_readpages, 3934 .readpages = ext4_readpages,
3937 .writepage = ext4_writepage, 3935 .writepage = ext4_writepage,
3938 .sync_page = block_sync_page,
3939 .write_begin = ext4_write_begin, 3936 .write_begin = ext4_write_begin,
3940 .write_end = ext4_journalled_write_end, 3937 .write_end = ext4_journalled_write_end,
3941 .set_page_dirty = ext4_journalled_set_page_dirty, 3938 .set_page_dirty = ext4_journalled_set_page_dirty,
@@ -3951,7 +3948,6 @@ static const struct address_space_operations ext4_da_aops = {
3951 .readpages = ext4_readpages, 3948 .readpages = ext4_readpages,
3952 .writepage = ext4_writepage, 3949 .writepage = ext4_writepage,
3953 .writepages = ext4_da_writepages, 3950 .writepages = ext4_da_writepages,
3954 .sync_page = block_sync_page,
3955 .write_begin = ext4_da_write_begin, 3951 .write_begin = ext4_da_write_begin,
3956 .write_end = ext4_da_write_end, 3952 .write_end = ext4_da_write_end,
3957 .bmap = ext4_bmap, 3953 .bmap = ext4_bmap,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 955cc309142f..e2cd90e4bb7c 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -310,8 +310,7 @@ static int io_submit_init(struct ext4_io_submit *io,
310 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); 310 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
311 311
312 io->io_bio = bio; 312 io->io_bio = bio;
313 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? 313 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
314 WRITE_SYNC_PLUG : WRITE);
315 io->io_next_block = bh->b_blocknr; 314 io->io_next_block = bh->b_blocknr;
316 return 0; 315 return 0;
317} 316}
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 0e277ec4b612..8d68690bdcf1 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -236,7 +236,6 @@ static const struct address_space_operations fat_aops = {
236 .readpages = fat_readpages, 236 .readpages = fat_readpages,
237 .writepage = fat_writepage, 237 .writepage = fat_writepage,
238 .writepages = fat_writepages, 238 .writepages = fat_writepages,
239 .sync_page = block_sync_page,
240 .write_begin = fat_write_begin, 239 .write_begin = fat_write_begin,
241 .write_end = fat_write_end, 240 .write_end = fat_write_end,
242 .direct_IO = fat_direct_IO, 241 .direct_IO = fat_direct_IO,
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index 1429f3ae1e86..5d318c44f855 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -44,7 +44,6 @@ static sector_t vxfs_bmap(struct address_space *, sector_t);
44const struct address_space_operations vxfs_aops = { 44const struct address_space_operations vxfs_aops = {
45 .readpage = vxfs_readpage, 45 .readpage = vxfs_readpage,
46 .bmap = vxfs_bmap, 46 .bmap = vxfs_bmap,
47 .sync_page = block_sync_page,
48}; 47};
49 48
50inline void 49inline void
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 051b1a084528..cc6ec4b2f0ff 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -870,7 +870,6 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
870 870
871 fc->bdi.name = "fuse"; 871 fc->bdi.name = "fuse";
872 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 872 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
873 fc->bdi.unplug_io_fn = default_unplug_io_fn;
874 /* fuse does it's own writeback accounting */ 873 /* fuse does it's own writeback accounting */
875 fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; 874 fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
876 875
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index aad77e4f61b5..c71995b111bf 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1117,7 +1117,6 @@ static const struct address_space_operations gfs2_writeback_aops = {
1117 .writepages = gfs2_writeback_writepages, 1117 .writepages = gfs2_writeback_writepages,
1118 .readpage = gfs2_readpage, 1118 .readpage = gfs2_readpage,
1119 .readpages = gfs2_readpages, 1119 .readpages = gfs2_readpages,
1120 .sync_page = block_sync_page,
1121 .write_begin = gfs2_write_begin, 1120 .write_begin = gfs2_write_begin,
1122 .write_end = gfs2_write_end, 1121 .write_end = gfs2_write_end,
1123 .bmap = gfs2_bmap, 1122 .bmap = gfs2_bmap,
@@ -1133,7 +1132,6 @@ static const struct address_space_operations gfs2_ordered_aops = {
1133 .writepage = gfs2_ordered_writepage, 1132 .writepage = gfs2_ordered_writepage,
1134 .readpage = gfs2_readpage, 1133 .readpage = gfs2_readpage,
1135 .readpages = gfs2_readpages, 1134 .readpages = gfs2_readpages,
1136 .sync_page = block_sync_page,
1137 .write_begin = gfs2_write_begin, 1135 .write_begin = gfs2_write_begin,
1138 .write_end = gfs2_write_end, 1136 .write_end = gfs2_write_end,
1139 .set_page_dirty = gfs2_set_page_dirty, 1137 .set_page_dirty = gfs2_set_page_dirty,
@@ -1151,7 +1149,6 @@ static const struct address_space_operations gfs2_jdata_aops = {
1151 .writepages = gfs2_jdata_writepages, 1149 .writepages = gfs2_jdata_writepages,
1152 .readpage = gfs2_readpage, 1150 .readpage = gfs2_readpage,
1153 .readpages = gfs2_readpages, 1151 .readpages = gfs2_readpages,
1154 .sync_page = block_sync_page,
1155 .write_begin = gfs2_write_begin, 1152 .write_begin = gfs2_write_begin,
1156 .write_end = gfs2_write_end, 1153 .write_end = gfs2_write_end,
1157 .set_page_dirty = gfs2_set_page_dirty, 1154 .set_page_dirty = gfs2_set_page_dirty,
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index e7ed31f858dd..5b102c1887fd 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -121,7 +121,7 @@ __acquires(&sdp->sd_ail_lock)
121 lock_buffer(bh); 121 lock_buffer(bh);
122 if (test_clear_buffer_dirty(bh)) { 122 if (test_clear_buffer_dirty(bh)) {
123 bh->b_end_io = end_buffer_write_sync; 123 bh->b_end_io = end_buffer_write_sync;
124 submit_bh(WRITE_SYNC_PLUG, bh); 124 submit_bh(WRITE_SYNC, bh);
125 } else { 125 } else {
126 unlock_buffer(bh); 126 unlock_buffer(bh);
127 brelse(bh); 127 brelse(bh);
@@ -647,7 +647,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
647 lock_buffer(bh); 647 lock_buffer(bh);
648 if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) { 648 if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
649 bh->b_end_io = end_buffer_write_sync; 649 bh->b_end_io = end_buffer_write_sync;
650 submit_bh(WRITE_SYNC_PLUG, bh); 650 submit_bh(WRITE_SYNC, bh);
651 } else { 651 } else {
652 unlock_buffer(bh); 652 unlock_buffer(bh);
653 brelse(bh); 653 brelse(bh);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index e919abf25ecd..51d27f00ebb4 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -204,7 +204,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
204 } 204 }
205 205
206 gfs2_log_unlock(sdp); 206 gfs2_log_unlock(sdp);
207 submit_bh(WRITE_SYNC_PLUG, bh); 207 submit_bh(WRITE_SYNC, bh);
208 gfs2_log_lock(sdp); 208 gfs2_log_lock(sdp);
209 209
210 n = 0; 210 n = 0;
@@ -214,7 +214,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
214 gfs2_log_unlock(sdp); 214 gfs2_log_unlock(sdp);
215 lock_buffer(bd2->bd_bh); 215 lock_buffer(bd2->bd_bh);
216 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh); 216 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
217 submit_bh(WRITE_SYNC_PLUG, bh); 217 submit_bh(WRITE_SYNC, bh);
218 gfs2_log_lock(sdp); 218 gfs2_log_lock(sdp);
219 if (++n >= num) 219 if (++n >= num)
220 break; 220 break;
@@ -356,7 +356,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
356 sdp->sd_log_num_revoke--; 356 sdp->sd_log_num_revoke--;
357 357
358 if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { 358 if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
359 submit_bh(WRITE_SYNC_PLUG, bh); 359 submit_bh(WRITE_SYNC, bh);
360 360
361 bh = gfs2_log_get_buf(sdp); 361 bh = gfs2_log_get_buf(sdp);
362 mh = (struct gfs2_meta_header *)bh->b_data; 362 mh = (struct gfs2_meta_header *)bh->b_data;
@@ -373,7 +373,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
373 } 373 }
374 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); 374 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
375 375
376 submit_bh(WRITE_SYNC_PLUG, bh); 376 submit_bh(WRITE_SYNC, bh);
377} 377}
378 378
379static void revoke_lo_before_scan(struct gfs2_jdesc *jd, 379static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
@@ -575,7 +575,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
575 ptr = bh_log_ptr(bh); 575 ptr = bh_log_ptr(bh);
576 576
577 get_bh(bh); 577 get_bh(bh);
578 submit_bh(WRITE_SYNC_PLUG, bh); 578 submit_bh(WRITE_SYNC, bh);
579 gfs2_log_lock(sdp); 579 gfs2_log_lock(sdp);
580 while(!list_empty(list)) { 580 while(!list_empty(list)) {
581 bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list); 581 bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list);
@@ -601,7 +601,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
601 } else { 601 } else {
602 bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh); 602 bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh);
603 } 603 }
604 submit_bh(WRITE_SYNC_PLUG, bh1); 604 submit_bh(WRITE_SYNC, bh1);
605 gfs2_log_lock(sdp); 605 gfs2_log_lock(sdp);
606 ptr += 2; 606 ptr += 2;
607 } 607 }
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 01d97f486553..675349b5a133 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -37,7 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
37 struct buffer_head *bh, *head; 37 struct buffer_head *bh, *head;
38 int nr_underway = 0; 38 int nr_underway = 0;
39 int write_op = REQ_META | 39 int write_op = REQ_META |
40 (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC_PLUG : WRITE); 40 (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
41 41
42 BUG_ON(!PageLocked(page)); 42 BUG_ON(!PageLocked(page));
43 BUG_ON(!page_has_buffers(page)); 43 BUG_ON(!page_has_buffers(page));
@@ -94,7 +94,6 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
94const struct address_space_operations gfs2_meta_aops = { 94const struct address_space_operations gfs2_meta_aops = {
95 .writepage = gfs2_aspace_writepage, 95 .writepage = gfs2_aspace_writepage,
96 .releasepage = gfs2_releasepage, 96 .releasepage = gfs2_releasepage,
97 .sync_page = block_sync_page,
98}; 97};
99 98
100/** 99/**
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index dffb4e996643..fff16c968e67 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -150,7 +150,6 @@ static int hfs_writepages(struct address_space *mapping,
150const struct address_space_operations hfs_btree_aops = { 150const struct address_space_operations hfs_btree_aops = {
151 .readpage = hfs_readpage, 151 .readpage = hfs_readpage,
152 .writepage = hfs_writepage, 152 .writepage = hfs_writepage,
153 .sync_page = block_sync_page,
154 .write_begin = hfs_write_begin, 153 .write_begin = hfs_write_begin,
155 .write_end = generic_write_end, 154 .write_end = generic_write_end,
156 .bmap = hfs_bmap, 155 .bmap = hfs_bmap,
@@ -160,7 +159,6 @@ const struct address_space_operations hfs_btree_aops = {
160const struct address_space_operations hfs_aops = { 159const struct address_space_operations hfs_aops = {
161 .readpage = hfs_readpage, 160 .readpage = hfs_readpage,
162 .writepage = hfs_writepage, 161 .writepage = hfs_writepage,
163 .sync_page = block_sync_page,
164 .write_begin = hfs_write_begin, 162 .write_begin = hfs_write_begin,
165 .write_end = generic_write_end, 163 .write_end = generic_write_end,
166 .bmap = hfs_bmap, 164 .bmap = hfs_bmap,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index a8df651747f0..b248a6cfcad9 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -146,7 +146,6 @@ static int hfsplus_writepages(struct address_space *mapping,
146const struct address_space_operations hfsplus_btree_aops = { 146const struct address_space_operations hfsplus_btree_aops = {
147 .readpage = hfsplus_readpage, 147 .readpage = hfsplus_readpage,
148 .writepage = hfsplus_writepage, 148 .writepage = hfsplus_writepage,
149 .sync_page = block_sync_page,
150 .write_begin = hfsplus_write_begin, 149 .write_begin = hfsplus_write_begin,
151 .write_end = generic_write_end, 150 .write_end = generic_write_end,
152 .bmap = hfsplus_bmap, 151 .bmap = hfsplus_bmap,
@@ -156,7 +155,6 @@ const struct address_space_operations hfsplus_btree_aops = {
156const struct address_space_operations hfsplus_aops = { 155const struct address_space_operations hfsplus_aops = {
157 .readpage = hfsplus_readpage, 156 .readpage = hfsplus_readpage,
158 .writepage = hfsplus_writepage, 157 .writepage = hfsplus_writepage,
159 .sync_page = block_sync_page,
160 .write_begin = hfsplus_write_begin, 158 .write_begin = hfsplus_write_begin,
161 .write_end = generic_write_end, 159 .write_end = generic_write_end,
162 .bmap = hfsplus_bmap, 160 .bmap = hfsplus_bmap,
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 2dbae20450f8..9b9eb6933e43 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -119,7 +119,6 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
119const struct address_space_operations hpfs_aops = { 119const struct address_space_operations hpfs_aops = {
120 .readpage = hpfs_readpage, 120 .readpage = hpfs_readpage,
121 .writepage = hpfs_writepage, 121 .writepage = hpfs_writepage,
122 .sync_page = block_sync_page,
123 .write_begin = hpfs_write_begin, 122 .write_begin = hpfs_write_begin,
124 .write_end = generic_write_end, 123 .write_end = generic_write_end,
125 .bmap = _hpfs_bmap 124 .bmap = _hpfs_bmap
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index a0f3833c0dbf..3db5ba4568fc 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1158,7 +1158,6 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
1158 1158
1159static const struct address_space_operations isofs_aops = { 1159static const struct address_space_operations isofs_aops = {
1160 .readpage = isofs_readpage, 1160 .readpage = isofs_readpage,
1161 .sync_page = block_sync_page,
1162 .bmap = _isofs_bmap 1161 .bmap = _isofs_bmap
1163}; 1162};
1164 1163
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 34a4861c14b8..da871ee084d3 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -20,6 +20,7 @@
20#include <linux/mm.h> 20#include <linux/mm.h>
21#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/bio.h> 22#include <linux/bio.h>
23#include <linux/blkdev.h>
23 24
24/* 25/*
25 * Default IO end handler for temporary BJ_IO buffer_heads. 26 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -294,7 +295,7 @@ void journal_commit_transaction(journal_t *journal)
294 int first_tag = 0; 295 int first_tag = 0;
295 int tag_flag; 296 int tag_flag;
296 int i; 297 int i;
297 int write_op = WRITE_SYNC; 298 struct blk_plug plug;
298 299
299 /* 300 /*
300 * First job: lock down the current transaction and wait for 301 * First job: lock down the current transaction and wait for
@@ -327,13 +328,6 @@ void journal_commit_transaction(journal_t *journal)
327 spin_lock(&journal->j_state_lock); 328 spin_lock(&journal->j_state_lock);
328 commit_transaction->t_state = T_LOCKED; 329 commit_transaction->t_state = T_LOCKED;
329 330
330 /*
331 * Use plugged writes here, since we want to submit several before
332 * we unplug the device. We don't do explicit unplugging in here,
333 * instead we rely on sync_buffer() doing the unplug for us.
334 */
335 if (commit_transaction->t_synchronous_commit)
336 write_op = WRITE_SYNC_PLUG;
337 spin_lock(&commit_transaction->t_handle_lock); 331 spin_lock(&commit_transaction->t_handle_lock);
338 while (commit_transaction->t_updates) { 332 while (commit_transaction->t_updates) {
339 DEFINE_WAIT(wait); 333 DEFINE_WAIT(wait);
@@ -418,8 +412,10 @@ void journal_commit_transaction(journal_t *journal)
418 * Now start flushing things to disk, in the order they appear 412 * Now start flushing things to disk, in the order they appear
419 * on the transaction lists. Data blocks go first. 413 * on the transaction lists. Data blocks go first.
420 */ 414 */
415 blk_start_plug(&plug);
421 err = journal_submit_data_buffers(journal, commit_transaction, 416 err = journal_submit_data_buffers(journal, commit_transaction,
422 write_op); 417 WRITE_SYNC);
418 blk_finish_plug(&plug);
423 419
424 /* 420 /*
425 * Wait for all previously submitted IO to complete. 421 * Wait for all previously submitted IO to complete.
@@ -480,7 +476,9 @@ void journal_commit_transaction(journal_t *journal)
480 err = 0; 476 err = 0;
481 } 477 }
482 478
483 journal_write_revoke_records(journal, commit_transaction, write_op); 479 blk_start_plug(&plug);
480
481 journal_write_revoke_records(journal, commit_transaction, WRITE_SYNC);
484 482
485 /* 483 /*
486 * If we found any dirty or locked buffers, then we should have 484 * If we found any dirty or locked buffers, then we should have
@@ -650,7 +648,7 @@ start_journal_io:
650 clear_buffer_dirty(bh); 648 clear_buffer_dirty(bh);
651 set_buffer_uptodate(bh); 649 set_buffer_uptodate(bh);
652 bh->b_end_io = journal_end_buffer_io_sync; 650 bh->b_end_io = journal_end_buffer_io_sync;
653 submit_bh(write_op, bh); 651 submit_bh(WRITE_SYNC, bh);
654 } 652 }
655 cond_resched(); 653 cond_resched();
656 654
@@ -661,6 +659,8 @@ start_journal_io:
661 } 659 }
662 } 660 }
663 661
662 blk_finish_plug(&plug);
663
664 /* Lo and behold: we have just managed to send a transaction to 664 /* Lo and behold: we have just managed to send a transaction to
665 the log. Before we can commit it, wait for the IO so far to 665 the log. Before we can commit it, wait for the IO so far to
666 complete. Control buffers being written are on the 666 complete. Control buffers being written are on the
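In journal_commit_transaction() the per-transaction write_op selection (WRITE_SYNC vs WRITE_SYNC_PLUG) is replaced by an explicit plug around each submission phase; batching is now expressed by the plug's lifetime rather than by a request flag. Condensed from the hunks above:

    struct blk_plug plug;

    blk_start_plug(&plug);
    err = journal_submit_data_buffers(journal, commit_transaction,
                                      WRITE_SYNC);
    blk_finish_plug(&plug); /* data buffers are now in flight */
    /* ... only then wait for the submitted data I/O to complete ... */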
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index f3ad1598b201..fa36d7662b21 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -137,9 +137,9 @@ static int journal_submit_commit_record(journal_t *journal,
137 if (journal->j_flags & JBD2_BARRIER && 137 if (journal->j_flags & JBD2_BARRIER &&
138 !JBD2_HAS_INCOMPAT_FEATURE(journal, 138 !JBD2_HAS_INCOMPAT_FEATURE(journal,
139 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) 139 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
140 ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh); 140 ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
141 else 141 else
142 ret = submit_bh(WRITE_SYNC_PLUG, bh); 142 ret = submit_bh(WRITE_SYNC, bh);
143 143
144 *cbh = bh; 144 *cbh = bh;
145 return ret; 145 return ret;
@@ -329,7 +329,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
329 int tag_bytes = journal_tag_bytes(journal); 329 int tag_bytes = journal_tag_bytes(journal);
330 struct buffer_head *cbh = NULL; /* For transactional checksums */ 330 struct buffer_head *cbh = NULL; /* For transactional checksums */
331 __u32 crc32_sum = ~0; 331 __u32 crc32_sum = ~0;
332 int write_op = WRITE_SYNC; 332 struct blk_plug plug;
333 333
334 /* 334 /*
335 * First job: lock down the current transaction and wait for 335 * First job: lock down the current transaction and wait for
@@ -363,13 +363,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
363 write_lock(&journal->j_state_lock); 363 write_lock(&journal->j_state_lock);
364 commit_transaction->t_state = T_LOCKED; 364 commit_transaction->t_state = T_LOCKED;
365 365
366 /*
367 * Use plugged writes here, since we want to submit several before
368 * we unplug the device. We don't do explicit unplugging in here,
369 * instead we rely on sync_buffer() doing the unplug for us.
370 */
371 if (commit_transaction->t_synchronous_commit)
372 write_op = WRITE_SYNC_PLUG;
373 trace_jbd2_commit_locking(journal, commit_transaction); 366 trace_jbd2_commit_locking(journal, commit_transaction);
374 stats.run.rs_wait = commit_transaction->t_max_wait; 367 stats.run.rs_wait = commit_transaction->t_max_wait;
375 stats.run.rs_locked = jiffies; 368 stats.run.rs_locked = jiffies;
@@ -469,8 +462,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
469 if (err) 462 if (err)
470 jbd2_journal_abort(journal, err); 463 jbd2_journal_abort(journal, err);
471 464
465 blk_start_plug(&plug);
472 jbd2_journal_write_revoke_records(journal, commit_transaction, 466 jbd2_journal_write_revoke_records(journal, commit_transaction,
473 write_op); 467 WRITE_SYNC);
468 blk_finish_plug(&plug);
474 469
475 jbd_debug(3, "JBD: commit phase 2\n"); 470 jbd_debug(3, "JBD: commit phase 2\n");
476 471
@@ -497,6 +492,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
497 err = 0; 492 err = 0;
498 descriptor = NULL; 493 descriptor = NULL;
499 bufs = 0; 494 bufs = 0;
495 blk_start_plug(&plug);
500 while (commit_transaction->t_buffers) { 496 while (commit_transaction->t_buffers) {
501 497
502 /* Find the next buffer to be journaled... */ 498 /* Find the next buffer to be journaled... */
@@ -658,7 +654,7 @@ start_journal_io:
658 clear_buffer_dirty(bh); 654 clear_buffer_dirty(bh);
659 set_buffer_uptodate(bh); 655 set_buffer_uptodate(bh);
660 bh->b_end_io = journal_end_buffer_io_sync; 656 bh->b_end_io = journal_end_buffer_io_sync;
661 submit_bh(write_op, bh); 657 submit_bh(WRITE_SYNC, bh);
662 } 658 }
663 cond_resched(); 659 cond_resched();
664 stats.run.rs_blocks_logged += bufs; 660 stats.run.rs_blocks_logged += bufs;
@@ -699,6 +695,8 @@ start_journal_io:
699 __jbd2_journal_abort_hard(journal); 695 __jbd2_journal_abort_hard(journal);
700 } 696 }
701 697
698 blk_finish_plug(&plug);
699
702 /* Lo and behold: we have just managed to send a transaction to 700 /* Lo and behold: we have just managed to send a transaction to
703 the log. Before we can commit it, wait for the IO so far to 701 the log. Before we can commit it, wait for the IO so far to
704 complete. Control buffers being written are on the 702 complete. Control buffers being written are on the
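jbd2 gets the same treatment, with one extra detail: a second plug is opened before the descriptor-writing loop over commit_transaction->t_buffers and only finished after the last submit_bh(WRITE_SYNC, bh), so the full set of log blocks reaches the elevator as one batch before the commit code starts waiting on it:

    blk_start_plug(&plug);
    while (commit_transaction->t_buffers) {
            /* ... journal each buffer, submit_bh(WRITE_SYNC, bh) ... */
    }
    blk_finish_plug(&plug);
    /* only now wait for the log writes */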
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 9978803ceedc..eddbb373209e 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -352,7 +352,6 @@ const struct address_space_operations jfs_aops = {
352 .readpages = jfs_readpages, 352 .readpages = jfs_readpages,
353 .writepage = jfs_writepage, 353 .writepage = jfs_writepage,
354 .writepages = jfs_writepages, 354 .writepages = jfs_writepages,
355 .sync_page = block_sync_page,
356 .write_begin = jfs_write_begin, 355 .write_begin = jfs_write_begin,
357 .write_end = nobh_write_end, 356 .write_end = nobh_write_end,
358 .bmap = jfs_bmap, 357 .bmap = jfs_bmap,
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 48b44bd8267b..6740d34cd82b 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -583,7 +583,6 @@ static void metapage_invalidatepage(struct page *page, unsigned long offset)
583const struct address_space_operations jfs_metapage_aops = { 583const struct address_space_operations jfs_metapage_aops = {
584 .readpage = metapage_readpage, 584 .readpage = metapage_readpage,
585 .writepage = metapage_writepage, 585 .writepage = metapage_writepage,
586 .sync_page = block_sync_page,
587 .releasepage = metapage_releasepage, 586 .releasepage = metapage_releasepage,
588 .invalidatepage = metapage_invalidatepage, 587 .invalidatepage = metapage_invalidatepage,
589 .set_page_dirty = __set_page_dirty_nobuffers, 588 .set_page_dirty = __set_page_dirty_nobuffers,
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 723bc5bca09a..1adc8d455f0e 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -39,7 +39,6 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw)
39 bio.bi_end_io = request_complete; 39 bio.bi_end_io = request_complete;
40 40
41 submit_bio(rw, &bio); 41 submit_bio(rw, &bio);
42 generic_unplug_device(bdev_get_queue(bdev));
43 wait_for_completion(&complete); 42 wait_for_completion(&complete);
44 return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO; 43 return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO;
45} 44}
@@ -168,7 +167,6 @@ static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len)
168 } 167 }
169 len = PAGE_ALIGN(len); 168 len = PAGE_ALIGN(len);
170 __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); 169 __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
171 generic_unplug_device(bdev_get_queue(logfs_super(sb)->s_bdev));
172} 170}
173 171
174 172
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index ae0b83f476a6..adcdc0a4e182 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -399,7 +399,6 @@ static sector_t minix_bmap(struct address_space *mapping, sector_t block)
399static const struct address_space_operations minix_aops = { 399static const struct address_space_operations minix_aops = {
400 .readpage = minix_readpage, 400 .readpage = minix_readpage,
401 .writepage = minix_writepage, 401 .writepage = minix_writepage,
402 .sync_page = block_sync_page,
403 .write_begin = minix_write_begin, 402 .write_begin = minix_write_begin,
404 .write_end = generic_write_end, 403 .write_end = generic_write_end,
405 .bmap = minix_bmap 404 .bmap = minix_bmap
diff --git a/fs/mpage.c b/fs/mpage.c
index d78455a81ec9..0afc809e46e0 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -364,6 +364,9 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
364 sector_t last_block_in_bio = 0; 364 sector_t last_block_in_bio = 0;
365 struct buffer_head map_bh; 365 struct buffer_head map_bh;
366 unsigned long first_logical_block = 0; 366 unsigned long first_logical_block = 0;
367 struct blk_plug plug;
368
369 blk_start_plug(&plug);
367 370
368 map_bh.b_state = 0; 371 map_bh.b_state = 0;
369 map_bh.b_size = 0; 372 map_bh.b_size = 0;
@@ -385,6 +388,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
385 BUG_ON(!list_empty(pages)); 388 BUG_ON(!list_empty(pages));
386 if (bio) 389 if (bio)
387 mpage_bio_submit(READ, bio); 390 mpage_bio_submit(READ, bio);
391 blk_finish_plug(&plug);
388 return 0; 392 return 0;
389} 393}
390EXPORT_SYMBOL(mpage_readpages); 394EXPORT_SYMBOL(mpage_readpages);
@@ -666,8 +670,11 @@ int
666mpage_writepages(struct address_space *mapping, 670mpage_writepages(struct address_space *mapping,
667 struct writeback_control *wbc, get_block_t get_block) 671 struct writeback_control *wbc, get_block_t get_block)
668{ 672{
673 struct blk_plug plug;
669 int ret; 674 int ret;
670 675
676 blk_start_plug(&plug);
677
671 if (!get_block) 678 if (!get_block)
672 ret = generic_writepages(mapping, wbc); 679 ret = generic_writepages(mapping, wbc);
673 else { 680 else {
@@ -682,6 +689,7 @@ mpage_writepages(struct address_space *mapping,
682 if (mpd.bio) 689 if (mpd.bio)
683 mpage_bio_submit(WRITE, mpd.bio); 690 mpage_bio_submit(WRITE, mpd.bio);
684 } 691 }
692 blk_finish_plug(&plug);
685 return ret; 693 return ret;
686} 694}
687EXPORT_SYMBOL(mpage_writepages); 695EXPORT_SYMBOL(mpage_writepages);
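mpage_readpages() and mpage_writepages() previously relied on the old unplug machinery to push their bios out; each now wraps its page loop in a plug so the per-page bios can merge before hitting the device. A sketch of the writepages shape, assuming the generic_writepages() path (the function name here is hypothetical):

    #include <linux/blkdev.h>
    #include <linux/writeback.h>

    static int writepages_shape(struct address_space *mapping,
                                struct writeback_control *wbc)
    {
            struct blk_plug plug;
            int ret;

            blk_start_plug(&plug);
            ret = generic_writepages(mapping, wbc); /* queues the bios */
            blk_finish_plug(&plug);                 /* flush them as a batch */
            return ret;
    }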
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 85f7baa15f5d..609cd223eea8 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -34,15 +34,10 @@
34#include "page.h" 34#include "page.h"
35#include "btnode.h" 35#include "btnode.h"
36 36
37
38static const struct address_space_operations def_btnode_aops = {
39 .sync_page = block_sync_page,
40};
41
42void nilfs_btnode_cache_init(struct address_space *btnc, 37void nilfs_btnode_cache_init(struct address_space *btnc,
43 struct backing_dev_info *bdi) 38 struct backing_dev_info *bdi)
44{ 39{
45 nilfs_mapping_init(btnc, bdi, &def_btnode_aops); 40 nilfs_mapping_init(btnc, bdi);
46} 41}
47 42
48void nilfs_btnode_cache_clear(struct address_space *btnc) 43void nilfs_btnode_cache_clear(struct address_space *btnc)
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index caf9a6a3fb54..1c2a3e23f8b2 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -49,7 +49,6 @@
49#include "ifile.h" 49#include "ifile.h"
50 50
51static const struct address_space_operations def_gcinode_aops = { 51static const struct address_space_operations def_gcinode_aops = {
52 .sync_page = block_sync_page,
53}; 52};
54 53
55/* 54/*
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index d5625be236a8..c0aa27490c02 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -280,7 +280,6 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
280const struct address_space_operations nilfs_aops = { 280const struct address_space_operations nilfs_aops = {
281 .writepage = nilfs_writepage, 281 .writepage = nilfs_writepage,
282 .readpage = nilfs_readpage, 282 .readpage = nilfs_readpage,
283 .sync_page = block_sync_page,
284 .writepages = nilfs_writepages, 283 .writepages = nilfs_writepages,
285 .set_page_dirty = nilfs_set_page_dirty, 284 .set_page_dirty = nilfs_set_page_dirty,
286 .readpages = nilfs_readpages, 285 .readpages = nilfs_readpages,
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index a0babd2bff6a..a649b05f7069 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -399,7 +399,6 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
399 399
400static const struct address_space_operations def_mdt_aops = { 400static const struct address_space_operations def_mdt_aops = {
401 .writepage = nilfs_mdt_write_page, 401 .writepage = nilfs_mdt_write_page,
402 .sync_page = block_sync_page,
403}; 402};
404 403
405static const struct inode_operations def_mdt_iops; 404static const struct inode_operations def_mdt_iops;
@@ -438,10 +437,6 @@ void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
438 mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size); 437 mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size);
439} 438}
440 439
441static const struct address_space_operations shadow_map_aops = {
442 .sync_page = block_sync_page,
443};
444
445/** 440/**
446 * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file 441 * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file
447 * @inode: inode of the metadata file 442 * @inode: inode of the metadata file
@@ -455,9 +450,9 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
455 450
456 INIT_LIST_HEAD(&shadow->frozen_buffers); 451 INIT_LIST_HEAD(&shadow->frozen_buffers);
457 address_space_init_once(&shadow->frozen_data); 452 address_space_init_once(&shadow->frozen_data);
458 nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops); 453 nilfs_mapping_init(&shadow->frozen_data, bdi);
459 address_space_init_once(&shadow->frozen_btnodes); 454 address_space_init_once(&shadow->frozen_btnodes);
460 nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops); 455 nilfs_mapping_init(&shadow->frozen_btnodes, bdi);
461 mi->mi_shadow = shadow; 456 mi->mi_shadow = shadow;
462 return 0; 457 return 0;
463} 458}
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index a585b35fd6bc..4d2a1ee0eb47 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -493,15 +493,14 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
493} 493}
494 494
495void nilfs_mapping_init(struct address_space *mapping, 495void nilfs_mapping_init(struct address_space *mapping,
496 struct backing_dev_info *bdi, 496 struct backing_dev_info *bdi)
497 const struct address_space_operations *aops)
498{ 497{
499 mapping->host = NULL; 498 mapping->host = NULL;
500 mapping->flags = 0; 499 mapping->flags = 0;
501 mapping_set_gfp_mask(mapping, GFP_NOFS); 500 mapping_set_gfp_mask(mapping, GFP_NOFS);
502 mapping->assoc_mapping = NULL; 501 mapping->assoc_mapping = NULL;
503 mapping->backing_dev_info = bdi; 502 mapping->backing_dev_info = bdi;
504 mapping->a_ops = aops; 503 mapping->a_ops = NULL;
505} 504}
506 505
507/* 506/*
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 2a00953ebd5f..f06b79ad7493 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -62,8 +62,7 @@ int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
 void nilfs_copy_back_pages(struct address_space *, struct address_space *);
 void nilfs_clear_dirty_pages(struct address_space *);
 void nilfs_mapping_init(struct address_space *mapping,
-			struct backing_dev_info *bdi,
-			const struct address_space_operations *aops);
+			struct backing_dev_info *bdi);
 unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
 unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
 					    sector_t start_blk,
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 0f83e93935b2..2853ff20f85a 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -509,7 +509,7 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
 			 * Last BIO is always sent through the following
 			 * submission.
 			 */
-			rw |= REQ_SYNC | REQ_UNPLUG;
+			rw |= REQ_SYNC;
 			res = nilfs_segbuf_submit_bio(segbuf, &wi, rw);
 		}

diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index c3c2c7ac9020..0b1e885b8cf8 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1543,8 +1543,6 @@ err_out:
  */
 const struct address_space_operations ntfs_aops = {
 	.readpage = ntfs_readpage, /* Fill page with data. */
-	.sync_page = block_sync_page, /* Currently, just unplugs the
-					 disk request queue. */
 #ifdef NTFS_RW
 	.writepage = ntfs_writepage, /* Write dirty page to disk. */
 #endif /* NTFS_RW */
@@ -1560,8 +1558,6 @@ const struct address_space_operations ntfs_aops = {
  */
 const struct address_space_operations ntfs_mst_aops = {
 	.readpage = ntfs_readpage, /* Fill page with data. */
-	.sync_page = block_sync_page, /* Currently, just unplugs the
-					 disk request queue. */
 #ifdef NTFS_RW
 	.writepage = ntfs_writepage, /* Write dirty page to disk. */
 	.set_page_dirty = __set_page_dirty_nobuffers, /* Set the page dirty
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 6551c7cbad92..ef9ed854255c 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -698,8 +698,7 @@ lock_retry_remap:
 				"uptodate! Unplugging the disk queue "
 				"and rescheduling.");
 			get_bh(tbh);
-			blk_run_address_space(mapping);
-			schedule();
+			io_schedule();
 			put_bh(tbh);
 			if (unlikely(!buffer_uptodate(tbh)))
 				goto read_err;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 1fbb0e20131b..daea0359e974 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2043,7 +2043,6 @@ const struct address_space_operations ocfs2_aops = {
 	.write_begin = ocfs2_write_begin,
 	.write_end = ocfs2_write_end,
 	.bmap = ocfs2_bmap,
-	.sync_page = block_sync_page,
 	.direct_IO = ocfs2_direct_IO,
 	.invalidatepage = ocfs2_invalidatepage,
 	.releasepage = ocfs2_releasepage,
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index b108e863d8f6..1adab287bd24 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -367,11 +367,7 @@ static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
 static void o2hb_wait_on_io(struct o2hb_region *reg,
 			    struct o2hb_bio_wait_ctxt *wc)
 {
-	struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping;
-
-	blk_run_address_space(mapping);
 	o2hb_bio_wait_dec(wc, 1);
-
 	wait_for_completion(&wc->wc_io_complete);
 }

diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 8a6d34fa668a..d738a7e493dd 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -372,7 +372,6 @@ const struct address_space_operations omfs_aops = {
 	.readpages = omfs_readpages,
 	.writepage = omfs_writepage,
 	.writepages = omfs_writepages,
-	.sync_page = block_sync_page,
 	.write_begin = omfs_write_begin,
 	.write_end = generic_write_end,
 	.bmap = omfs_bmap,
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 9c21119512b9..ac546975031f 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -290,7 +290,8 @@ ssize_t part_inflight_show(struct device *dev,
 {
 	struct hd_struct *p = dev_to_part(dev);

-	return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]);
+	return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]),
+		atomic_read(&p->in_flight[1]));
 }

 #ifdef CONFIG_FAIL_MAKE_REQUEST
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index e63b4171d583..2b0646613f5a 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -335,7 +335,6 @@ static sector_t qnx4_bmap(struct address_space *mapping, sector_t block)
 static const struct address_space_operations qnx4_aops = {
 	.readpage = qnx4_readpage,
 	.writepage = qnx4_writepage,
-	.sync_page = block_sync_page,
 	.write_begin = qnx4_write_begin,
 	.write_end = generic_write_end,
 	.bmap = qnx4_bmap
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 1bba24bad820..4fd5bb33dbb5 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3217,7 +3217,6 @@ const struct address_space_operations reiserfs_address_space_operations = {
 	.readpages = reiserfs_readpages,
 	.releasepage = reiserfs_releasepage,
 	.invalidatepage = reiserfs_invalidatepage,
-	.sync_page = block_sync_page,
 	.write_begin = reiserfs_write_begin,
 	.write_end = reiserfs_write_end,
 	.bmap = reiserfs_aop_bmap,
diff --git a/fs/super.c b/fs/super.c
index e84864908264..8a06881b1920 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -71,6 +71,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
 #else
 	INIT_LIST_HEAD(&s->s_files);
 #endif
+	s->s_bdi = &default_backing_dev_info;
 	INIT_LIST_HEAD(&s->s_instances);
 	INIT_HLIST_BL_HEAD(&s->s_anon);
 	INIT_LIST_HEAD(&s->s_inodes);
@@ -936,6 +937,7 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
 	sb = root->d_sb;
 	BUG_ON(!sb);
 	WARN_ON(!sb->s_bdi);
+	WARN_ON(sb->s_bdi == &default_backing_dev_info);
 	sb->s_flags |= MS_BORN;

 	error = security_sb_kern_mount(sb, flags, secdata);
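
Taken together, the two fs/super.c hunks mean sb->s_bdi is never NULL: it starts out pointing at default_backing_dev_info, and mount_fs() now warns if a filesystem reaches MS_BORN without installing a real bdi. A filesystem that owns a bdi is expected to assign it from its fill_super path; a minimal sketch of that expectation (foo_fill_super() and struct foo_info are hypothetical, the pattern follows the ubifs hunk below):

	static int foo_fill_super(struct super_block *sb, void *data, int silent)
	{
		struct foo_info *c = data;	/* hypothetical per-fs context
						 * embedding a backing_dev_info */
		int err;

		err = bdi_init(&c->bdi);	/* no unplug_io_fn to set up anymore */
		if (err)
			return err;
		sb->s_bdi = &c->bdi;	/* replace default_backing_dev_info,
					 * or mount_fs() will WARN */
		return 0;
	}
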
diff --git a/fs/sync.c b/fs/sync.c
index 92ca208777d5..c38ec163da6c 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -34,7 +34,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
 	 * This should be safe, as we require bdi backing to actually
 	 * write out data in the first place
 	 */
-	if (!sb->s_bdi || sb->s_bdi == &noop_backing_dev_info)
+	if (sb->s_bdi == &noop_backing_dev_info)
 		return 0;

 	if (sb->s_qcop && sb->s_qcop->quota_sync)
@@ -80,7 +80,7 @@ EXPORT_SYMBOL_GPL(sync_filesystem);

 static void sync_one_sb(struct super_block *sb, void *arg)
 {
-	if (!(sb->s_flags & MS_RDONLY) && sb->s_bdi)
+	if (!(sb->s_flags & MS_RDONLY))
 		__sync_filesystem(sb, *(int *)arg);
 }
 /*
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 9ca66276315e..fa8d43c92bb8 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -488,7 +488,6 @@ static sector_t sysv_bmap(struct address_space *mapping, sector_t block)
 const struct address_space_operations sysv_aops = {
 	.readpage = sysv_readpage,
 	.writepage = sysv_writepage,
-	.sync_page = block_sync_page,
 	.write_begin = sysv_write_begin,
 	.write_end = generic_write_end,
 	.bmap = sysv_bmap
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index e5dc1e120e8d..6ddd9973e681 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2011,7 +2011,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 	 */
 	c->bdi.name = "ubifs",
 	c->bdi.capabilities = BDI_CAP_MAP_COPY;
-	c->bdi.unplug_io_fn = default_unplug_io_fn;
 	err = bdi_init(&c->bdi);
 	if (err)
 		goto out_close;
diff --git a/fs/udf/file.c b/fs/udf/file.c
index f391a2adc699..2a346bb1d9f5 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -98,7 +98,6 @@ static int udf_adinicb_write_end(struct file *file,
 const struct address_space_operations udf_adinicb_aops = {
 	.readpage = udf_adinicb_readpage,
 	.writepage = udf_adinicb_writepage,
-	.sync_page = block_sync_page,
 	.write_begin = simple_write_begin,
 	.write_end = udf_adinicb_write_end,
 };
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index ccc814321414..1d1358ed80c1 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -140,7 +140,6 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block)
 const struct address_space_operations udf_aops = {
 	.readpage = udf_readpage,
 	.writepage = udf_writepage,
-	.sync_page = block_sync_page,
 	.write_begin = udf_write_begin,
 	.write_end = generic_write_end,
 	.bmap = udf_bmap,
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 03c255f12df5..27a4babe7df0 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -552,7 +552,6 @@ static sector_t ufs_bmap(struct address_space *mapping, sector_t block)
 const struct address_space_operations ufs_aops = {
 	.readpage = ufs_readpage,
 	.writepage = ufs_writepage,
-	.sync_page = block_sync_page,
 	.write_begin = ufs_write_begin,
 	.write_end = generic_write_end,
 	.bmap = ufs_bmap
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index e56a4f567212..11014302c9ca 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -479,7 +479,7 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size)
 			break;
 		if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
 			ufs_sync_inode (inode);
-		blk_run_address_space(inode->i_mapping);
+		blk_flush_plug(current);
 		yield();
 	}

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 8c5c87277456..52dbd14260ba 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -413,8 +413,7 @@ xfs_submit_ioend_bio(
 	if (xfs_ioend_new_eof(ioend))
 		xfs_mark_inode_dirty(XFS_I(ioend->io_inode));

-	submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
-		   WRITE_SYNC_PLUG : WRITE, bio);
+	submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
 }

 STATIC struct bio *
@@ -1495,7 +1494,6 @@ const struct address_space_operations xfs_address_space_operations = {
 	.readpages = xfs_vm_readpages,
 	.writepage = xfs_vm_writepage,
 	.writepages = xfs_vm_writepages,
-	.sync_page = block_sync_page,
 	.releasepage = xfs_vm_releasepage,
 	.invalidatepage = xfs_vm_invalidatepage,
 	.write_begin = xfs_vm_write_begin,
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 5cb230f2cb4f..c05324d3282c 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -990,7 +990,7 @@ xfs_buf_lock(
 	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
 		xfs_log_force(bp->b_target->bt_mount, 0);
 	if (atomic_read(&bp->b_io_remaining))
-		blk_run_address_space(bp->b_target->bt_mapping);
+		blk_flush_plug(current);
 	down(&bp->b_sema);
 	XB_SET_OWNER(bp);

@@ -1034,9 +1034,7 @@ xfs_buf_wait_unpin(
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		if (atomic_read(&bp->b_pin_count) == 0)
 			break;
-		if (atomic_read(&bp->b_io_remaining))
-			blk_run_address_space(bp->b_target->bt_mapping);
-		schedule();
+		io_schedule();
 	}
 	remove_wait_queue(&bp->b_waiters, &wait);
 	set_current_state(TASK_RUNNING);
@@ -1442,7 +1440,7 @@ xfs_buf_iowait(
 	trace_xfs_buf_iowait(bp, _RET_IP_);

 	if (atomic_read(&bp->b_io_remaining))
-		blk_run_address_space(bp->b_target->bt_mapping);
+		blk_flush_plug(current);
 	wait_for_completion(&bp->b_iowait);

 	trace_xfs_buf_iowait_done(bp, _RET_IP_);
@@ -1666,7 +1664,6 @@ xfs_mapping_buftarg(
 	struct inode *inode;
 	struct address_space *mapping;
 	static const struct address_space_operations mapping_aops = {
-		.sync_page = block_sync_page,
 		.migratepage = fail_migrate_page,
 	};

@@ -1947,7 +1944,7 @@ xfsbufd(
 			count++;
 		}
 		if (count)
-			blk_run_address_space(target->bt_mapping);
+			blk_flush_plug(current);

 	} while (!kthread_should_stop());

@@ -1995,7 +1992,7 @@ xfs_flush_buftarg(

 	if (wait) {
 		/* Expedite and wait for IO to complete. */
-		blk_run_address_space(target->bt_mapping);
+		blk_flush_plug(current);
 		while (!list_empty(&wait_list)) {
 			bp = list_first_entry(&wait_list, struct xfs_buf, b_list);

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 4ce34fa937d4..96f4094b706d 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -66,8 +66,6 @@ struct backing_dev_info {
 	unsigned int capabilities; /* Device capabilities */
 	congested_fn *congested_fn; /* Function pointer if device is md/dm */
 	void *congested_data;	/* Pointer to aux data for congested func */
-	void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
-	void *unplug_io_data;

 	char *name;

@@ -251,7 +249,6 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);

 extern struct backing_dev_info default_backing_dev_info;
 extern struct backing_dev_info noop_backing_dev_info;
-void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page);

 int writeback_in_progress(struct backing_dev_info *bdi);

@@ -336,17 +333,4 @@ static inline int bdi_sched_wait(void *word)
 	return 0;
 }

-static inline void blk_run_backing_dev(struct backing_dev_info *bdi,
-				       struct page *page)
-{
-	if (bdi && bdi->unplug_io_fn)
-		bdi->unplug_io_fn(bdi, page);
-}
-
-static inline void blk_run_address_space(struct address_space *mapping)
-{
-	if (mapping)
-		blk_run_backing_dev(mapping->backing_dev_info, NULL);
-}
-
 #endif /* _LINUX_BACKING_DEV_H */
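
With blk_run_backing_dev() and blk_run_address_space() gone, a caller can no longer kick someone else's queue; it can only submit the requests sitting on its own plug. The conversions above (ntfs, ocfs2, ufs, xfs) all reduce to the same pattern, sketched here with a hypothetical foo_wait_for_io():

	/* Minimal sketch of the converted wait pattern; foo_wait_for_io()
	 * and its completion are illustrative, not part of this series. */
	static void foo_wait_for_io(struct completion *done)
	{
		/* was: blk_run_address_space(mapping); */
		blk_flush_plug(current);	/* push out our own plugged I/O */
		wait_for_completion(done);
	}

Callers that sleep via io_schedule(), as ntfs/compress.c and xfs_buf_wait_unpin() now do, need no explicit call at all, since io_schedule() flushes the plug itself per the kernel/sched.c hunk further down.
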
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 35dcdb3589bc..ce33e6868a2f 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -304,7 +304,6 @@ struct biovec_slab {
 };

 extern struct bio_set *fs_bio_set;
-extern struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly;

 /*
  * a small number of entries is fine, not going to be performance critical.
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 46ad5197537a..be50d9e70a7d 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -128,7 +128,6 @@ enum rq_flag_bits {
 	__REQ_NOIDLE,	/* don't anticipate more IO after this one */

 	/* bio only flags */
-	__REQ_UNPLUG,	/* unplug the immediately after submission */
 	__REQ_RAHEAD,	/* read ahead, can fail anytime */
 	__REQ_THROTTLED,	/* This bio has already been subjected to
 				 * throttling rules. Don't do it again. */
@@ -148,9 +147,11 @@ enum rq_flag_bits {
 	__REQ_ALLOCED,	/* request came from our alloc pool */
 	__REQ_COPY_USER,	/* contains copies of user pages */
 	__REQ_FLUSH,	/* request for cache flush */
+	__REQ_FLUSH_SEQ,	/* request for flush sequence */
 	__REQ_IO_STAT,	/* account I/O stat */
 	__REQ_MIXED_MERGE,	/* merge of different types, fail separately */
 	__REQ_SECURE,	/* secure discard (used with __REQ_DISCARD) */
+	__REQ_ON_PLUG,	/* on plug list */
 	__REQ_NR_BITS,	/* stops here */
 };

@@ -170,7 +171,6 @@ enum rq_flag_bits {
 			 REQ_NOIDLE | REQ_FLUSH | REQ_FUA)
 #define REQ_CLONE_MASK	REQ_COMMON_MASK

-#define REQ_UNPLUG	(1 << __REQ_UNPLUG)
 #define REQ_RAHEAD	(1 << __REQ_RAHEAD)
 #define REQ_THROTTLED	(1 << __REQ_THROTTLED)

@@ -188,8 +188,10 @@ enum rq_flag_bits {
 #define REQ_ALLOCED	(1 << __REQ_ALLOCED)
 #define REQ_COPY_USER	(1 << __REQ_COPY_USER)
 #define REQ_FLUSH	(1 << __REQ_FLUSH)
+#define REQ_FLUSH_SEQ	(1 << __REQ_FLUSH_SEQ)
 #define REQ_IO_STAT	(1 << __REQ_IO_STAT)
 #define REQ_MIXED_MERGE	(1 << __REQ_MIXED_MERGE)
 #define REQ_SECURE	(1 << __REQ_SECURE)
+#define REQ_ON_PLUG	(1 << __REQ_ON_PLUG)

 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d5063e1b5555..16a902f099ac 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -108,11 +108,17 @@ struct request {

 	/*
 	 * Three pointers are available for the IO schedulers, if they need
-	 * more they have to dynamically allocate it.
+	 * more they have to dynamically allocate it. Flush requests are
+	 * never put on the IO scheduler. So let the flush fields share
+	 * space with the three elevator_private pointers.
 	 */
-	void *elevator_private;
-	void *elevator_private2;
-	void *elevator_private3;
+	union {
+		void *elevator_private[3];
+		struct {
+			unsigned int seq;
+			struct list_head list;
+		} flush;
+	};

 	struct gendisk *rq_disk;
 	struct hd_struct *part;
@@ -190,7 +196,6 @@ typedef void (request_fn_proc) (struct request_queue *q);
 typedef int (make_request_fn) (struct request_queue *q, struct bio *bio);
 typedef int (prep_rq_fn) (struct request_queue *, struct request *);
 typedef void (unprep_rq_fn) (struct request_queue *, struct request *);
-typedef void (unplug_fn) (struct request_queue *);

 struct bio_vec;
 struct bvec_merge_data {
@@ -273,7 +278,6 @@ struct request_queue
 	make_request_fn *make_request_fn;
 	prep_rq_fn *prep_rq_fn;
 	unprep_rq_fn *unprep_rq_fn;
-	unplug_fn *unplug_fn;
 	merge_bvec_fn *merge_bvec_fn;
 	softirq_done_fn *softirq_done_fn;
 	rq_timed_out_fn *rq_timed_out_fn;
@@ -287,12 +291,9 @@ struct request_queue
 	struct request *boundary_rq;

 	/*
-	 * Auto-unplugging state
+	 * Delayed queue handling
 	 */
-	struct timer_list unplug_timer;
-	int unplug_thresh; /* After this many requests */
-	unsigned long unplug_delay; /* After this many jiffies */
-	struct work_struct unplug_work;
+	struct delayed_work delay_work;

 	struct backing_dev_info backing_dev_info;

@@ -363,11 +364,12 @@ struct request_queue
 	 * for flush operations
 	 */
 	unsigned int flush_flags;
-	unsigned int flush_seq;
-	int flush_err;
+	unsigned int flush_pending_idx:1;
+	unsigned int flush_running_idx:1;
+	unsigned long flush_pending_since;
+	struct list_head flush_queue[2];
+	struct list_head flush_data_in_flight;
 	struct request flush_rq;
-	struct request *orig_flush_rq;
-	struct list_head pending_flushes;

 	struct mutex sysfs_lock;

@@ -387,14 +389,13 @@ struct request_queue
 #define QUEUE_FLAG_ASYNCFULL	4	/* write queue has been filled */
 #define QUEUE_FLAG_DEAD		5	/* queue being torn down */
 #define QUEUE_FLAG_REENTER	6	/* Re-entrancy avoidance */
-#define QUEUE_FLAG_PLUGGED	7	/* queue is plugged */
-#define QUEUE_FLAG_ELVSWITCH	8	/* don't use elevator, just do FIFO */
-#define QUEUE_FLAG_BIDI		9	/* queue supports bidi requests */
-#define QUEUE_FLAG_NOMERGES	10	/* disable merge attempts */
-#define QUEUE_FLAG_SAME_COMP	11	/* force complete on same CPU */
-#define QUEUE_FLAG_FAIL_IO	12	/* fake timeout */
-#define QUEUE_FLAG_STACKABLE	13	/* supports request stacking */
-#define QUEUE_FLAG_NONROT	14	/* non-rotational device (SSD) */
+#define QUEUE_FLAG_ELVSWITCH	7	/* don't use elevator, just do FIFO */
+#define QUEUE_FLAG_BIDI		8	/* queue supports bidi requests */
+#define QUEUE_FLAG_NOMERGES	9	/* disable merge attempts */
+#define QUEUE_FLAG_SAME_COMP	10	/* force complete on same CPU */
+#define QUEUE_FLAG_FAIL_IO	11	/* fake timeout */
+#define QUEUE_FLAG_STACKABLE	12	/* supports request stacking */
+#define QUEUE_FLAG_NONROT	13	/* non-rotational device (SSD) */
 #define QUEUE_FLAG_VIRT		QUEUE_FLAG_NONROT /* paravirt device */
 #define QUEUE_FLAG_IO_STAT	15	/* do IO stats */
 #define QUEUE_FLAG_DISCARD	16	/* supports DISCARD */
@@ -472,7 +473,6 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 	__clear_bit(flag, &q->queue_flags);
 }

-#define blk_queue_plugged(q)	test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 #define blk_queue_nomerges(q)	test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
@@ -667,9 +667,7 @@ extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
 extern void blk_rq_unprep_clone(struct request *rq);
 extern int blk_insert_cloned_request(struct request_queue *q,
 				     struct request *rq);
-extern void blk_plug_device(struct request_queue *);
-extern void blk_plug_device_unlocked(struct request_queue *);
-extern int blk_remove_plug(struct request_queue *);
+extern void blk_delay_queue(struct request_queue *, unsigned long);
 extern void blk_recount_segments(struct request_queue *, struct bio *);
 extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 			  unsigned int, void __user *);
@@ -713,7 +711,6 @@ extern int blk_execute_rq(struct request_queue *, struct gendisk *,
 			  struct request *, int);
 extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
 				  struct request *, int, rq_end_io_fn *);
-extern void blk_unplug(struct request_queue *q);

 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
 {
@@ -850,7 +847,6 @@ extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bd

 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
-extern void generic_unplug_device(struct request_queue *);
 extern long nr_blockdev_pages(void);

 int blk_get_queue(struct request_queue *);
@@ -858,6 +854,31 @@ struct request_queue *blk_alloc_queue(gfp_t);
 struct request_queue *blk_alloc_queue_node(gfp_t, int);
 extern void blk_put_queue(struct request_queue *);

+struct blk_plug {
+	unsigned long magic;
+	struct list_head list;
+	unsigned int should_sort;
+};
+
+extern void blk_start_plug(struct blk_plug *);
+extern void blk_finish_plug(struct blk_plug *);
+extern void __blk_flush_plug(struct task_struct *, struct blk_plug *);
+
+static inline void blk_flush_plug(struct task_struct *tsk)
+{
+	struct blk_plug *plug = tsk->plug;
+
+	if (unlikely(plug))
+		__blk_flush_plug(tsk, plug);
+}
+
+static inline bool blk_needs_flush_plug(struct task_struct *tsk)
+{
+	struct blk_plug *plug = tsk->plug;
+
+	return plug && !list_empty(&plug->list);
+}
+
 /*
  * tag stuff
  */
@@ -1135,7 +1156,6 @@ static inline uint64_t rq_io_start_time_ns(struct request *req)
 extern int blk_throtl_init(struct request_queue *q);
 extern void blk_throtl_exit(struct request_queue *q);
 extern int blk_throtl_bio(struct request_queue *q, struct bio **bio);
-extern void throtl_shutdown_timer_wq(struct request_queue *q);
 #else /* CONFIG_BLK_DEV_THROTTLING */
 static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio)
 {
@@ -1144,7 +1164,6 @@ static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio)

 static inline int blk_throtl_init(struct request_queue *q) { return 0; }
 static inline int blk_throtl_exit(struct request_queue *q) { return 0; }
-static inline void throtl_shutdown_timer_wq(struct request_queue *q) {}
 #endif /* CONFIG_BLK_DEV_THROTTLING */

 #define MODULE_ALIAS_BLOCKDEV(major,minor) \
@@ -1278,6 +1297,26 @@ static inline long nr_blockdev_pages(void)
 	return 0;
 }

+struct blk_plug {
+};
+
+static inline void blk_start_plug(struct blk_plug *plug)
+{
+}
+
+static inline void blk_finish_plug(struct blk_plug *plug)
+{
+}
+
+static inline void blk_flush_plug(struct task_struct *task)
+{
+}
+
+static inline bool blk_needs_flush_plug(struct task_struct *tsk)
+{
+	return false;
+}
+
 #endif /* CONFIG_BLOCK */

 #endif
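
The new API replaces the queue-side plug with a per-task one: a submitter declares a plug on its own stack, queues any number of requests, and flushes them in one go. A minimal sketch of the calling convention (submit_batch() and its arguments are hypothetical; submit_bio() and the plug API come from the hunks above):

	#include <linux/blkdev.h>
	#include <linux/bio.h>

	/* Batch several reads so the elevator sees them together. */
	static void submit_batch(struct bio **bios, int nr)
	{
		struct blk_plug plug;
		int i;

		blk_start_plug(&plug);	/* requests queue on current->plug */
		for (i = 0; i < nr; i++)
			submit_bio(READ, bios[i]);
		blk_finish_plug(&plug);	/* hand the whole batch to the driver */
	}

Because the plug lives on the submitter's stack and hangs off task_struct (see the sched.h hunk below), the !CONFIG_BLOCK stubs can be empty: with no block layer there is never anything to flush.
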
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 68d1fe7b877c..f5df23561b96 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -219,7 +219,6 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size);
 int block_commit_write(struct page *page, unsigned from, unsigned to);
 int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 				get_block_t get_block);
-void block_sync_page(struct page *);
 sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
 int block_truncate_page(struct address_space *, loff_t, get_block_t *);
 int nobh_write_begin(struct address_space *, loff_t, unsigned, unsigned,
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 272496d1fae4..e2768834f397 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -286,11 +286,6 @@ void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callback
 int dm_table_complete(struct dm_table *t);

 /*
- * Unplug all devices in a table.
- */
-void dm_table_unplug_all(struct dm_table *t);
-
-/*
  * Table reference counting.
  */
 struct dm_table *dm_get_live_table(struct mapped_device *md);
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 4d857973d2c9..d93efcc44570 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -20,7 +20,6 @@ typedef void (elevator_bio_merged_fn) (struct request_queue *,
 typedef int (elevator_dispatch_fn) (struct request_queue *, int);

 typedef void (elevator_add_req_fn) (struct request_queue *, struct request *);
-typedef int (elevator_queue_empty_fn) (struct request_queue *);
 typedef struct request *(elevator_request_list_fn) (struct request_queue *, struct request *);
 typedef void (elevator_completed_req_fn) (struct request_queue *, struct request *);
 typedef int (elevator_may_queue_fn) (struct request_queue *, int);
@@ -46,7 +45,6 @@ struct elevator_ops
 	elevator_activate_req_fn *elevator_activate_req_fn;
 	elevator_deactivate_req_fn *elevator_deactivate_req_fn;

-	elevator_queue_empty_fn *elevator_queue_empty_fn;
 	elevator_completed_req_fn *elevator_completed_req_fn;

 	elevator_request_list_fn *elevator_former_req_fn;
@@ -101,17 +99,17 @@ struct elevator_queue
  */
 extern void elv_dispatch_sort(struct request_queue *, struct request *);
 extern void elv_dispatch_add_tail(struct request_queue *, struct request *);
-extern void elv_add_request(struct request_queue *, struct request *, int, int);
-extern void __elv_add_request(struct request_queue *, struct request *, int, int);
+extern void elv_add_request(struct request_queue *, struct request *, int);
+extern void __elv_add_request(struct request_queue *, struct request *, int);
 extern void elv_insert(struct request_queue *, struct request *, int);
 extern int elv_merge(struct request_queue *, struct request **, struct bio *);
+extern int elv_try_merge(struct request *, struct bio *);
 extern void elv_merge_requests(struct request_queue *, struct request *,
 			       struct request *);
 extern void elv_merged_request(struct request_queue *, struct request *, int);
 extern void elv_bio_merged(struct request_queue *q, struct request *,
 			   struct bio *);
 extern void elv_requeue_request(struct request_queue *, struct request *);
-extern int elv_queue_empty(struct request_queue *);
 extern struct request *elv_former_request(struct request_queue *, struct request *);
 extern struct request *elv_latter_request(struct request_queue *, struct request *);
 extern int elv_register_queue(struct request_queue *q);
@@ -167,6 +165,8 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
 #define ELEVATOR_INSERT_BACK	2
 #define ELEVATOR_INSERT_SORT	3
 #define ELEVATOR_INSERT_REQUEUE	4
+#define ELEVATOR_INSERT_FLUSH	5
+#define ELEVATOR_INSERT_SORT_MERGE	6

 /*
  * return values from elevator_may_queue_fn
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4dda076c24a1..ce7e18555197 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -138,16 +138,10 @@ struct inodes_stat_t {
  *			block layer could (in theory) choose to ignore this
  *			request if it runs into resource problems.
  * WRITE		A normal async write. Device will be plugged.
- * WRITE_SYNC_PLUG	Synchronous write. Identical to WRITE, but passes down
+ * WRITE_SYNC		Synchronous write. Identical to WRITE, but passes down
  *			the hint that someone will be waiting on this IO
- *			shortly. The device must still be unplugged explicitly,
- *			WRITE_SYNC_PLUG does not do this as we could be
- *			submitting more writes before we actually wait on any
- *			of them.
- * WRITE_SYNC		Like WRITE_SYNC_PLUG, but also unplugs the device
- *			immediately after submission. The write equivalent
- *			of READ_SYNC.
- * WRITE_ODIRECT_PLUG	Special case write for O_DIRECT only.
+ *			shortly. The write equivalent of READ_SYNC.
+ * WRITE_ODIRECT	Special case write for O_DIRECT only.
  * WRITE_FLUSH		Like WRITE_SYNC but with preceding cache flush.
  * WRITE_FUA		Like WRITE_SYNC but data is guaranteed to be on
  *			non-volatile media on completion.
@@ -163,18 +157,14 @@ struct inodes_stat_t {
 #define WRITE		RW_MASK
 #define READA		RWA_MASK

-#define READ_SYNC	(READ | REQ_SYNC | REQ_UNPLUG)
+#define READ_SYNC	(READ | REQ_SYNC)
 #define READ_META	(READ | REQ_META)
-#define WRITE_SYNC_PLUG	(WRITE | REQ_SYNC | REQ_NOIDLE)
-#define WRITE_SYNC	(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG)
-#define WRITE_ODIRECT_PLUG	(WRITE | REQ_SYNC)
+#define WRITE_SYNC	(WRITE | REQ_SYNC | REQ_NOIDLE)
+#define WRITE_ODIRECT	(WRITE | REQ_SYNC)
 #define WRITE_META	(WRITE | REQ_META)
-#define WRITE_FLUSH	(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
-			 REQ_FLUSH)
-#define WRITE_FUA	(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
-			 REQ_FUA)
-#define WRITE_FLUSH_FUA	(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
-			 REQ_FLUSH | REQ_FUA)
+#define WRITE_FLUSH	(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH)
+#define WRITE_FUA	(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FUA)
+#define WRITE_FLUSH_FUA	(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH | REQ_FUA)

 #define SEL_IN		1
 #define SEL_OUT		2
@@ -586,7 +576,6 @@ typedef int (*read_actor_t)(read_descriptor_t *, struct page *,
 struct address_space_operations {
 	int (*writepage)(struct page *page, struct writeback_control *wbc);
 	int (*readpage)(struct file *, struct page *);
-	void (*sync_page)(struct page *);

 	/* Write back some dirty pages from this mapping. */
 	int (*writepages)(struct address_space *, struct writeback_control *);
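
With REQ_UNPLUG gone, the old three-way split (WRITE, WRITE_SYNC_PLUG, WRITE_SYNC) collapses into WRITE vs. WRITE_SYNC, and writeback paths pick between them purely on whether a waiter is expected. The xfs_submit_ioend_bio() hunk above shows the canonical conversion; as a standalone sketch (foo_submit_page() is hypothetical):

	#include <linux/fs.h>
	#include <linux/bio.h>
	#include <linux/writeback.h>

	/* Sketch: derive the write hint from the writeback mode. */
	static void foo_submit_page(struct writeback_control *wbc, struct bio *bio)
	{
		int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;

		submit_bio(rw, bio);	/* no explicit unplug needed afterwards */
	}
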
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index c0d5f6945c1e..d764a426e9fd 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -109,7 +109,7 @@ struct hd_struct {
 	int make_it_fail;
 #endif
 	unsigned long stamp;
-	int in_flight[2];
+	atomic_t in_flight[2];
 #ifdef CONFIG_SMP
 	struct disk_stats __percpu *dkstats;
 #else
@@ -370,21 +370,21 @@ static inline void free_part_stats(struct hd_struct *part)

 static inline void part_inc_in_flight(struct hd_struct *part, int rw)
 {
-	part->in_flight[rw]++;
+	atomic_inc(&part->in_flight[rw]);
 	if (part->partno)
-		part_to_disk(part)->part0.in_flight[rw]++;
+		atomic_inc(&part_to_disk(part)->part0.in_flight[rw]);
 }

 static inline void part_dec_in_flight(struct hd_struct *part, int rw)
 {
-	part->in_flight[rw]--;
+	atomic_dec(&part->in_flight[rw]);
 	if (part->partno)
-		part_to_disk(part)->part0.in_flight[rw]--;
+		atomic_dec(&part_to_disk(part)->part0.in_flight[rw]);
 }

 static inline int part_in_flight(struct hd_struct *part)
 {
-	return part->in_flight[0] + part->in_flight[1];
+	return atomic_read(&part->in_flight[0]) + atomic_read(&part->in_flight[1]);
 }

 static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk)
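
The switch to atomic_t matters because these counters are no longer serialized by a single queue lock on every path: a plain "part->in_flight[rw]++" compiles to a read-modify-write that two CPUs can interleave, silently losing an update. A sketch of the hazard the conversion closes (the counts are illustrative):

	/* CPU0                          CPU1
	 * load  in_flight[0] == 5      load  in_flight[0] == 5
	 * add 1 -> 6                   add 1 -> 6
	 * store 6                      store 6   <- one increment lost
	 *
	 * atomic_inc(&part->in_flight[rw]) makes the read-modify-write
	 * indivisible, and atomic_read() lets sysfs (part_inflight_show,
	 * converted above) report the counters without taking any lock.
	 */
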
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 29ebba54c238..c11950652646 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -298,7 +298,6 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma,

 extern void __lock_page(struct page *page);
 extern int __lock_page_killable(struct page *page);
-extern void __lock_page_nosync(struct page *page);
 extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 				unsigned int flags);
 extern void unlock_page(struct page *page);
@@ -342,17 +341,6 @@ static inline int lock_page_killable(struct page *page)
 }

 /*
- * lock_page_nosync should only be used if we can't pin the page's inode.
- * Doesn't play quite so well with block device plugging.
- */
-static inline void lock_page_nosync(struct page *page)
-{
-	might_sleep();
-	if (!trylock_page(page))
-		__lock_page_nosync(page);
-}
-
-/*
  * lock_page_or_retry - Lock the page, unless this would block and the
  * caller indicated that it can handle a retry.
  */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 98fc7ed4b191..b8369d522bf8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -99,6 +99,7 @@ struct robust_list_head;
 struct bio_list;
 struct fs_struct;
 struct perf_event_context;
+struct blk_plug;

 /*
  * List of flags we want to share for kernel threads,
@@ -1428,6 +1429,11 @@ struct task_struct {
 /* stacked block device info */
 	struct bio_list *bio_list;

+#ifdef CONFIG_BLOCK
+/* stack plugging */
+	struct blk_plug *plug;
+#endif
+
 /* VM state */
 	struct reclaim_state *reclaim_state;

diff --git a/include/linux/swap.h b/include/linux/swap.h
index ed6ebe690f4a..a5c6da5d8df8 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -309,8 +309,6 @@ extern void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
 					struct page **pagep, swp_entry_t *ent);
 #endif

-extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *);
-
 #ifdef CONFIG_SWAP
 /* linux/mm/page_io.c */
 extern int swap_readpage(struct page *);
diff --git a/kernel/exit.c b/kernel/exit.c
index f9a45ebcc7b1..6a488ad2dce5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -908,6 +908,7 @@ NORET_TYPE void do_exit(long code)
 	profile_task_exit(tsk);

 	WARN_ON(atomic_read(&tsk->fs_excl));
+	WARN_ON(blk_needs_flush_plug(tsk));

 	if (unlikely(in_interrupt()))
 		panic("Aiee, killing interrupt handler!");
diff --git a/kernel/fork.c b/kernel/fork.c
index 457fff2e17e0..e7548dee636b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1205,6 +1205,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	 * Clear TID on mm_release()?
 	 */
 	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
+#ifdef CONFIG_BLOCK
+	p->plug = NULL;
+#endif
 #ifdef CONFIG_FUTEX
 	p->robust_list = NULL;
 #ifdef CONFIG_COMPAT
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
index 83bbc7c02df9..d09dd10c5a5e 100644
--- a/kernel/power/block_io.c
+++ b/kernel/power/block_io.c
@@ -28,7 +28,7 @@
 static int submit(int rw, struct block_device *bdev, sector_t sector,
 		struct page *page, struct bio **bio_chain)
 {
-	const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG;
+	const int bio_rw = rw | REQ_SYNC;
 	struct bio *bio;

 	bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
diff --git a/kernel/sched.c b/kernel/sched.c
index 480adeb63f8f..ae659b99ce73 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4115,6 +4115,16 @@ need_resched:
 		switch_count = &prev->nvcsw;
 	}

+	/*
+	 * If we are going to sleep and we have plugged IO queued, make
+	 * sure to submit it to avoid deadlocks.
+	 */
+	if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) {
+		raw_spin_unlock(&rq->lock);
+		blk_flush_plug(prev);
+		raw_spin_lock(&rq->lock);
+	}
+
 	pre_schedule(rq, prev);

 	if (unlikely(!rq->nr_running))
@@ -5528,6 +5538,7 @@ void __sched io_schedule(void)

 	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
+	blk_flush_plug(current);
 	current->in_iowait = 1;
 	schedule();
 	current->in_iowait = 0;
@@ -5543,6 +5554,7 @@ long __sched io_schedule_timeout(long timeout)

 	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
+	blk_flush_plug(current);
 	current->in_iowait = 1;
 	ret = schedule_timeout(timeout);
 	current->in_iowait = 0;
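
The scheduler hook is what makes on-stack plugging safe: a task can queue I/O on its plug and then block on the very pages that I/O is meant to fill, and nothing else will ever submit those requests for it. Flushing the plug whenever the owner sleeps guarantees forward progress. A sketch of the pattern that would otherwise self-deadlock (bio and page setup elided; wait_on_page_locked() stands in for any sleep):

	struct blk_plug plug;

	blk_start_plug(&plug);
	submit_bio(READ, bio);		/* bio parks on current->plug, not
					 * on the device queue */
	wait_on_page_locked(page);	/* sleeping enters schedule(), which
					 * flushes the plug first, so the
					 * read actually reaches the driver */
	blk_finish_plug(&plug);
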
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index cbafed7d4f38..7aa40f8e182d 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -703,28 +703,21 @@ void blk_trace_shutdown(struct request_queue *q)
  *
  **/
 static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
 			     u32 what)
 {
 	struct blk_trace *bt = q->blk_trace;
-	int rw = rq->cmd_flags & 0x03;

 	if (likely(!bt))
 		return;

-	if (rq->cmd_flags & REQ_DISCARD)
-		rw |= REQ_DISCARD;
-
-	if (rq->cmd_flags & REQ_SECURE)
-		rw |= REQ_SECURE;
-
 	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
 		what |= BLK_TC_ACT(BLK_TC_PC);
-		__blk_add_trace(bt, 0, blk_rq_bytes(rq), rw,
+		__blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags,
 				what, rq->errors, rq->cmd_len, rq->cmd);
 	} else {
 		what |= BLK_TC_ACT(BLK_TC_FS);
-		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw,
-				what, rq->errors, 0, NULL);
+		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
+				rq->cmd_flags, what, rq->errors, 0, NULL);
 	}
 }

diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 027100d30227..8fe9d3407921 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -14,17 +14,11 @@

 static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);

-void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
-}
-EXPORT_SYMBOL(default_unplug_io_fn);
-
 struct backing_dev_info default_backing_dev_info = {
 	.name = "default",
 	.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
 	.state = 0,
 	.capabilities = BDI_CAP_MAP_COPY,
-	.unplug_io_fn = default_unplug_io_fn,
 };
 EXPORT_SYMBOL_GPL(default_backing_dev_info);

@@ -604,7 +598,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
 	spin_lock(&sb_lock);
 	list_for_each_entry(sb, &super_blocks, s_list) {
 		if (sb->s_bdi == bdi)
-			sb->s_bdi = NULL;
+			sb->s_bdi = &default_backing_dev_info;
 	}
 	spin_unlock(&sb_lock);
 }
diff --git a/mm/filemap.c b/mm/filemap.c
index f807afda86f2..04d1992fd86b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -164,45 +164,15 @@ void delete_from_page_cache(struct page *page)
164} 164}
165EXPORT_SYMBOL(delete_from_page_cache); 165EXPORT_SYMBOL(delete_from_page_cache);
166 166
167static int sync_page(void *word) 167static int sleep_on_page(void *word)
168{ 168{
169 struct address_space *mapping;
170 struct page *page;
171
172 page = container_of((unsigned long *)word, struct page, flags);
173
174 /*
175 * page_mapping() is being called without PG_locked held.
176 * Some knowledge of the state and use of the page is used to
177 * reduce the requirements down to a memory barrier.
178 * The danger here is of a stale page_mapping() return value
179 * indicating a struct address_space different from the one it's
180 * associated with when it is associated with one.
181 * After smp_mb(), it's either the correct page_mapping() for
182 * the page, or an old page_mapping() and the page's own
183 * page_mapping() has gone NULL.
184 * The ->sync_page() address_space operation must tolerate
185 * page_mapping() going NULL. By an amazing coincidence,
186 * this comes about because none of the users of the page
187 * in the ->sync_page() methods make essential use of the
188 * page_mapping(), merely passing the page down to the backing
189 * device's unplug functions when it's non-NULL, which in turn
190 * ignore it for all cases but swap, where only page_private(page) is
191 * of interest. When page_mapping() does go NULL, the entire
192 * call stack gracefully ignores the page and returns.
193 * -- wli
194 */
195 smp_mb();
196 mapping = page_mapping(page);
197 if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
198 mapping->a_ops->sync_page(page);
199 io_schedule(); 169 io_schedule();
200 return 0; 170 return 0;
201} 171}
202 172
203static int sync_page_killable(void *word) 173static int sleep_on_page_killable(void *word)
204{ 174{
205 sync_page(word); 175 sleep_on_page(word);
206 return fatal_signal_pending(current) ? -EINTR : 0; 176 return fatal_signal_pending(current) ? -EINTR : 0;
207} 177}
208 178
@@ -558,12 +528,6 @@ struct page *__page_cache_alloc(gfp_t gfp)
 EXPORT_SYMBOL(__page_cache_alloc);
 #endif
 
-static int __sleep_on_page_lock(void *word)
-{
-	io_schedule();
-	return 0;
-}
-
 /*
  * In order to wait for pages to become available there must be
  * waitqueues associated with pages. By using a hash table of
@@ -591,7 +555,7 @@ void wait_on_page_bit(struct page *page, int bit_nr)
 	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
 
 	if (test_bit(bit_nr, &page->flags))
-		__wait_on_bit(page_waitqueue(page), &wait, sync_page,
+		__wait_on_bit(page_waitqueue(page), &wait, sleep_on_page,
 							TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(wait_on_page_bit);
@@ -655,17 +619,12 @@ EXPORT_SYMBOL(end_page_writeback);
 /**
  * __lock_page - get a lock on the page, assuming we need to sleep to get it
  * @page: the page to lock
- *
- * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some
- * random driver's requestfn sets TASK_RUNNING, we could busywait. However
- * chances are that on the second loop, the block layer's plug list is empty,
- * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
  */
 void __lock_page(struct page *page)
 {
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
-	__wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
+	__wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page,
 							TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(__lock_page);
@@ -675,24 +634,10 @@ int __lock_page_killable(struct page *page)
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
 	return __wait_on_bit_lock(page_waitqueue(page), &wait,
-					sync_page_killable, TASK_KILLABLE);
+					sleep_on_page_killable, TASK_KILLABLE);
 }
 EXPORT_SYMBOL_GPL(__lock_page_killable);
 
-/**
- * __lock_page_nosync - get a lock on the page, without calling sync_page()
- * @page: the page to lock
- *
- * Variant of lock_page that does not require the caller to hold a reference
- * on the page's mapping.
- */
-void __lock_page_nosync(struct page *page)
-{
-	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
-	__wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
-							TASK_UNINTERRUPTIBLE);
-}
-
 int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 			 unsigned int flags)
 {
@@ -1407,12 +1352,15 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	unsigned long seg = 0;
 	size_t count;
 	loff_t *ppos = &iocb->ki_pos;
+	struct blk_plug plug;
 
 	count = 0;
 	retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
 	if (retval)
 		return retval;
 
+	blk_start_plug(&plug);
+
 	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
 	if (filp->f_flags & O_DIRECT) {
 		loff_t size;
@@ -1485,6 +1433,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 		break;
 	}
 out:
+	blk_finish_plug(&plug);
 	return retval;
 }
 EXPORT_SYMBOL(generic_file_aio_read);
@@ -2596,11 +2545,13 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
+	struct blk_plug plug;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
 
 	mutex_lock(&inode->i_mutex);
+	blk_start_plug(&plug);
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);
 
@@ -2611,6 +2562,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
+	blk_finish_plug(&plug);
 	return ret;
 }
 EXPORT_SYMBOL(generic_file_aio_write);
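
Both generic_file_aio_read() and generic_file_aio_write() now bracket their submission paths with an on-stack plug. A minimal sketch of the pattern, assuming only the blk_start_plug()/blk_finish_plug() pair visible in the hunks above; the helper and its I/O body are hypothetical:

	#include <linux/blkdev.h>

	static void submit_io_batch(void)
	{
		struct blk_plug plug;

		blk_start_plug(&plug);	/* hold requests on the task's plug list */
		/* ... submit a batch of bios here; adjacent requests can be
		 * merged before they reach the driver ... */
		blk_finish_plug(&plug);	/* flush the held requests to the queue */
	}

If the task sleeps while the plug is active, the scheduler flushes the plug list on its behalf, so the batched I/O is never stranded.
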
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index e0af336530c6..37feb9fec228 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -945,7 +945,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	collect_procs(ppage, &tokill);
 
 	if (hpage != ppage)
-		lock_page_nosync(ppage);
+		lock_page(ppage);
 
 	ret = try_to_unmap(ppage, ttu);
 	if (ret != SWAP_SUCCESS)
@@ -1038,7 +1038,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	 * Check "just unpoisoned", "filter hit", and
 	 * "race with other subpage."
 	 */
-	lock_page_nosync(hpage);
+	lock_page(hpage);
 	if (!PageHWPoison(hpage)
 	    || (hwpoison_filter(p) && TestClearPageHWPoison(p))
 	    || (p != hpage && TestSetPageHWPoison(hpage))) {
@@ -1088,7 +1088,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	 * It's very difficult to mess with pages currently under IO
 	 * and in many cases impossible, so we just avoid it here.
 	 */
-	lock_page_nosync(hpage);
+	lock_page(hpage);
 
 	/*
 	 * unpoison always clear PG_hwpoison inside page lock
@@ -1231,7 +1231,7 @@ int unpoison_memory(unsigned long pfn)
 		return 0;
 	}
 
-	lock_page_nosync(page);
+	lock_page(page);
 	/*
 	 * This test is racy because PG_hwpoison is set outside of page lock.
 	 * That's acceptable because that won't trigger kernel panic. Instead,
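
With ->sync_page gone, lock_page() no longer dereferences the page's mapping, which is why every lock_page_nosync() call site in this file collapses to plain lock_page(). A before/after sketch with a hypothetical caller:

	/* hwpoison-style caller that may hold no reference on page->mapping */
	static void check_poison_locked(struct page *page)
	{
		lock_page(page);	/* was: lock_page_nosync(page) */
		/* ... examine PageHWPoison() under PG_locked ... */
		unlock_page(page);
	}
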
diff --git a/mm/nommu.c b/mm/nommu.c
index e629143f9440..cb86e7d5e7f5 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1842,10 +1842,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
 }
 EXPORT_SYMBOL(remap_vmalloc_range);
 
-void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
-}
-
 unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long pgoff, unsigned long flags)
 {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 632b46479c94..31f698862420 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1040,11 +1040,17 @@ static int __writepage(struct page *page, struct writeback_control *wbc,
 int generic_writepages(struct address_space *mapping,
 		       struct writeback_control *wbc)
 {
+	struct blk_plug plug;
+	int ret;
+
 	/* deal with chardevs and other special file */
 	if (!mapping->a_ops->writepage)
 		return 0;
 
-	return write_cache_pages(mapping, wbc, __writepage, mapping);
+	blk_start_plug(&plug);
+	ret = write_cache_pages(mapping, wbc, __writepage, mapping);
+	blk_finish_plug(&plug);
+	return ret;
 }
 
 EXPORT_SYMBOL(generic_writepages);
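
generic_writepages() now owns the plug, so a filesystem that relies on the default writeback path inherits batched submission for free: do_writepages() falls back to generic_writepages() when ->writepages is absent. A hypothetical wiring (myfs_writepage is an assumed helper, not part of this patch):

	static const struct address_space_operations myfs_aops = {
		.writepage	= myfs_writepage,
		.set_page_dirty	= __set_page_dirty_nobuffers,
		/* .writepages omitted: do_writepages() uses the plugged
		 * generic_writepages() above */
	};
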
@@ -1251,7 +1257,7 @@ int set_page_dirty_lock(struct page *page)
 {
 	int ret;
 
-	lock_page_nosync(page);
+	lock_page(page);
 	ret = set_page_dirty(page);
 	unlock_page(page);
 	return ret;
diff --git a/mm/page_io.c b/mm/page_io.c
index 2dee975bf469..dc76b4d0611e 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -106,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
 		goto out;
 	}
 	if (wbc->sync_mode == WB_SYNC_ALL)
-		rw |= REQ_SYNC | REQ_UNPLUG;
+		rw |= REQ_SYNC;
 	count_vm_event(PSWPOUT);
 	set_page_writeback(page);
 	unlock_page(page);
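
The swap write-out path loses REQ_UNPLUG along with the rest of the unplug machinery: a synchronous swap write is merely tagged REQ_SYNC and dispatched when the submitter's plug drains. A condensed sketch of the flag computation (bio and wbc setup elided, as in the hunk):

	int rw = WRITE;

	if (wbc->sync_mode == WB_SYNC_ALL)
		rw |= REQ_SYNC;	/* REQ_UNPLUG no longer exists */
	submit_bio(rw, bio);
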
diff --git a/mm/readahead.c b/mm/readahead.c
index 77506a291a2d..2c0cc489e288 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -109,9 +109,12 @@ EXPORT_SYMBOL(read_cache_pages);
 static int read_pages(struct address_space *mapping, struct file *filp,
 		struct list_head *pages, unsigned nr_pages)
 {
+	struct blk_plug plug;
 	unsigned page_idx;
 	int ret;
 
+	blk_start_plug(&plug);
+
 	if (mapping->a_ops->readpages) {
 		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
 		/* Clean up the remaining pages */
@@ -129,7 +132,10 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 		page_cache_release(page);
 	}
 	ret = 0;
+
 out:
+	blk_finish_plug(&plug);
+
 	return ret;
 }
 
@@ -554,17 +560,5 @@ page_cache_async_readahead(struct address_space *mapping,
 
 	/* do read-ahead */
 	ondemand_readahead(mapping, ra, filp, true, offset, req_size);
-
-#ifdef CONFIG_BLOCK
-	/*
-	 * Normally the current page is !uptodate and lock_page() will be
-	 * immediately called to implicitly unplug the device. However this
-	 * is not always true for RAID configurations, where data arrives
-	 * not strictly in their submission order. In this case we need to
-	 * explicitly kick off the IO.
-	 */
-	if (PageUptodate(page))
-		blk_run_backing_dev(mapping->backing_dev_info, NULL);
-#endif
 }
 EXPORT_SYMBOL_GPL(page_cache_async_readahead);
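
The PageUptodate() kick at the end of page_cache_async_readahead() is gone because nothing needs to run the backing device by hand anymore: read_pages() plugs around the whole batch, and any reader that subsequently blocks flushes its own plug. A hedged sketch of a consumer, assuming only stock page-cache helpers (the wrapper name is hypothetical):

	static struct page *read_one_page(struct address_space *mapping,
					  pgoff_t index, struct file *filp)
	{
		/* read_cache_page() sleeps in lock_page()/io_schedule() if
		 * the page is not uptodate, and that sleep flushes the
		 * task's plug, starting any readahead I/O still held back. */
		return read_cache_page(mapping, index,
				(filler_t *)mapping->a_ops->readpage, filp);
	}
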
diff --git a/mm/shmem.c b/mm/shmem.c
index 91ce9a1024d7..58da7c150ba6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -224,7 +224,6 @@ static const struct vm_operations_struct shmem_vm_ops;
 static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
 	.ra_pages	= 0,	/* No readahead */
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
-	.unplug_io_fn	= default_unplug_io_fn,
 };
 
 static LIST_HEAD(shmem_swaplist);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 5c8cfabbc9bc..46680461785b 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -24,12 +24,10 @@
 
 /*
  * swapper_space is a fiction, retained to simplify the path through
- * vmscan's shrink_page_list, to make sync_page look nicer, and to allow
- * future use of radix_tree tags in the swap cache.
+ * vmscan's shrink_page_list.
  */
 static const struct address_space_operations swap_aops = {
 	.writepage	= swap_writepage,
-	.sync_page	= block_sync_page,
 	.set_page_dirty	= __set_page_dirty_nobuffers,
 	.migratepage	= migrate_page,
 };
@@ -37,7 +35,6 @@ static const struct address_space_operations swap_aops = {
 static struct backing_dev_info swap_backing_dev_info = {
 	.name		= "swap",
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
-	.unplug_io_fn	= swap_unplug_io_fn,
 };
 
 struct address_space swapper_space = {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 039e61677635..8c6b3ce38f09 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -95,39 +95,6 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
 }
 
 /*
- * We need this because the bdev->unplug_fn can sleep and we cannot
- * hold swap_lock while calling the unplug_fn. And swap_lock
- * cannot be turned into a mutex.
- */
-static DECLARE_RWSEM(swap_unplug_sem);
-
-void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
-{
-	swp_entry_t entry;
-
-	down_read(&swap_unplug_sem);
-	entry.val = page_private(page);
-	if (PageSwapCache(page)) {
-		struct block_device *bdev = swap_info[swp_type(entry)]->bdev;
-		struct backing_dev_info *bdi;
-
-		/*
-		 * If the page is removed from swapcache from under us (with a
-		 * racy try_to_unuse/swapoff) we need an additional reference
-		 * count to avoid reading garbage from page_private(page) above.
-		 * If the WARN_ON triggers during a swapoff it maybe the race
-		 * condition and it's harmless. However if it triggers without
-		 * swapoff it signals a problem.
-		 */
-		WARN_ON(page_count(page) <= 1);
-
-		bdi = bdev->bd_inode->i_mapping->backing_dev_info;
-		blk_run_backing_dev(bdi, page);
-	}
-	up_read(&swap_unplug_sem);
-}
-
-/*
  * swapon tell device that all the old swap contents can be discarded,
  * to allow the swap device to optimize its wear-levelling.
  */
@@ -1662,10 +1629,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 		goto out_dput;
 	}
 
-	/* wait for any unplug function to finish */
-	down_write(&swap_unplug_sem);
-	up_write(&swap_unplug_sem);
-
 	destroy_swap_extents(p);
 	if (p->flags & SWP_CONTINUED)
 		free_swap_count_continuations(p);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 060e4c191403..f73b8657c2d0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -358,7 +358,7 @@ static int may_write_to_queue(struct backing_dev_info *bdi,
 static void handle_write_error(struct address_space *mapping,
 				struct page *page, int error)
 {
-	lock_page_nosync(page);
+	lock_page(page);
 	if (page_mapping(page) == mapping)
 		mapping_set_error(mapping, error);
 	unlock_page(page);