-rw-r--r--  Documentation/cgroups/blkio-controller.txt | 151
-rw-r--r--  block/Kconfig | 23
-rw-r--r--  block/Kconfig.iosched | 16
-rw-r--r--  block/Makefile | 2
-rw-r--r--  block/blk-barrier.c | 147
-rw-r--r--  block/blk-cgroup.c | 791
-rw-r--r--  block/blk-cgroup.h | 178
-rw-r--r--  block/blk-core.c | 31
-rw-r--r--  block/blk-lib.c | 233
-rw-r--r--  block/cfq-iosched.c | 81
-rw-r--r--  block/elevator.c | 11
-rw-r--r--  block/genhd.c | 2
-rw-r--r--  block/ioctl.c | 2
-rw-r--r--  drivers/block/Kconfig | 22
-rw-r--r--  drivers/block/drbd/drbd_bitmap.c | 21
-rw-r--r--  drivers/block/drbd/drbd_int.h | 151
-rw-r--r--  drivers/block/drbd/drbd_main.c | 158
-rw-r--r--  drivers/block/drbd/drbd_nl.c | 52
-rw-r--r--  drivers/block/drbd/drbd_proc.c | 19
-rw-r--r--  drivers/block/drbd/drbd_receiver.c | 666
-rw-r--r--  drivers/block/drbd/drbd_req.c | 40
-rw-r--r--  drivers/block/drbd/drbd_strings.c | 2
-rw-r--r--  drivers/block/drbd/drbd_worker.c | 206
-rw-r--r--  drivers/block/drbd/drbd_wrappers.h | 16
-rw-r--r--  drivers/ide/ide-disk.c | 40
-rw-r--r--  drivers/ide/ide-gd.c | 11
-rw-r--r--  fs/block_dev.c | 257
-rw-r--r--  fs/btrfs/extent-tree.c | 2
-rw-r--r--  fs/buffer.c | 1
-rw-r--r--  fs/ext3/fsync.c | 3
-rw-r--r--  fs/ext4/fsync.c | 6
-rw-r--r--  fs/fcntl.c | 5
-rw-r--r--  fs/fs-writeback.c | 98
-rw-r--r--  fs/gfs2/rgrp.c | 5
-rw-r--r--  fs/jbd2/checkpoint.c | 3
-rw-r--r--  fs/jbd2/commit.c | 6
-rw-r--r--  fs/nilfs2/the_nilfs.c | 4
-rw-r--r--  fs/partitions/acorn.c | 68
-rw-r--r--  fs/partitions/acorn.h | 10
-rw-r--r--  fs/partitions/amiga.c | 13
-rw-r--r--  fs/partitions/amiga.h | 2
-rw-r--r--  fs/partitions/atari.c | 8
-rw-r--r--  fs/partitions/atari.h | 2
-rw-r--r--  fs/partitions/check.c | 84
-rw-r--r--  fs/partitions/check.h | 12
-rw-r--r--  fs/partitions/efi.c | 91
-rw-r--r--  fs/partitions/efi.h | 2
-rw-r--r--  fs/partitions/ibm.c | 21
-rw-r--r--  fs/partitions/ibm.h | 2
-rw-r--r--  fs/partitions/karma.c | 4
-rw-r--r--  fs/partitions/karma.h | 2
-rw-r--r--  fs/partitions/ldm.c | 89
-rw-r--r--  fs/partitions/ldm.h | 2
-rw-r--r--  fs/partitions/mac.c | 11
-rw-r--r--  fs/partitions/mac.h | 2
-rw-r--r--  fs/partitions/msdos.c | 85
-rw-r--r--  fs/partitions/msdos.h | 2
-rw-r--r--  fs/partitions/osf.c | 4
-rw-r--r--  fs/partitions/osf.h | 2
-rw-r--r--  fs/partitions/sgi.c | 6
-rw-r--r--  fs/partitions/sgi.h | 2
-rw-r--r--  fs/partitions/sun.c | 6
-rw-r--r--  fs/partitions/sun.h | 2
-rw-r--r--  fs/partitions/sysv68.c | 6
-rw-r--r--  fs/partitions/sysv68.h | 2
-rw-r--r--  fs/partitions/ultrix.c | 4
-rw-r--r--  fs/partitions/ultrix.h | 2
-rw-r--r--  fs/pipe.c | 122
-rw-r--r--  fs/reiserfs/file.c | 3
-rw-r--r--  fs/splice.c | 151
-rw-r--r--  fs/sync.c | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 3
-rw-r--r--  include/linux/backing-dev.h | 6
-rw-r--r--  include/linux/blkdev.h | 70
-rw-r--r--  include/linux/drbd.h | 5
-rw-r--r--  include/linux/drbd_limits.h | 16
-rw-r--r--  include/linux/drbd_nl.h | 5
-rw-r--r--  include/linux/elevator.h | 6
-rw-r--r--  include/linux/fcntl.h | 6
-rw-r--r--  include/linux/fs.h | 1
-rw-r--r--  include/linux/ide.h | 2
-rw-r--r--  include/linux/pipe_fs_i.h | 13
-rw-r--r--  include/linux/splice.h | 7
-rw-r--r--  include/linux/writeback.h | 18
-rw-r--r--  init/Kconfig | 27
-rw-r--r--  kernel/relay.c | 15
-rw-r--r--  kernel/sched_clock.c | 1
-rw-r--r--  kernel/sysctl.c | 9
-rw-r--r--  kernel/trace/trace.c | 60
-rw-r--r--  mm/backing-dev.c | 15
-rw-r--r--  mm/page-writeback.c | 44
-rw-r--r--  mm/swapfile.c | 9
-rw-r--r--  net/core/skbuff.c | 38
 93 files changed, 3423 insertions(+), 1241 deletions(-)
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 630879cd9a42..48e0b21b0059 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -17,6 +17,9 @@ HOWTO
17You can do a very simple testing of running two dd threads in two different 17You can do a very simple testing of running two dd threads in two different
18cgroups. Here is what you can do. 18cgroups. Here is what you can do.
19 19
20- Enable Block IO controller
21 CONFIG_BLK_CGROUP=y
22
20- Enable group scheduling in CFQ 23- Enable group scheduling in CFQ
21 CONFIG_CFQ_GROUP_IOSCHED=y 24 CONFIG_CFQ_GROUP_IOSCHED=y
22 25
@@ -54,32 +57,52 @@ cgroups. Here is what you can do.
54 57
55Various user visible config options 58Various user visible config options
56=================================== 59===================================
57CONFIG_CFQ_GROUP_IOSCHED
58 - Enables group scheduling in CFQ. Currently only 1 level of group
59 creation is allowed.
60
61CONFIG_DEBUG_CFQ_IOSCHED
62 - Enables some debugging messages in blktrace. Also creates extra
63 cgroup file blkio.dequeue.
64
65Config options selected automatically
66=====================================
67These config options are not user visible and are selected/deselected
68automatically based on IO scheduler configuration.
69
70CONFIG_BLK_CGROUP 60CONFIG_BLK_CGROUP
71 - Block IO controller. Selected by CONFIG_CFQ_GROUP_IOSCHED. 61 - Block IO controller.
72 62
73CONFIG_DEBUG_BLK_CGROUP 63CONFIG_DEBUG_BLK_CGROUP
74 - Debug help. Selected by CONFIG_DEBUG_CFQ_IOSCHED. 64 - Debug help. Right now some additional stats files show up in cgroup
65 if this option is enabled.
66
67CONFIG_CFQ_GROUP_IOSCHED
68 - Enables group scheduling in CFQ. Currently only 1 level of group
69 creation is allowed.
75 70
76Details of cgroup files 71Details of cgroup files
77======================= 72=======================
78- blkio.weight 73- blkio.weight
79 - Specifies per cgroup weight. 74 - Specifies per cgroup weight. This is default weight of the group
80 75 on all the devices until and unless overridden by per device rule.
76 (See blkio.weight_device).
81 Currently allowed range of weights is from 100 to 1000. 77 Currently allowed range of weights is from 100 to 1000.
82 78
79- blkio.weight_device
80 - One can specify per cgroup per device rules using this interface.
81 These rules override the default value of group weight as specified
82 by blkio.weight.
83
84 Following is the format.
85
 86 # echo dev_maj:dev_minor weight > /path/to/cgroup/blkio.weight_device
87 Configure weight=300 on /dev/sdb (8:16) in this cgroup
88 # echo 8:16 300 > blkio.weight_device
89 # cat blkio.weight_device
90 dev weight
91 8:16 300
92
93 Configure weight=500 on /dev/sda (8:0) in this cgroup
94 # echo 8:0 500 > blkio.weight_device
95 # cat blkio.weight_device
96 dev weight
97 8:0 500
98 8:16 300
99
100 Remove specific weight for /dev/sda in this cgroup
101 # echo 8:0 0 > blkio.weight_device
102 # cat blkio.weight_device
103 dev weight
104 8:16 300
105
83- blkio.time 106- blkio.time
84 - disk time allocated to cgroup per device in milliseconds. First 107 - disk time allocated to cgroup per device in milliseconds. First
85 two fields specify the major and minor number of the device and 108 two fields specify the major and minor number of the device and
@@ -92,13 +115,105 @@ Details of cgroup files
92 third field specifies the number of sectors transferred by the 115 third field specifies the number of sectors transferred by the
93 group to/from the device. 116 group to/from the device.
94 117
118- blkio.io_service_bytes
119 - Number of bytes transferred to/from the disk by the group. These
120 are further divided by the type of operation - read or write, sync
121 or async. First two fields specify the major and minor number of the
122 device, third field specifies the operation type and the fourth field
123 specifies the number of bytes.
124
125- blkio.io_serviced
126 - Number of IOs completed to/from the disk by the group. These
127 are further divided by the type of operation - read or write, sync
128 or async. First two fields specify the major and minor number of the
129 device, third field specifies the operation type and the fourth field
130 specifies the number of IOs.
131
132- blkio.io_service_time
133 - Total amount of time between request dispatch and request completion
134 for the IOs done by this cgroup. This is in nanoseconds to make it
135 meaningful for flash devices too. For devices with queue depth of 1,
136 this time represents the actual service time. When queue_depth > 1,
137 that is no longer true as requests may be served out of order. This
138 may cause the service time for a given IO to include the service time
139 of multiple IOs when served out of order which may result in total
140 io_service_time > actual time elapsed. This time is further divided by
141 the type of operation - read or write, sync or async. First two fields
142 specify the major and minor number of the device, third field
143 specifies the operation type and the fourth field specifies the
144 io_service_time in ns.
145
146- blkio.io_wait_time
147 - Total amount of time the IOs for this cgroup spent waiting in the
148 scheduler queues for service. This can be greater than the total time
149 elapsed since it is cumulative io_wait_time for all IOs. It is not a
150 measure of total time the cgroup spent waiting but rather a measure of
151 the wait_time for its individual IOs. For devices with queue_depth > 1
152 this metric does not include the time spent waiting for service once
153 the IO is dispatched to the device but till it actually gets serviced
154 (there might be a time lag here due to re-ordering of requests by the
155 device). This is in nanoseconds to make it meaningful for flash
156 devices too. This time is further divided by the type of operation -
157 read or write, sync or async. First two fields specify the major and
158 minor number of the device, third field specifies the operation type
159 and the fourth field specifies the io_wait_time in ns.
160
161- blkio.io_merged
162 - Total number of bios/requests merged into requests belonging to this
163 cgroup. This is further divided by the type of operation - read or
164 write, sync or async.
165
166- blkio.io_queued
167 - Total number of requests queued up at any given instant for this
168 cgroup. This is further divided by the type of operation - read or
169 write, sync or async.
170
171- blkio.avg_queue_size
172 - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
173 The average queue size for this cgroup over the entire time of this
174 cgroup's existence. Queue size samples are taken each time one of the
175 queues of this cgroup gets a timeslice.
176
177- blkio.group_wait_time
178 - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
179 This is the amount of time the cgroup had to wait since it became busy
180 (i.e., went from 0 to 1 request queued) to get a timeslice for one of
181 its queues. This is different from the io_wait_time which is the
182 cumulative total of the amount of time spent by each IO in that cgroup
183 waiting in the scheduler queue. This is in nanoseconds. If this is
184 read when the cgroup is in a waiting (for timeslice) state, the stat
185 will only report the group_wait_time accumulated till the last time it
186 got a timeslice and will not include the current delta.
187
188- blkio.empty_time
189 - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
190 This is the amount of time a cgroup spends without any pending
191 requests when not being served, i.e., it does not include any time
192 spent idling for one of the queues of the cgroup. This is in
193 nanoseconds. If this is read when the cgroup is in an empty state,
194 the stat will only report the empty_time accumulated till the last
195 time it had a pending request and will not include the current delta.
196
197- blkio.idle_time
198 - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
199 This is the amount of time spent by the IO scheduler idling for a
200 given cgroup in anticipation of a better request than the exising ones
201 from other queues/cgroups. This is in nanoseconds. If this is read
202 when the cgroup is in an idling state, the stat will only report the
203 idle_time accumulated till the last idle period and will not include
204 the current delta.
205
95- blkio.dequeue 206- blkio.dequeue
96 - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This 207 - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This
97 gives the statistics about how many a times a group was dequeued 208 gives the statistics about how many a times a group was dequeued
98 from service tree of the device. First two fields specify the major 209 from service tree of the device. First two fields specify the major
99 and minor number of the device and third field specifies the number 210 and minor number of the device and third field specifies the number
100 of times a group was dequeued from a particular device. 211 of times a group was dequeued from a particular device.
101 212
213- blkio.reset_stats
214 - Writing an int to this file will result in resetting all the stats
215 for that cgroup.
216
102CFQ sysfs tunable 217CFQ sysfs tunable
103================= 218=================
104/sys/block/<disk>/queue/iosched/group_isolation 219/sys/block/<disk>/queue/iosched/group_isolation
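
The per-device statistics files documented above all share the same "major:minor <operation> <value>" layout, with an extra cgroup-wide "Total" line for the files that set show_total. As a rough userspace sketch (hypothetical path, assuming a blkio cgroup hierarchy mounted at /cgroup with a child group test1), blkio.io_service_bytes could be parsed like this:

#include <stdio.h>
#include <string.h>

/* Hypothetical helper: sum the per-device "Total" rows of
 * blkio.io_service_bytes for one cgroup.  The parse loop stops at the
 * final cgroup-wide "Total <value>" line, which has no major:minor key. */
int main(void)
{
	FILE *f = fopen("/cgroup/test1/blkio.io_service_bytes", "r");
	unsigned int major, minor;
	char op[16];
	unsigned long long val, total = 0;

	if (!f)
		return 1;
	while (fscanf(f, "%u:%u %15s %llu", &major, &minor, op, &val) == 4) {
		if (!strcmp(op, "Total"))
			total += val;
	}
	fclose(f);
	printf("bytes transferred by this cgroup: %llu\n", total);
	return 0;
}
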
diff --git a/block/Kconfig b/block/Kconfig
index f9e89f4d94bb..9be0b56eaee1 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -77,29 +77,6 @@ config BLK_DEV_INTEGRITY
77 T10/SCSI Data Integrity Field or the T13/ATA External Path 77 T10/SCSI Data Integrity Field or the T13/ATA External Path
78 Protection. If in doubt, say N. 78 Protection. If in doubt, say N.
79 79
80config BLK_CGROUP
81 tristate "Block cgroup support"
82 depends on CGROUPS
83 depends on CFQ_GROUP_IOSCHED
84 default n
85 ---help---
86 Generic block IO controller cgroup interface. This is the common
87 cgroup interface which should be used by various IO controlling
88 policies.
89
90 Currently, CFQ IO scheduler uses it to recognize task groups and
91 control disk bandwidth allocation (proportional time slice allocation)
92 to such task groups.
93
94config DEBUG_BLK_CGROUP
95 bool
96 depends on BLK_CGROUP
97 default n
98 ---help---
99 Enable some debugging help. Currently it stores the cgroup path
100 in the blk group which can be used by cfq for tracing various
101 group related activity.
102
103endif # BLOCK 80endif # BLOCK
104 81
105config BLOCK_COMPAT 82config BLOCK_COMPAT
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index fc71cf071fb2..3199b76f795d 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -23,7 +23,8 @@ config IOSCHED_DEADLINE
23 23
24config IOSCHED_CFQ 24config IOSCHED_CFQ
25 tristate "CFQ I/O scheduler" 25 tristate "CFQ I/O scheduler"
26 select BLK_CGROUP if CFQ_GROUP_IOSCHED 26 # If BLK_CGROUP is a module, CFQ has to be built as module.
27 depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y
27 default y 28 default y
28 ---help--- 29 ---help---
29 The CFQ I/O scheduler tries to distribute bandwidth equally 30 The CFQ I/O scheduler tries to distribute bandwidth equally
@@ -33,22 +34,15 @@ config IOSCHED_CFQ
33 34
34 This is the default I/O scheduler. 35 This is the default I/O scheduler.
35 36
37 Note: If BLK_CGROUP=m, then CFQ can be built only as module.
38
36config CFQ_GROUP_IOSCHED 39config CFQ_GROUP_IOSCHED
37 bool "CFQ Group Scheduling support" 40 bool "CFQ Group Scheduling support"
38 depends on IOSCHED_CFQ && CGROUPS 41 depends on IOSCHED_CFQ && BLK_CGROUP
39 default n 42 default n
40 ---help--- 43 ---help---
41 Enable group IO scheduling in CFQ. 44 Enable group IO scheduling in CFQ.
42 45
43config DEBUG_CFQ_IOSCHED
44 bool "Debug CFQ Scheduling"
45 depends on CFQ_GROUP_IOSCHED
46 select DEBUG_BLK_CGROUP
47 default n
48 ---help---
49 Enable CFQ IO scheduling debugging in CFQ. Currently it makes
50 blktrace output more verbose.
51
52choice 46choice
53 prompt "Default I/O scheduler" 47 prompt "Default I/O scheduler"
54 default DEFAULT_CFQ 48 default DEFAULT_CFQ
diff --git a/block/Makefile b/block/Makefile
index cb2d515ebd6e..0bb499a739cd 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
5obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ 5obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
6 blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ 6 blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
7 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ 7 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
8 blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o 8 blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
9 9
10obj-$(CONFIG_BLK_DEV_BSG) += bsg.o 10obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
11obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o 11obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 6d88544b677f..0d710c9d403b 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -286,26 +286,31 @@ static void bio_end_empty_barrier(struct bio *bio, int err)
286 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); 286 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
287 clear_bit(BIO_UPTODATE, &bio->bi_flags); 287 clear_bit(BIO_UPTODATE, &bio->bi_flags);
288 } 288 }
289 289 if (bio->bi_private)
290 complete(bio->bi_private); 290 complete(bio->bi_private);
291 bio_put(bio);
291} 292}
292 293
293/** 294/**
294 * blkdev_issue_flush - queue a flush 295 * blkdev_issue_flush - queue a flush
295 * @bdev: blockdev to issue flush for 296 * @bdev: blockdev to issue flush for
297 * @gfp_mask: memory allocation flags (for bio_alloc)
296 * @error_sector: error sector 298 * @error_sector: error sector
299 * @flags: BLKDEV_IFL_* flags to control behaviour
297 * 300 *
298 * Description: 301 * Description:
299 * Issue a flush for the block device in question. Caller can supply 302 * Issue a flush for the block device in question. Caller can supply
300 * room for storing the error offset in case of a flush error, if they 303 * room for storing the error offset in case of a flush error, if they
301 * wish to. 304 * wish to. If WAIT flag is not passed then caller may check only what
305 * request was pushed in some internal queue for later handling.
302 */ 306 */
303int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) 307int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
308 sector_t *error_sector, unsigned long flags)
304{ 309{
305 DECLARE_COMPLETION_ONSTACK(wait); 310 DECLARE_COMPLETION_ONSTACK(wait);
306 struct request_queue *q; 311 struct request_queue *q;
307 struct bio *bio; 312 struct bio *bio;
308 int ret; 313 int ret = 0;
309 314
310 if (bdev->bd_disk == NULL) 315 if (bdev->bd_disk == NULL)
311 return -ENXIO; 316 return -ENXIO;
@@ -314,23 +319,25 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
314 if (!q) 319 if (!q)
315 return -ENXIO; 320 return -ENXIO;
316 321
317 bio = bio_alloc(GFP_KERNEL, 0); 322 bio = bio_alloc(gfp_mask, 0);
318 bio->bi_end_io = bio_end_empty_barrier; 323 bio->bi_end_io = bio_end_empty_barrier;
319 bio->bi_private = &wait;
320 bio->bi_bdev = bdev; 324 bio->bi_bdev = bdev;
321 submit_bio(WRITE_BARRIER, bio); 325 if (test_bit(BLKDEV_WAIT, &flags))
322 326 bio->bi_private = &wait;
323 wait_for_completion(&wait);
324 327
325 /* 328 bio_get(bio);
326 * The driver must store the error location in ->bi_sector, if 329 submit_bio(WRITE_BARRIER, bio);
327 * it supports it. For non-stacked drivers, this should be copied 330 if (test_bit(BLKDEV_WAIT, &flags)) {
328 * from blk_rq_pos(rq). 331 wait_for_completion(&wait);
329 */ 332 /*
330 if (error_sector) 333 * The driver must store the error location in ->bi_sector, if
331 *error_sector = bio->bi_sector; 334 * it supports it. For non-stacked drivers, this should be
335 * copied from blk_rq_pos(rq).
336 */
337 if (error_sector)
338 *error_sector = bio->bi_sector;
339 }
332 340
333 ret = 0;
334 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 341 if (bio_flagged(bio, BIO_EOPNOTSUPP))
335 ret = -EOPNOTSUPP; 342 ret = -EOPNOTSUPP;
336 else if (!bio_flagged(bio, BIO_UPTODATE)) 343 else if (!bio_flagged(bio, BIO_UPTODATE))
@@ -340,107 +347,3 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
340 return ret; 347 return ret;
341} 348}
342EXPORT_SYMBOL(blkdev_issue_flush); 349EXPORT_SYMBOL(blkdev_issue_flush);
343
344static void blkdev_discard_end_io(struct bio *bio, int err)
345{
346 if (err) {
347 if (err == -EOPNOTSUPP)
348 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
349 clear_bit(BIO_UPTODATE, &bio->bi_flags);
350 }
351
352 if (bio->bi_private)
353 complete(bio->bi_private);
354 __free_page(bio_page(bio));
355
356 bio_put(bio);
357}
358
359/**
360 * blkdev_issue_discard - queue a discard
361 * @bdev: blockdev to issue discard for
362 * @sector: start sector
363 * @nr_sects: number of sectors to discard
364 * @gfp_mask: memory allocation flags (for bio_alloc)
365 * @flags: DISCARD_FL_* flags to control behaviour
366 *
367 * Description:
368 * Issue a discard request for the sectors in question.
369 */
370int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
371 sector_t nr_sects, gfp_t gfp_mask, int flags)
372{
373 DECLARE_COMPLETION_ONSTACK(wait);
374 struct request_queue *q = bdev_get_queue(bdev);
375 int type = flags & DISCARD_FL_BARRIER ?
376 DISCARD_BARRIER : DISCARD_NOBARRIER;
377 struct bio *bio;
378 struct page *page;
379 int ret = 0;
380
381 if (!q)
382 return -ENXIO;
383
384 if (!blk_queue_discard(q))
385 return -EOPNOTSUPP;
386
387 while (nr_sects && !ret) {
388 unsigned int sector_size = q->limits.logical_block_size;
389 unsigned int max_discard_sectors =
390 min(q->limits.max_discard_sectors, UINT_MAX >> 9);
391
392 bio = bio_alloc(gfp_mask, 1);
393 if (!bio)
394 goto out;
395 bio->bi_sector = sector;
396 bio->bi_end_io = blkdev_discard_end_io;
397 bio->bi_bdev = bdev;
398 if (flags & DISCARD_FL_WAIT)
399 bio->bi_private = &wait;
400
401 /*
402 * Add a zeroed one-sector payload as that's what
403 * our current implementations need. If we'll ever need
404 * more the interface will need revisiting.
405 */
406 page = alloc_page(gfp_mask | __GFP_ZERO);
407 if (!page)
408 goto out_free_bio;
409 if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size)
410 goto out_free_page;
411
412 /*
413 * And override the bio size - the way discard works we
414 * touch many more blocks on disk than the actual payload
415 * length.
416 */
417 if (nr_sects > max_discard_sectors) {
418 bio->bi_size = max_discard_sectors << 9;
419 nr_sects -= max_discard_sectors;
420 sector += max_discard_sectors;
421 } else {
422 bio->bi_size = nr_sects << 9;
423 nr_sects = 0;
424 }
425
426 bio_get(bio);
427 submit_bio(type, bio);
428
429 if (flags & DISCARD_FL_WAIT)
430 wait_for_completion(&wait);
431
432 if (bio_flagged(bio, BIO_EOPNOTSUPP))
433 ret = -EOPNOTSUPP;
434 else if (!bio_flagged(bio, BIO_UPTODATE))
435 ret = -EIO;
436 bio_put(bio);
437 }
438 return ret;
439out_free_page:
440 __free_page(page);
441out_free_bio:
442 bio_put(bio);
443out:
444 return -ENOMEM;
445}
446EXPORT_SYMBOL(blkdev_issue_discard);
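
Two things change for callers in this file: blkdev_issue_flush() now takes a GFP mask and BLKDEV_IFL_* flags (waiting becomes opt-in via the BLKDEV_WAIT bit tested above), and blkdev_issue_discard() moves out of blk-barrier.c into the new block/blk-lib.c listed in the diffstat. A minimal sketch of a caller that wants the old synchronous behaviour, assuming a BLKDEV_IFL_WAIT flag value built from the BLKDEV_WAIT bit:

/* Sketch only: a filesystem-style caller of the new flush interface. */
static int flush_dev_cache(struct block_device *bdev)
{
	sector_t error_sector;

	/* Ask for the barrier to be waited on, as the old interface always did. */
	return blkdev_issue_flush(bdev, GFP_KERNEL, &error_sector,
				  BLKDEV_IFL_WAIT);
}
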
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2cc682b860ea..a6809645d212 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -15,8 +15,12 @@
15#include <linux/kdev_t.h> 15#include <linux/kdev_t.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/err.h> 17#include <linux/err.h>
18#include <linux/blkdev.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
19#include "blk-cgroup.h" 20#include "blk-cgroup.h"
21#include <linux/genhd.h>
22
23#define MAX_KEY_LEN 100
20 24
21static DEFINE_SPINLOCK(blkio_list_lock); 25static DEFINE_SPINLOCK(blkio_list_lock);
22static LIST_HEAD(blkio_list); 26static LIST_HEAD(blkio_list);
@@ -49,6 +53,32 @@ struct cgroup_subsys blkio_subsys = {
49}; 53};
50EXPORT_SYMBOL_GPL(blkio_subsys); 54EXPORT_SYMBOL_GPL(blkio_subsys);
51 55
56static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
57 struct blkio_policy_node *pn)
58{
59 list_add(&pn->node, &blkcg->policy_list);
60}
61
62/* Must be called with blkcg->lock held */
63static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
64{
65 list_del(&pn->node);
66}
67
68/* Must be called with blkcg->lock held */
69static struct blkio_policy_node *
70blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev)
71{
72 struct blkio_policy_node *pn;
73
74 list_for_each_entry(pn, &blkcg->policy_list, node) {
75 if (pn->dev == dev)
76 return pn;
77 }
78
79 return NULL;
80}
81
52struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) 82struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
53{ 83{
54 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), 84 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
@@ -56,13 +86,259 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
56} 86}
57EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); 87EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
58 88
59void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, 89/*
60 unsigned long time, unsigned long sectors) 90 * Add to the appropriate stat variable depending on the request type.
91 * This should be called with the blkg->stats_lock held.
92 */
93static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
94 bool sync)
95{
96 if (direction)
97 stat[BLKIO_STAT_WRITE] += add;
98 else
99 stat[BLKIO_STAT_READ] += add;
100 if (sync)
101 stat[BLKIO_STAT_SYNC] += add;
102 else
103 stat[BLKIO_STAT_ASYNC] += add;
104}
105
106/*
107 * Decrements the appropriate stat variable if non-zero depending on the
108 * request type. Panics on value being zero.
109 * This should be called with the blkg->stats_lock held.
110 */
111static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
112{
113 if (direction) {
114 BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
115 stat[BLKIO_STAT_WRITE]--;
116 } else {
117 BUG_ON(stat[BLKIO_STAT_READ] == 0);
118 stat[BLKIO_STAT_READ]--;
119 }
120 if (sync) {
121 BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
122 stat[BLKIO_STAT_SYNC]--;
123 } else {
124 BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
125 stat[BLKIO_STAT_ASYNC]--;
126 }
127}
128
129#ifdef CONFIG_DEBUG_BLK_CGROUP
130/* This should be called with the blkg->stats_lock held. */
131static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
132 struct blkio_group *curr_blkg)
133{
134 if (blkio_blkg_waiting(&blkg->stats))
135 return;
136 if (blkg == curr_blkg)
137 return;
138 blkg->stats.start_group_wait_time = sched_clock();
139 blkio_mark_blkg_waiting(&blkg->stats);
140}
141
142/* This should be called with the blkg->stats_lock held. */
143static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
144{
145 unsigned long long now;
146
147 if (!blkio_blkg_waiting(stats))
148 return;
149
150 now = sched_clock();
151 if (time_after64(now, stats->start_group_wait_time))
152 stats->group_wait_time += now - stats->start_group_wait_time;
153 blkio_clear_blkg_waiting(stats);
154}
155
156/* This should be called with the blkg->stats_lock held. */
157static void blkio_end_empty_time(struct blkio_group_stats *stats)
158{
159 unsigned long long now;
160
161 if (!blkio_blkg_empty(stats))
162 return;
163
164 now = sched_clock();
165 if (time_after64(now, stats->start_empty_time))
166 stats->empty_time += now - stats->start_empty_time;
167 blkio_clear_blkg_empty(stats);
168}
169
170void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
171{
172 unsigned long flags;
173
174 spin_lock_irqsave(&blkg->stats_lock, flags);
175 BUG_ON(blkio_blkg_idling(&blkg->stats));
176 blkg->stats.start_idle_time = sched_clock();
177 blkio_mark_blkg_idling(&blkg->stats);
178 spin_unlock_irqrestore(&blkg->stats_lock, flags);
179}
180EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
181
182void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
183{
184 unsigned long flags;
185 unsigned long long now;
186 struct blkio_group_stats *stats;
187
188 spin_lock_irqsave(&blkg->stats_lock, flags);
189 stats = &blkg->stats;
190 if (blkio_blkg_idling(stats)) {
191 now = sched_clock();
192 if (time_after64(now, stats->start_idle_time))
193 stats->idle_time += now - stats->start_idle_time;
194 blkio_clear_blkg_idling(stats);
195 }
196 spin_unlock_irqrestore(&blkg->stats_lock, flags);
197}
198EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
199
200void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
201{
202 unsigned long flags;
203 struct blkio_group_stats *stats;
204
205 spin_lock_irqsave(&blkg->stats_lock, flags);
206 stats = &blkg->stats;
207 stats->avg_queue_size_sum +=
208 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
209 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
210 stats->avg_queue_size_samples++;
211 blkio_update_group_wait_time(stats);
212 spin_unlock_irqrestore(&blkg->stats_lock, flags);
213}
214EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
215
216void blkiocg_set_start_empty_time(struct blkio_group *blkg)
217{
218 unsigned long flags;
219 struct blkio_group_stats *stats;
220
221 spin_lock_irqsave(&blkg->stats_lock, flags);
222 stats = &blkg->stats;
223
224 if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
225 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
226 spin_unlock_irqrestore(&blkg->stats_lock, flags);
227 return;
228 }
229
230 /*
231 * group is already marked empty. This can happen if cfqq got new
232 * request in parent group and moved to this group while being added
233 * to service tree. Just ignore the event and move on.
234 */
235 if(blkio_blkg_empty(stats)) {
236 spin_unlock_irqrestore(&blkg->stats_lock, flags);
237 return;
238 }
239
240 stats->start_empty_time = sched_clock();
241 blkio_mark_blkg_empty(stats);
242 spin_unlock_irqrestore(&blkg->stats_lock, flags);
243}
244EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
245
246void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
247 unsigned long dequeue)
248{
249 blkg->stats.dequeue += dequeue;
250}
251EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
252#else
253static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
254 struct blkio_group *curr_blkg) {}
255static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
256#endif
257
258void blkiocg_update_io_add_stats(struct blkio_group *blkg,
259 struct blkio_group *curr_blkg, bool direction,
260 bool sync)
261{
262 unsigned long flags;
263
264 spin_lock_irqsave(&blkg->stats_lock, flags);
265 blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
266 sync);
267 blkio_end_empty_time(&blkg->stats);
268 blkio_set_start_group_wait_time(blkg, curr_blkg);
269 spin_unlock_irqrestore(&blkg->stats_lock, flags);
270}
271EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
272
273void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
274 bool direction, bool sync)
275{
276 unsigned long flags;
277
278 spin_lock_irqsave(&blkg->stats_lock, flags);
279 blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
280 direction, sync);
281 spin_unlock_irqrestore(&blkg->stats_lock, flags);
282}
283EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
284
285void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
286{
287 unsigned long flags;
288
289 spin_lock_irqsave(&blkg->stats_lock, flags);
290 blkg->stats.time += time;
291 spin_unlock_irqrestore(&blkg->stats_lock, flags);
292}
293EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
294
295void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
296 uint64_t bytes, bool direction, bool sync)
61{ 297{
62 blkg->time += time; 298 struct blkio_group_stats *stats;
63 blkg->sectors += sectors; 299 unsigned long flags;
300
301 spin_lock_irqsave(&blkg->stats_lock, flags);
302 stats = &blkg->stats;
303 stats->sectors += bytes >> 9;
304 blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
305 sync);
306 blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
307 direction, sync);
308 spin_unlock_irqrestore(&blkg->stats_lock, flags);
64} 309}
65EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats); 310EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
311
312void blkiocg_update_completion_stats(struct blkio_group *blkg,
313 uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
314{
315 struct blkio_group_stats *stats;
316 unsigned long flags;
317 unsigned long long now = sched_clock();
318
319 spin_lock_irqsave(&blkg->stats_lock, flags);
320 stats = &blkg->stats;
321 if (time_after64(now, io_start_time))
322 blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
323 now - io_start_time, direction, sync);
324 if (time_after64(io_start_time, start_time))
325 blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
326 io_start_time - start_time, direction, sync);
327 spin_unlock_irqrestore(&blkg->stats_lock, flags);
328}
329EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
330
331void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
332 bool sync)
333{
334 unsigned long flags;
335
336 spin_lock_irqsave(&blkg->stats_lock, flags);
337 blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction,
338 sync);
339 spin_unlock_irqrestore(&blkg->stats_lock, flags);
340}
341EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
66 342
67void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 343void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
68 struct blkio_group *blkg, void *key, dev_t dev) 344 struct blkio_group *blkg, void *key, dev_t dev)
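
The exported helpers added above are the accounting hooks an I/O scheduler is expected to call as a request moves through its lifetime. The following is a schematic sketch, not the actual CFQ code; the wrapper names and the way direction/sync are derived are placeholders:

/* Request is added to this group's queue. */
static void sched_queue_rq(struct blkio_group *blkg,
			   struct blkio_group *curr_blkg,
			   bool direction, bool sync)
{
	blkiocg_update_io_add_stats(blkg, curr_blkg, direction, sync);
}

/* Request leaves the scheduler queue and is handed to the driver. */
static void sched_dispatch_rq(struct blkio_group *blkg, uint64_t bytes,
			      bool direction, bool sync)
{
	blkiocg_update_io_remove_stats(blkg, direction, sync);
	blkiocg_update_dispatch_stats(blkg, bytes, direction, sync);
}

/* Request completes: start_time is when it was queued, io_start_time is
 * when it was dispatched; the deltas feed io_wait_time and io_service_time. */
static void sched_complete_rq(struct blkio_group *blkg, uint64_t start_time,
			      uint64_t io_start_time, bool direction, bool sync)
{
	blkiocg_update_completion_stats(blkg, start_time, io_start_time,
					direction, sync);
}
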
@@ -70,14 +346,13 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
70 unsigned long flags; 346 unsigned long flags;
71 347
72 spin_lock_irqsave(&blkcg->lock, flags); 348 spin_lock_irqsave(&blkcg->lock, flags);
349 spin_lock_init(&blkg->stats_lock);
73 rcu_assign_pointer(blkg->key, key); 350 rcu_assign_pointer(blkg->key, key);
74 blkg->blkcg_id = css_id(&blkcg->css); 351 blkg->blkcg_id = css_id(&blkcg->css);
75 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); 352 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
76 spin_unlock_irqrestore(&blkcg->lock, flags); 353 spin_unlock_irqrestore(&blkcg->lock, flags);
77#ifdef CONFIG_DEBUG_BLK_CGROUP
78 /* Need to take css reference ? */ 354 /* Need to take css reference ? */
79 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); 355 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
80#endif
81 blkg->dev = dev; 356 blkg->dev = dev;
82} 357}
83EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); 358EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
@@ -101,17 +376,16 @@ int blkiocg_del_blkio_group(struct blkio_group *blkg)
101 376
102 rcu_read_lock(); 377 rcu_read_lock();
103 css = css_lookup(&blkio_subsys, blkg->blkcg_id); 378 css = css_lookup(&blkio_subsys, blkg->blkcg_id);
104 if (!css) 379 if (css) {
105 goto out; 380 blkcg = container_of(css, struct blkio_cgroup, css);
106 381 spin_lock_irqsave(&blkcg->lock, flags);
107 blkcg = container_of(css, struct blkio_cgroup, css); 382 if (!hlist_unhashed(&blkg->blkcg_node)) {
108 spin_lock_irqsave(&blkcg->lock, flags); 383 __blkiocg_del_blkio_group(blkg);
109 if (!hlist_unhashed(&blkg->blkcg_node)) { 384 ret = 0;
110 __blkiocg_del_blkio_group(blkg); 385 }
111 ret = 0; 386 spin_unlock_irqrestore(&blkcg->lock, flags);
112 } 387 }
113 spin_unlock_irqrestore(&blkcg->lock, flags); 388
114out:
115 rcu_read_unlock(); 389 rcu_read_unlock();
116 return ret; 390 return ret;
117} 391}
@@ -154,6 +428,7 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
154 struct blkio_group *blkg; 428 struct blkio_group *blkg;
155 struct hlist_node *n; 429 struct hlist_node *n;
156 struct blkio_policy_type *blkiop; 430 struct blkio_policy_type *blkiop;
431 struct blkio_policy_node *pn;
157 432
158 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) 433 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
159 return -EINVAL; 434 return -EINVAL;
@@ -162,7 +437,13 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
162 spin_lock(&blkio_list_lock); 437 spin_lock(&blkio_list_lock);
163 spin_lock_irq(&blkcg->lock); 438 spin_lock_irq(&blkcg->lock);
164 blkcg->weight = (unsigned int)val; 439 blkcg->weight = (unsigned int)val;
440
165 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { 441 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
442 pn = blkio_policy_search_node(blkcg, blkg->dev);
443
444 if (pn)
445 continue;
446
166 list_for_each_entry(blkiop, &blkio_list, list) 447 list_for_each_entry(blkiop, &blkio_list, list)
167 blkiop->ops.blkio_update_group_weight_fn(blkg, 448 blkiop->ops.blkio_update_group_weight_fn(blkg,
168 blkcg->weight); 449 blkcg->weight);
@@ -172,13 +453,154 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
172 return 0; 453 return 0;
173} 454}
174 455
175#define SHOW_FUNCTION_PER_GROUP(__VAR) \ 456static int
457blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
458{
459 struct blkio_cgroup *blkcg;
460 struct blkio_group *blkg;
461 struct blkio_group_stats *stats;
462 struct hlist_node *n;
463 uint64_t queued[BLKIO_STAT_TOTAL];
464 int i;
465#ifdef CONFIG_DEBUG_BLK_CGROUP
466 bool idling, waiting, empty;
467 unsigned long long now = sched_clock();
468#endif
469
470 blkcg = cgroup_to_blkio_cgroup(cgroup);
471 spin_lock_irq(&blkcg->lock);
472 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
473 spin_lock(&blkg->stats_lock);
474 stats = &blkg->stats;
475#ifdef CONFIG_DEBUG_BLK_CGROUP
476 idling = blkio_blkg_idling(stats);
477 waiting = blkio_blkg_waiting(stats);
478 empty = blkio_blkg_empty(stats);
479#endif
480 for (i = 0; i < BLKIO_STAT_TOTAL; i++)
481 queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
482 memset(stats, 0, sizeof(struct blkio_group_stats));
483 for (i = 0; i < BLKIO_STAT_TOTAL; i++)
484 stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
485#ifdef CONFIG_DEBUG_BLK_CGROUP
486 if (idling) {
487 blkio_mark_blkg_idling(stats);
488 stats->start_idle_time = now;
489 }
490 if (waiting) {
491 blkio_mark_blkg_waiting(stats);
492 stats->start_group_wait_time = now;
493 }
494 if (empty) {
495 blkio_mark_blkg_empty(stats);
496 stats->start_empty_time = now;
497 }
498#endif
499 spin_unlock(&blkg->stats_lock);
500 }
501 spin_unlock_irq(&blkcg->lock);
502 return 0;
503}
504
505static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
506 int chars_left, bool diskname_only)
507{
508 snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
509 chars_left -= strlen(str);
510 if (chars_left <= 0) {
511 printk(KERN_WARNING
512 "Possibly incorrect cgroup stat display format");
513 return;
514 }
515 if (diskname_only)
516 return;
517 switch (type) {
518 case BLKIO_STAT_READ:
519 strlcat(str, " Read", chars_left);
520 break;
521 case BLKIO_STAT_WRITE:
522 strlcat(str, " Write", chars_left);
523 break;
524 case BLKIO_STAT_SYNC:
525 strlcat(str, " Sync", chars_left);
526 break;
527 case BLKIO_STAT_ASYNC:
528 strlcat(str, " Async", chars_left);
529 break;
530 case BLKIO_STAT_TOTAL:
531 strlcat(str, " Total", chars_left);
532 break;
533 default:
534 strlcat(str, " Invalid", chars_left);
535 }
536}
537
538static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
539 struct cgroup_map_cb *cb, dev_t dev)
540{
541 blkio_get_key_name(0, dev, str, chars_left, true);
542 cb->fill(cb, str, val);
543 return val;
544}
545
546/* This should be called with blkg->stats_lock held */
547static uint64_t blkio_get_stat(struct blkio_group *blkg,
548 struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
549{
550 uint64_t disk_total;
551 char key_str[MAX_KEY_LEN];
552 enum stat_sub_type sub_type;
553
554 if (type == BLKIO_STAT_TIME)
555 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
556 blkg->stats.time, cb, dev);
557 if (type == BLKIO_STAT_SECTORS)
558 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
559 blkg->stats.sectors, cb, dev);
560#ifdef CONFIG_DEBUG_BLK_CGROUP
561 if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
562 uint64_t sum = blkg->stats.avg_queue_size_sum;
563 uint64_t samples = blkg->stats.avg_queue_size_samples;
564 if (samples)
565 do_div(sum, samples);
566 else
567 sum = 0;
568 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
569 }
570 if (type == BLKIO_STAT_GROUP_WAIT_TIME)
571 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
572 blkg->stats.group_wait_time, cb, dev);
573 if (type == BLKIO_STAT_IDLE_TIME)
574 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
575 blkg->stats.idle_time, cb, dev);
576 if (type == BLKIO_STAT_EMPTY_TIME)
577 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
578 blkg->stats.empty_time, cb, dev);
579 if (type == BLKIO_STAT_DEQUEUE)
580 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
581 blkg->stats.dequeue, cb, dev);
582#endif
583
584 for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
585 sub_type++) {
586 blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
587 cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
588 }
589 disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
590 blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
591 blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
592 cb->fill(cb, key_str, disk_total);
593 return disk_total;
594}
595
596#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total) \
176static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ 597static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \
177 struct cftype *cftype, struct seq_file *m) \ 598 struct cftype *cftype, struct cgroup_map_cb *cb) \
178{ \ 599{ \
179 struct blkio_cgroup *blkcg; \ 600 struct blkio_cgroup *blkcg; \
180 struct blkio_group *blkg; \ 601 struct blkio_group *blkg; \
181 struct hlist_node *n; \ 602 struct hlist_node *n; \
603 uint64_t cgroup_total = 0; \
182 \ 604 \
183 if (!cgroup_lock_live_group(cgroup)) \ 605 if (!cgroup_lock_live_group(cgroup)) \
184 return -ENODEV; \ 606 return -ENODEV; \
@@ -186,50 +608,293 @@ static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \
186 blkcg = cgroup_to_blkio_cgroup(cgroup); \ 608 blkcg = cgroup_to_blkio_cgroup(cgroup); \
187 rcu_read_lock(); \ 609 rcu_read_lock(); \
188 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ 610 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
189 if (blkg->dev) \ 611 if (blkg->dev) { \
190 seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \ 612 spin_lock_irq(&blkg->stats_lock); \
191 MINOR(blkg->dev), blkg->__VAR); \ 613 cgroup_total += blkio_get_stat(blkg, cb, \
614 blkg->dev, type); \
615 spin_unlock_irq(&blkg->stats_lock); \
616 } \
192 } \ 617 } \
618 if (show_total) \
619 cb->fill(cb, "Total", cgroup_total); \
193 rcu_read_unlock(); \ 620 rcu_read_unlock(); \
194 cgroup_unlock(); \ 621 cgroup_unlock(); \
195 return 0; \ 622 return 0; \
196} 623}
197 624
198SHOW_FUNCTION_PER_GROUP(time); 625SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0);
199SHOW_FUNCTION_PER_GROUP(sectors); 626SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0);
627SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1);
628SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1);
629SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1);
630SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1);
631SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1);
632SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1);
200#ifdef CONFIG_DEBUG_BLK_CGROUP 633#ifdef CONFIG_DEBUG_BLK_CGROUP
201SHOW_FUNCTION_PER_GROUP(dequeue); 634SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
635SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0);
636SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0);
637SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0);
638SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0);
202#endif 639#endif
203#undef SHOW_FUNCTION_PER_GROUP 640#undef SHOW_FUNCTION_PER_GROUP
204 641
205#ifdef CONFIG_DEBUG_BLK_CGROUP 642static int blkio_check_dev_num(dev_t dev)
206void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
207 unsigned long dequeue)
208{ 643{
209 blkg->dequeue += dequeue; 644 int part = 0;
645 struct gendisk *disk;
646
647 disk = get_gendisk(dev, &part);
648 if (!disk || part)
649 return -ENODEV;
650
651 return 0;
652}
653
654static int blkio_policy_parse_and_set(char *buf,
655 struct blkio_policy_node *newpn)
656{
657 char *s[4], *p, *major_s = NULL, *minor_s = NULL;
658 int ret;
659 unsigned long major, minor, temp;
660 int i = 0;
661 dev_t dev;
662
663 memset(s, 0, sizeof(s));
664
665 while ((p = strsep(&buf, " ")) != NULL) {
666 if (!*p)
667 continue;
668
669 s[i++] = p;
670
 671 /* Prevent the user from inputting too many things */
672 if (i == 3)
673 break;
674 }
675
676 if (i != 2)
677 return -EINVAL;
678
679 p = strsep(&s[0], ":");
680 if (p != NULL)
681 major_s = p;
682 else
683 return -EINVAL;
684
685 minor_s = s[0];
686 if (!minor_s)
687 return -EINVAL;
688
689 ret = strict_strtoul(major_s, 10, &major);
690 if (ret)
691 return -EINVAL;
692
693 ret = strict_strtoul(minor_s, 10, &minor);
694 if (ret)
695 return -EINVAL;
696
697 dev = MKDEV(major, minor);
698
699 ret = blkio_check_dev_num(dev);
700 if (ret)
701 return ret;
702
703 newpn->dev = dev;
704
705 if (s[1] == NULL)
706 return -EINVAL;
707
708 ret = strict_strtoul(s[1], 10, &temp);
709 if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
710 temp > BLKIO_WEIGHT_MAX)
711 return -EINVAL;
712
713 newpn->weight = temp;
714
715 return 0;
716}
717
718unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
719 dev_t dev)
720{
721 struct blkio_policy_node *pn;
722
723 pn = blkio_policy_search_node(blkcg, dev);
724 if (pn)
725 return pn->weight;
726 else
727 return blkcg->weight;
728}
729EXPORT_SYMBOL_GPL(blkcg_get_weight);
730
731
732static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
733 const char *buffer)
734{
735 int ret = 0;
736 char *buf;
737 struct blkio_policy_node *newpn, *pn;
738 struct blkio_cgroup *blkcg;
739 struct blkio_group *blkg;
740 int keep_newpn = 0;
741 struct hlist_node *n;
742 struct blkio_policy_type *blkiop;
743
744 buf = kstrdup(buffer, GFP_KERNEL);
745 if (!buf)
746 return -ENOMEM;
747
748 newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
749 if (!newpn) {
750 ret = -ENOMEM;
751 goto free_buf;
752 }
753
754 ret = blkio_policy_parse_and_set(buf, newpn);
755 if (ret)
756 goto free_newpn;
757
758 blkcg = cgroup_to_blkio_cgroup(cgrp);
759
760 spin_lock_irq(&blkcg->lock);
761
762 pn = blkio_policy_search_node(blkcg, newpn->dev);
763 if (!pn) {
764 if (newpn->weight != 0) {
765 blkio_policy_insert_node(blkcg, newpn);
766 keep_newpn = 1;
767 }
768 spin_unlock_irq(&blkcg->lock);
769 goto update_io_group;
770 }
771
772 if (newpn->weight == 0) {
 773 /* weight == 0 means deleting a specific weight */
774 blkio_policy_delete_node(pn);
775 spin_unlock_irq(&blkcg->lock);
776 goto update_io_group;
777 }
778 spin_unlock_irq(&blkcg->lock);
779
780 pn->weight = newpn->weight;
781
782update_io_group:
783 /* update weight for each cfqg */
784 spin_lock(&blkio_list_lock);
785 spin_lock_irq(&blkcg->lock);
786
787 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
788 if (newpn->dev == blkg->dev) {
789 list_for_each_entry(blkiop, &blkio_list, list)
790 blkiop->ops.blkio_update_group_weight_fn(blkg,
791 newpn->weight ?
792 newpn->weight :
793 blkcg->weight);
794 }
795 }
796
797 spin_unlock_irq(&blkcg->lock);
798 spin_unlock(&blkio_list_lock);
799
800free_newpn:
801 if (!keep_newpn)
802 kfree(newpn);
803free_buf:
804 kfree(buf);
805 return ret;
806}
807
808static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
809 struct seq_file *m)
810{
811 struct blkio_cgroup *blkcg;
812 struct blkio_policy_node *pn;
813
814 seq_printf(m, "dev\tweight\n");
815
816 blkcg = cgroup_to_blkio_cgroup(cgrp);
817 if (!list_empty(&blkcg->policy_list)) {
818 spin_lock_irq(&blkcg->lock);
819 list_for_each_entry(pn, &blkcg->policy_list, node) {
820 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
821 MINOR(pn->dev), pn->weight);
822 }
823 spin_unlock_irq(&blkcg->lock);
824 }
825
826 return 0;
210} 827}
211EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats);
212#endif
213 828
214struct cftype blkio_files[] = { 829struct cftype blkio_files[] = {
215 { 830 {
831 .name = "weight_device",
832 .read_seq_string = blkiocg_weight_device_read,
833 .write_string = blkiocg_weight_device_write,
834 .max_write_len = 256,
835 },
836 {
216 .name = "weight", 837 .name = "weight",
217 .read_u64 = blkiocg_weight_read, 838 .read_u64 = blkiocg_weight_read,
218 .write_u64 = blkiocg_weight_write, 839 .write_u64 = blkiocg_weight_write,
219 }, 840 },
220 { 841 {
221 .name = "time", 842 .name = "time",
222 .read_seq_string = blkiocg_time_read, 843 .read_map = blkiocg_time_read,
223 }, 844 },
224 { 845 {
225 .name = "sectors", 846 .name = "sectors",
226 .read_seq_string = blkiocg_sectors_read, 847 .read_map = blkiocg_sectors_read,
848 },
849 {
850 .name = "io_service_bytes",
851 .read_map = blkiocg_io_service_bytes_read,
852 },
853 {
854 .name = "io_serviced",
855 .read_map = blkiocg_io_serviced_read,
856 },
857 {
858 .name = "io_service_time",
859 .read_map = blkiocg_io_service_time_read,
860 },
861 {
862 .name = "io_wait_time",
863 .read_map = blkiocg_io_wait_time_read,
864 },
865 {
866 .name = "io_merged",
867 .read_map = blkiocg_io_merged_read,
868 },
869 {
870 .name = "io_queued",
871 .read_map = blkiocg_io_queued_read,
872 },
873 {
874 .name = "reset_stats",
875 .write_u64 = blkiocg_reset_stats,
227 }, 876 },
228#ifdef CONFIG_DEBUG_BLK_CGROUP 877#ifdef CONFIG_DEBUG_BLK_CGROUP
229 { 878 {
879 .name = "avg_queue_size",
880 .read_map = blkiocg_avg_queue_size_read,
881 },
882 {
883 .name = "group_wait_time",
884 .read_map = blkiocg_group_wait_time_read,
885 },
886 {
887 .name = "idle_time",
888 .read_map = blkiocg_idle_time_read,
889 },
890 {
891 .name = "empty_time",
892 .read_map = blkiocg_empty_time_read,
893 },
894 {
230 .name = "dequeue", 895 .name = "dequeue",
231 .read_seq_string = blkiocg_dequeue_read, 896 .read_map = blkiocg_dequeue_read,
232 }, 897 },
233#endif 898#endif
234}; 899};
235 900
@@ -246,37 +911,42 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
246 struct blkio_group *blkg; 911 struct blkio_group *blkg;
247 void *key; 912 void *key;
248 struct blkio_policy_type *blkiop; 913 struct blkio_policy_type *blkiop;
914 struct blkio_policy_node *pn, *pntmp;
249 915
250 rcu_read_lock(); 916 rcu_read_lock();
251remove_entry: 917 do {
252 spin_lock_irqsave(&blkcg->lock, flags); 918 spin_lock_irqsave(&blkcg->lock, flags);
919
920 if (hlist_empty(&blkcg->blkg_list)) {
921 spin_unlock_irqrestore(&blkcg->lock, flags);
922 break;
923 }
924
925 blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
926 blkcg_node);
927 key = rcu_dereference(blkg->key);
928 __blkiocg_del_blkio_group(blkg);
253 929
254 if (hlist_empty(&blkcg->blkg_list)) {
255 spin_unlock_irqrestore(&blkcg->lock, flags); 930 spin_unlock_irqrestore(&blkcg->lock, flags);
256 goto done;
257 }
258 931
259 blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group, 932 /*
260 blkcg_node); 933 * This blkio_group is being unlinked as associated cgroup is
261 key = rcu_dereference(blkg->key); 934 * going away. Let all the IO controlling policies know about
262 __blkiocg_del_blkio_group(blkg); 935 * this event. Currently this is static call to one io
936 * controlling policy. Once we have more policies in place, we
937 * need some dynamic registration of callback function.
938 */
939 spin_lock(&blkio_list_lock);
940 list_for_each_entry(blkiop, &blkio_list, list)
941 blkiop->ops.blkio_unlink_group_fn(key, blkg);
942 spin_unlock(&blkio_list_lock);
943 } while (1);
263 944
264 spin_unlock_irqrestore(&blkcg->lock, flags); 945 list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
946 blkio_policy_delete_node(pn);
947 kfree(pn);
948 }
265 949
266 /*
267 * This blkio_group is being unlinked as associated cgroup is going
268 * away. Let all the IO controlling policies know about this event.
269 *
270 * Currently this is static call to one io controlling policy. Once
271 * we have more policies in place, we need some dynamic registration
272 * of callback function.
273 */
274 spin_lock(&blkio_list_lock);
275 list_for_each_entry(blkiop, &blkio_list, list)
276 blkiop->ops.blkio_unlink_group_fn(key, blkg);
277 spin_unlock(&blkio_list_lock);
278 goto remove_entry;
279done:
280 free_css_id(&blkio_subsys, &blkcg->css); 950 free_css_id(&blkio_subsys, &blkcg->css);
281 rcu_read_unlock(); 951 rcu_read_unlock();
282 if (blkcg != &blkio_root_cgroup) 952 if (blkcg != &blkio_root_cgroup)
@@ -307,6 +977,7 @@ done:
307 spin_lock_init(&blkcg->lock); 977 spin_lock_init(&blkcg->lock);
308 INIT_HLIST_HEAD(&blkcg->blkg_list); 978 INIT_HLIST_HEAD(&blkcg->blkg_list);
309 979
980 INIT_LIST_HEAD(&blkcg->policy_list);
310 return &blkcg->css; 981 return &blkcg->css;
311} 982}
312 983
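
blk-cgroup.c hands control back to the individual I/O policies through the blkio_list callbacks used above (blkio_update_group_weight_fn on weight changes, blkio_unlink_group_fn when a cgroup goes away). As a schematic sketch of how a policy such as CFQ might wire itself up, assuming the blkio_policy_type/ops layout implied by the blkiop->ops.* calls here and the typedefs in blk-cgroup.h:

/* Schematic only: callback names follow the typedefs in blk-cgroup.h. */
static void my_unlink_group(void *key, struct blkio_group *blkg)
{
	/* Drop the policy's reference to blkg; the cgroup is being removed. */
}

static void my_update_group_weight(struct blkio_group *blkg,
				   unsigned int weight)
{
	/* Recompute the group's share from the new weight. */
}

static struct blkio_policy_type my_policy = {
	.ops = {
		.blkio_unlink_group_fn		= my_unlink_group,
		.blkio_update_group_weight_fn	= my_update_group_weight,
	},
};

/* Registered once at scheduler init time:
 *	blkio_policy_register(&my_policy);
 * and removed again with blkio_policy_unregister(&my_policy) on exit.
 */
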
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 8ccc20464dae..2b866ec1dcea 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -23,11 +23,84 @@ extern struct cgroup_subsys blkio_subsys;
23#define blkio_subsys_id blkio_subsys.subsys_id 23#define blkio_subsys_id blkio_subsys.subsys_id
24#endif 24#endif
25 25
26enum stat_type {
27 /* Total time spent (in ns) between request dispatch to the driver and
 28 * request completion for IOs done by this cgroup. This may not be
29 * accurate when NCQ is turned on. */
30 BLKIO_STAT_SERVICE_TIME = 0,
31 /* Total bytes transferred */
32 BLKIO_STAT_SERVICE_BYTES,
33 /* Total IOs serviced, post merge */
34 BLKIO_STAT_SERVICED,
35 /* Total time spent waiting in scheduler queue in ns */
36 BLKIO_STAT_WAIT_TIME,
37 /* Number of IOs merged */
38 BLKIO_STAT_MERGED,
39 /* Number of IOs queued up */
40 BLKIO_STAT_QUEUED,
41 /* All the single valued stats go below this */
42 BLKIO_STAT_TIME,
43 BLKIO_STAT_SECTORS,
44#ifdef CONFIG_DEBUG_BLK_CGROUP
45 BLKIO_STAT_AVG_QUEUE_SIZE,
46 BLKIO_STAT_IDLE_TIME,
47 BLKIO_STAT_EMPTY_TIME,
48 BLKIO_STAT_GROUP_WAIT_TIME,
49 BLKIO_STAT_DEQUEUE
50#endif
51};
52
53enum stat_sub_type {
54 BLKIO_STAT_READ = 0,
55 BLKIO_STAT_WRITE,
56 BLKIO_STAT_SYNC,
57 BLKIO_STAT_ASYNC,
58 BLKIO_STAT_TOTAL
59};
60
61/* blkg state flags */
62enum blkg_state_flags {
63 BLKG_waiting = 0,
64 BLKG_idling,
65 BLKG_empty,
66};
67
26struct blkio_cgroup { 68struct blkio_cgroup {
27 struct cgroup_subsys_state css; 69 struct cgroup_subsys_state css;
28 unsigned int weight; 70 unsigned int weight;
29 spinlock_t lock; 71 spinlock_t lock;
30 struct hlist_head blkg_list; 72 struct hlist_head blkg_list;
73 struct list_head policy_list; /* list of blkio_policy_node */
74};
75
76struct blkio_group_stats {
77 /* total disk time and nr sectors dispatched by this group */
78 uint64_t time;
79 uint64_t sectors;
80 uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL];
81#ifdef CONFIG_DEBUG_BLK_CGROUP
82 /* Sum of number of IOs queued across all samples */
83 uint64_t avg_queue_size_sum;
84 /* Count of samples taken for average */
85 uint64_t avg_queue_size_samples;
86 /* How many times this group has been removed from service tree */
87 unsigned long dequeue;
88
89 /* Total time spent waiting for it to be assigned a timeslice. */
90 uint64_t group_wait_time;
91 uint64_t start_group_wait_time;
92
93 /* Time spent idling for this blkio_group */
94 uint64_t idle_time;
95 uint64_t start_idle_time;
96 /*
97 * Total time when we have requests queued and do not contain the
98 * current active queue.
99 */
100 uint64_t empty_time;
101 uint64_t start_empty_time;
102 uint16_t flags;
103#endif
31}; 104};
32 105
33struct blkio_group { 106struct blkio_group {
@@ -35,20 +108,25 @@ struct blkio_group {
35 void *key; 108 void *key;
36 struct hlist_node blkcg_node; 109 struct hlist_node blkcg_node;
37 unsigned short blkcg_id; 110 unsigned short blkcg_id;
38#ifdef CONFIG_DEBUG_BLK_CGROUP
39 /* Store cgroup path */ 111 /* Store cgroup path */
40 char path[128]; 112 char path[128];
41 /* How many times this group has been removed from service tree */
42 unsigned long dequeue;
43#endif
44 /* The device MKDEV(major, minor), this group has been created for */ 113 /* The device MKDEV(major, minor), this group has been created for */
45 dev_t dev; 114 dev_t dev;
46 115
47 /* total disk time and nr sectors dispatched by this group */ 116 /* Need to serialize the stats in the case of reset/update */
48 unsigned long time; 117 spinlock_t stats_lock;
49 unsigned long sectors; 118 struct blkio_group_stats stats;
50}; 119};
51 120
121struct blkio_policy_node {
122 struct list_head node;
123 dev_t dev;
124 unsigned int weight;
125};
126
127extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
128 dev_t dev);
129
52typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); 130typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
53typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, 131typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
54 unsigned int weight); 132 unsigned int weight);
@@ -67,6 +145,11 @@ struct blkio_policy_type {
67extern void blkio_policy_register(struct blkio_policy_type *); 145extern void blkio_policy_register(struct blkio_policy_type *);
68extern void blkio_policy_unregister(struct blkio_policy_type *); 146extern void blkio_policy_unregister(struct blkio_policy_type *);
69 147
148static inline char *blkg_path(struct blkio_group *blkg)
149{
150 return blkg->path;
151}
152
70#else 153#else
71 154
72struct blkio_group { 155struct blkio_group {
@@ -78,6 +161,8 @@ struct blkio_policy_type {
78static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { } 161static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
79static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } 162static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
80 163
164static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
165
81#endif 166#endif
82 167
83#define BLKIO_WEIGHT_MIN 100 168#define BLKIO_WEIGHT_MIN 100
@@ -85,16 +170,42 @@ static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
85#define BLKIO_WEIGHT_DEFAULT 500 170#define BLKIO_WEIGHT_DEFAULT 500
86 171
87#ifdef CONFIG_DEBUG_BLK_CGROUP 172#ifdef CONFIG_DEBUG_BLK_CGROUP
88static inline char *blkg_path(struct blkio_group *blkg) 173void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg);
89{ 174void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
90 return blkg->path;
91}
92void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
93 unsigned long dequeue); 175 unsigned long dequeue);
176void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg);
177void blkiocg_update_idle_time_stats(struct blkio_group *blkg);
178void blkiocg_set_start_empty_time(struct blkio_group *blkg);
179
180#define BLKG_FLAG_FNS(name) \
181static inline void blkio_mark_blkg_##name( \
182 struct blkio_group_stats *stats) \
183{ \
184 stats->flags |= (1 << BLKG_##name); \
185} \
186static inline void blkio_clear_blkg_##name( \
187 struct blkio_group_stats *stats) \
188{ \
189 stats->flags &= ~(1 << BLKG_##name); \
190} \
191static inline int blkio_blkg_##name(struct blkio_group_stats *stats) \
192{ \
193 return (stats->flags & (1 << BLKG_##name)) != 0; \
194} \
195
196BLKG_FLAG_FNS(waiting)
197BLKG_FLAG_FNS(idling)
198BLKG_FLAG_FNS(empty)
199#undef BLKG_FLAG_FNS
94#else 200#else
95static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } 201static inline void blkiocg_update_avg_queue_size_stats(
96static inline void blkiocg_update_blkio_group_dequeue_stats( 202 struct blkio_group *blkg) {}
97 struct blkio_group *blkg, unsigned long dequeue) {} 203static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
204 unsigned long dequeue) {}
205static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
206{}
207static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {}
208static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
98#endif 209#endif
99 210
100#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) 211#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
@@ -105,26 +216,43 @@ extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
105extern int blkiocg_del_blkio_group(struct blkio_group *blkg); 216extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
106extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, 217extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
107 void *key); 218 void *key);
108void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, 219void blkiocg_update_timeslice_used(struct blkio_group *blkg,
109 unsigned long time, unsigned long sectors); 220 unsigned long time);
221void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes,
222 bool direction, bool sync);
223void blkiocg_update_completion_stats(struct blkio_group *blkg,
224 uint64_t start_time, uint64_t io_start_time, bool direction, bool sync);
225void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
226 bool sync);
227void blkiocg_update_io_add_stats(struct blkio_group *blkg,
228 struct blkio_group *curr_blkg, bool direction, bool sync);
229void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
230 bool direction, bool sync);
110#else 231#else
111struct cgroup; 232struct cgroup;
112static inline struct blkio_cgroup * 233static inline struct blkio_cgroup *
113cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } 234cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
114 235
115static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 236static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
116 struct blkio_group *blkg, void *key, dev_t dev) 237 struct blkio_group *blkg, void *key, dev_t dev) {}
117{
118}
119 238
120static inline int 239static inline int
121blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } 240blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
122 241
123static inline struct blkio_group * 242static inline struct blkio_group *
124blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } 243blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
125static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, 244static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
126 unsigned long time, unsigned long sectors) 245 unsigned long time) {}
127{ 246static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
128} 247 uint64_t bytes, bool direction, bool sync) {}
248static inline void blkiocg_update_completion_stats(struct blkio_group *blkg,
249 uint64_t start_time, uint64_t io_start_time, bool direction,
250 bool sync) {}
251static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
252 bool direction, bool sync) {}
253static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg,
254 struct blkio_group *curr_blkg, bool direction, bool sync) {}
255static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
256 bool direction, bool sync) {}
129#endif 257#endif
130#endif /* _BLK_CGROUP_H */ 258#endif /* _BLK_CGROUP_H */
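For context, a minimal sketch (not part of the patch) of how the BLKG_FLAG_FNS()-generated helpers and the new per-group stats_lock are meant to be used when folding group wait time into the CONFIG_DEBUG_BLK_CGROUP statistics; the function name is hypothetical and the real accounting lives in block/blk-cgroup.c.

static void example_account_group_wait(struct blkio_group *blkg, uint64_t now)
{
	struct blkio_group_stats *stats;
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	if (blkio_blkg_waiting(stats)) {
		/* the group just got a timeslice: fold the accumulated wait
		 * into group_wait_time and clear the waiting flag */
		if (time_after64(now, stats->start_group_wait_time))
			stats->group_wait_time +=
				now - stats->start_group_wait_time;
		blkio_clear_blkg_waiting(stats);
	}
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}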
diff --git a/block/blk-core.c b/block/blk-core.c
index 9fe174dc74d1..3bc5579d6f54 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -127,6 +127,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
127 rq->tag = -1; 127 rq->tag = -1;
128 rq->ref_count = 1; 128 rq->ref_count = 1;
129 rq->start_time = jiffies; 129 rq->start_time = jiffies;
130 set_start_time_ns(rq);
130} 131}
131EXPORT_SYMBOL(blk_rq_init); 132EXPORT_SYMBOL(blk_rq_init);
132 133
@@ -450,6 +451,7 @@ void blk_cleanup_queue(struct request_queue *q)
450 */ 451 */
451 blk_sync_queue(q); 452 blk_sync_queue(q);
452 453
454 del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
453 mutex_lock(&q->sysfs_lock); 455 mutex_lock(&q->sysfs_lock);
454 queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); 456 queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
455 mutex_unlock(&q->sysfs_lock); 457 mutex_unlock(&q->sysfs_lock);
@@ -510,6 +512,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
510 return NULL; 512 return NULL;
511 } 513 }
512 514
515 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
516 laptop_mode_timer_fn, (unsigned long) q);
513 init_timer(&q->unplug_timer); 517 init_timer(&q->unplug_timer);
514 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); 518 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
515 INIT_LIST_HEAD(&q->timeout_list); 519 INIT_LIST_HEAD(&q->timeout_list);
@@ -568,6 +572,22 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
568{ 572{
569 struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id); 573 struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id);
570 574
575 return blk_init_allocated_queue_node(q, rfn, lock, node_id);
576}
577EXPORT_SYMBOL(blk_init_queue_node);
578
579struct request_queue *
580blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
581 spinlock_t *lock)
582{
583 return blk_init_allocated_queue_node(q, rfn, lock, -1);
584}
585EXPORT_SYMBOL(blk_init_allocated_queue);
586
587struct request_queue *
588blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
589 spinlock_t *lock, int node_id)
590{
571 if (!q) 591 if (!q)
572 return NULL; 592 return NULL;
573 593
@@ -601,7 +621,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
601 blk_put_queue(q); 621 blk_put_queue(q);
602 return NULL; 622 return NULL;
603} 623}
604EXPORT_SYMBOL(blk_init_queue_node); 624EXPORT_SYMBOL(blk_init_allocated_queue_node);
605 625
606int blk_get_queue(struct request_queue *q) 626int blk_get_queue(struct request_queue *q)
607{ 627{
@@ -1198,6 +1218,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1198 if (!blk_rq_cpu_valid(req)) 1218 if (!blk_rq_cpu_valid(req))
1199 req->cpu = bio->bi_comp_cpu; 1219 req->cpu = bio->bi_comp_cpu;
1200 drive_stat_acct(req, 0); 1220 drive_stat_acct(req, 0);
1221 elv_bio_merged(q, req, bio);
1201 if (!attempt_back_merge(q, req)) 1222 if (!attempt_back_merge(q, req))
1202 elv_merged_request(q, req, el_ret); 1223 elv_merged_request(q, req, el_ret);
1203 goto out; 1224 goto out;
@@ -1231,6 +1252,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1231 if (!blk_rq_cpu_valid(req)) 1252 if (!blk_rq_cpu_valid(req))
1232 req->cpu = bio->bi_comp_cpu; 1253 req->cpu = bio->bi_comp_cpu;
1233 drive_stat_acct(req, 0); 1254 drive_stat_acct(req, 0);
1255 elv_bio_merged(q, req, bio);
1234 if (!attempt_front_merge(q, req)) 1256 if (!attempt_front_merge(q, req))
1235 elv_merged_request(q, req, el_ret); 1257 elv_merged_request(q, req, el_ret);
1236 goto out; 1258 goto out;
@@ -1855,8 +1877,10 @@ void blk_dequeue_request(struct request *rq)
1855 * and to it is freed is accounted as io that is in progress at 1877 * and to it is freed is accounted as io that is in progress at
1856 * the driver side. 1878 * the driver side.
1857 */ 1879 */
1858 if (blk_account_rq(rq)) 1880 if (blk_account_rq(rq)) {
1859 q->in_flight[rq_is_sync(rq)]++; 1881 q->in_flight[rq_is_sync(rq)]++;
1882 set_io_start_time_ns(rq);
1883 }
1860} 1884}
1861 1885
1862/** 1886/**
@@ -2098,7 +2122,7 @@ static void blk_finish_request(struct request *req, int error)
2098 BUG_ON(blk_queued_rq(req)); 2122 BUG_ON(blk_queued_rq(req));
2099 2123
2100 if (unlikely(laptop_mode) && blk_fs_request(req)) 2124 if (unlikely(laptop_mode) && blk_fs_request(req))
2101 laptop_io_completion(); 2125 laptop_io_completion(&req->q->backing_dev_info);
2102 2126
2103 blk_delete_timer(req); 2127 blk_delete_timer(req);
2104 2128
@@ -2517,4 +2541,3 @@ int __init blk_dev_init(void)
2517 2541
2518 return 0; 2542 return 0;
2519} 2543}
2520
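For context, a hedged sketch (not part of the patch) of the caller pattern the blk_init_queue_node()/blk_init_allocated_queue() split enables: a driver can allocate the queue early, adjust it, and attach the request function later. The example_dev structure and function name are hypothetical.

struct example_dev {
	struct request_queue *queue;
};

static int example_setup_queue(struct example_dev *dev, request_fn_proc *fn,
			       spinlock_t *lock)
{
	dev->queue = blk_alloc_queue(GFP_KERNEL);
	if (!dev->queue)
		return -ENOMEM;

	/* the driver may already tweak queue limits here */

	if (!blk_init_allocated_queue(dev->queue, fn, lock)) {
		/* the failing init path above already dropped the queue
		 * reference via blk_put_queue() */
		dev->queue = NULL;
		return -ENOMEM;
	}
	return 0;
}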
diff --git a/block/blk-lib.c b/block/blk-lib.c
new file mode 100644
index 000000000000..d0216b9f22d4
--- /dev/null
+++ b/block/blk-lib.c
@@ -0,0 +1,233 @@
1/*
 2 * Functions related to generic helper functions
3 */
4#include <linux/kernel.h>
5#include <linux/module.h>
6#include <linux/bio.h>
7#include <linux/blkdev.h>
8#include <linux/scatterlist.h>
9
10#include "blk.h"
11
12static void blkdev_discard_end_io(struct bio *bio, int err)
13{
14 if (err) {
15 if (err == -EOPNOTSUPP)
16 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
17 clear_bit(BIO_UPTODATE, &bio->bi_flags);
18 }
19
20 if (bio->bi_private)
21 complete(bio->bi_private);
22 __free_page(bio_page(bio));
23
24 bio_put(bio);
25}
26
27/**
28 * blkdev_issue_discard - queue a discard
29 * @bdev: blockdev to issue discard for
30 * @sector: start sector
31 * @nr_sects: number of sectors to discard
32 * @gfp_mask: memory allocation flags (for bio_alloc)
33 * @flags: BLKDEV_IFL_* flags to control behaviour
34 *
35 * Description:
36 * Issue a discard request for the sectors in question.
37 */
38int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
39 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
40{
41 DECLARE_COMPLETION_ONSTACK(wait);
42 struct request_queue *q = bdev_get_queue(bdev);
43 int type = flags & BLKDEV_IFL_BARRIER ?
44 DISCARD_BARRIER : DISCARD_NOBARRIER;
45 struct bio *bio;
46 struct page *page;
47 int ret = 0;
48
49 if (!q)
50 return -ENXIO;
51
52 if (!blk_queue_discard(q))
53 return -EOPNOTSUPP;
54
55 while (nr_sects && !ret) {
56 unsigned int sector_size = q->limits.logical_block_size;
57 unsigned int max_discard_sectors =
58 min(q->limits.max_discard_sectors, UINT_MAX >> 9);
59
60 bio = bio_alloc(gfp_mask, 1);
61 if (!bio)
62 goto out;
63 bio->bi_sector = sector;
64 bio->bi_end_io = blkdev_discard_end_io;
65 bio->bi_bdev = bdev;
66 if (flags & BLKDEV_IFL_WAIT)
67 bio->bi_private = &wait;
68
69 /*
70 * Add a zeroed one-sector payload as that's what
 71 * our current implementations need. If we ever need
 72 * more, the interface will need revisiting.
73 */
74 page = alloc_page(gfp_mask | __GFP_ZERO);
75 if (!page)
76 goto out_free_bio;
77 if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size)
78 goto out_free_page;
79
80 /*
81 * And override the bio size - the way discard works we
82 * touch many more blocks on disk than the actual payload
83 * length.
84 */
85 if (nr_sects > max_discard_sectors) {
86 bio->bi_size = max_discard_sectors << 9;
87 nr_sects -= max_discard_sectors;
88 sector += max_discard_sectors;
89 } else {
90 bio->bi_size = nr_sects << 9;
91 nr_sects = 0;
92 }
93
94 bio_get(bio);
95 submit_bio(type, bio);
96
97 if (flags & BLKDEV_IFL_WAIT)
98 wait_for_completion(&wait);
99
100 if (bio_flagged(bio, BIO_EOPNOTSUPP))
101 ret = -EOPNOTSUPP;
102 else if (!bio_flagged(bio, BIO_UPTODATE))
103 ret = -EIO;
104 bio_put(bio);
105 }
106 return ret;
107out_free_page:
108 __free_page(page);
109out_free_bio:
110 bio_put(bio);
111out:
112 return -ENOMEM;
113}
114EXPORT_SYMBOL(blkdev_issue_discard);
115
116struct bio_batch
117{
118 atomic_t done;
119 unsigned long flags;
120 struct completion *wait;
121 bio_end_io_t *end_io;
122};
123
124static void bio_batch_end_io(struct bio *bio, int err)
125{
126 struct bio_batch *bb = bio->bi_private;
127
128 if (err) {
129 if (err == -EOPNOTSUPP)
130 set_bit(BIO_EOPNOTSUPP, &bb->flags);
131 else
132 clear_bit(BIO_UPTODATE, &bb->flags);
133 }
134 if (bb) {
135 if (bb->end_io)
136 bb->end_io(bio, err);
137 atomic_inc(&bb->done);
138 complete(bb->wait);
139 }
140 bio_put(bio);
141}
142
143/**
144 * blkdev_issue_zeroout - generate a number of zero-filled write bios
145 * @bdev: blockdev to write to
146 * @sector: start sector
147 * @nr_sects: number of sectors to write
148 * @gfp_mask: memory allocation flags (for bio_alloc)
149 * @flags: BLKDEV_IFL_* flags to control behaviour
150 *
151 * Description:
152 * Generate and issue a number of bios with zero-filled pages.
153 * Send a barrier at the beginning and at the end if requested. This guarantees
154 * correct request ordering. An empty barrier allows us to avoid a post-queue flush.
155 */
156
157int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
158 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
159{
160 int ret = 0;
161 struct bio *bio;
162 struct bio_batch bb;
163 unsigned int sz, issued = 0;
164 DECLARE_COMPLETION_ONSTACK(wait);
165
166 atomic_set(&bb.done, 0);
167 bb.flags = 1 << BIO_UPTODATE;
168 bb.wait = &wait;
169 bb.end_io = NULL;
170
171 if (flags & BLKDEV_IFL_BARRIER) {
172 /* issue async barrier before the data */
173 ret = blkdev_issue_flush(bdev, gfp_mask, NULL, 0);
174 if (ret)
175 return ret;
176 }
177submit:
178 while (nr_sects != 0) {
179 bio = bio_alloc(gfp_mask,
180 min(nr_sects, (sector_t)BIO_MAX_PAGES));
181 if (!bio)
182 break;
183
184 bio->bi_sector = sector;
185 bio->bi_bdev = bdev;
186 bio->bi_end_io = bio_batch_end_io;
187 if (flags & BLKDEV_IFL_WAIT)
188 bio->bi_private = &bb;
189
190 while (nr_sects != 0) {
191 sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects);
192 if (sz == 0)
193 /* bio has maximum size possible */
194 break;
195 ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0);
196 nr_sects -= ret >> 9;
197 sector += ret >> 9;
198 if (ret < (sz << 9))
199 break;
200 }
201 issued++;
202 submit_bio(WRITE, bio);
203 }
204 /*
205 * When all data bios are in flight, send the final barrier if requested.
206 */
207 if (nr_sects == 0 && flags & BLKDEV_IFL_BARRIER)
208 ret = blkdev_issue_flush(bdev, gfp_mask, NULL,
209 flags & BLKDEV_IFL_WAIT);
210
211
212 if (flags & BLKDEV_IFL_WAIT)
213 /* Wait for bios in-flight */
214 while ( issued != atomic_read(&bb.done))
215 wait_for_completion(&wait);
216
217 if (!test_bit(BIO_UPTODATE, &bb.flags))
218 /* One of the bios in the batch completed with an error. */
219 ret = -EIO;
220
221 if (ret)
222 goto out;
223
224 if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) {
225 ret = -EOPNOTSUPP;
226 goto out;
227 }
228 if (nr_sects != 0)
229 goto submit;
230out:
231 return ret;
232}
233EXPORT_SYMBOL(blkdev_issue_zeroout);
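A hedged usage sketch (not part of the patch): discarding a range with the new BLKDEV_IFL_* flags and falling back to blkdev_issue_zeroout() when the device does not support discard. The function name is hypothetical.

static int example_trim_or_zero(struct block_device *bdev,
				sector_t start, sector_t nr_sects)
{
	int ret;

	ret = blkdev_issue_discard(bdev, start, nr_sects, GFP_KERNEL,
				   BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
	if (ret != -EOPNOTSUPP)
		return ret;

	/* device cannot discard: write explicit zeroes instead */
	return blkdev_issue_zeroout(bdev, start, nr_sects, GFP_KERNEL,
				    BLKDEV_IFL_WAIT);
}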
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 5f127cfb2e92..ed897b5ef315 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -55,6 +55,7 @@ static const int cfq_hist_divisor = 4;
55#define RQ_CIC(rq) \ 55#define RQ_CIC(rq) \
56 ((struct cfq_io_context *) (rq)->elevator_private) 56 ((struct cfq_io_context *) (rq)->elevator_private)
57#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) 57#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2)
58#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3)
58 59
59static struct kmem_cache *cfq_pool; 60static struct kmem_cache *cfq_pool;
60static struct kmem_cache *cfq_ioc_pool; 61static struct kmem_cache *cfq_ioc_pool;
@@ -143,8 +144,6 @@ struct cfq_queue {
143 struct cfq_queue *new_cfqq; 144 struct cfq_queue *new_cfqq;
144 struct cfq_group *cfqg; 145 struct cfq_group *cfqg;
145 struct cfq_group *orig_cfqg; 146 struct cfq_group *orig_cfqg;
146 /* Sectors dispatched in current dispatch round */
147 unsigned long nr_sectors;
148}; 147};
149 148
150/* 149/*
@@ -346,7 +345,7 @@ CFQ_CFQQ_FNS(deep);
346CFQ_CFQQ_FNS(wait_busy); 345CFQ_CFQQ_FNS(wait_busy);
347#undef CFQ_CFQQ_FNS 346#undef CFQ_CFQQ_FNS
348 347
349#ifdef CONFIG_DEBUG_CFQ_IOSCHED 348#ifdef CONFIG_CFQ_GROUP_IOSCHED
350#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 349#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
351 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ 350 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
352 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ 351 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
@@ -858,7 +857,7 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
858 if (!RB_EMPTY_NODE(&cfqg->rb_node)) 857 if (!RB_EMPTY_NODE(&cfqg->rb_node))
859 cfq_rb_erase(&cfqg->rb_node, st); 858 cfq_rb_erase(&cfqg->rb_node, st);
860 cfqg->saved_workload_slice = 0; 859 cfqg->saved_workload_slice = 0;
861 blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1); 860 blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
862} 861}
863 862
864static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) 863static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
@@ -884,8 +883,7 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
884 slice_used = cfqq->allocated_slice; 883 slice_used = cfqq->allocated_slice;
885 } 884 }
886 885
887 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used, 886 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u", slice_used);
888 cfqq->nr_sectors);
889 return slice_used; 887 return slice_used;
890} 888}
891 889
@@ -919,8 +917,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
919 917
920 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, 918 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
921 st->min_vdisktime); 919 st->min_vdisktime);
922 blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl, 920 blkiocg_update_timeslice_used(&cfqg->blkg, used_sl);
923 cfqq->nr_sectors); 921 blkiocg_set_start_empty_time(&cfqg->blkg);
924} 922}
925 923
926#ifdef CONFIG_CFQ_GROUP_IOSCHED 924#ifdef CONFIG_CFQ_GROUP_IOSCHED
@@ -961,7 +959,6 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
961 if (!cfqg) 959 if (!cfqg)
962 goto done; 960 goto done;
963 961
964 cfqg->weight = blkcg->weight;
965 for_each_cfqg_st(cfqg, i, j, st) 962 for_each_cfqg_st(cfqg, i, j, st)
966 *st = CFQ_RB_ROOT; 963 *st = CFQ_RB_ROOT;
967 RB_CLEAR_NODE(&cfqg->rb_node); 964 RB_CLEAR_NODE(&cfqg->rb_node);
@@ -978,6 +975,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
978 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); 975 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
979 blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, 976 blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
980 MKDEV(major, minor)); 977 MKDEV(major, minor));
978 cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
981 979
982 /* Add group on cfqd list */ 980 /* Add group on cfqd list */
983 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); 981 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
@@ -1004,6 +1002,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
1004 return cfqg; 1002 return cfqg;
1005} 1003}
1006 1004
1005static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1006{
1007 atomic_inc(&cfqg->ref);
1008 return cfqg;
1009}
1010
1007static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) 1011static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1008{ 1012{
1009 /* Currently, all async queues are mapped to root group */ 1013 /* Currently, all async queues are mapped to root group */
@@ -1087,6 +1091,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
1087{ 1091{
1088 return &cfqd->root_group; 1092 return &cfqd->root_group;
1089} 1093}
1094
1095static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1096{
1097 return cfqg;
1098}
1099
1090static inline void 1100static inline void
1091cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { 1101cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
1092 cfqq->cfqg = cfqg; 1102 cfqq->cfqg = cfqg;
@@ -1389,7 +1399,12 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
1389{ 1399{
1390 elv_rb_del(&cfqq->sort_list, rq); 1400 elv_rb_del(&cfqq->sort_list, rq);
1391 cfqq->queued[rq_is_sync(rq)]--; 1401 cfqq->queued[rq_is_sync(rq)]--;
1402 blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq),
1403 rq_is_sync(rq));
1392 cfq_add_rq_rb(rq); 1404 cfq_add_rq_rb(rq);
1405 blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
1406 &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq),
1407 rq_is_sync(rq));
1393} 1408}
1394 1409
1395static struct request * 1410static struct request *
@@ -1445,6 +1460,8 @@ static void cfq_remove_request(struct request *rq)
1445 cfq_del_rq_rb(rq); 1460 cfq_del_rq_rb(rq);
1446 1461
1447 cfqq->cfqd->rq_queued--; 1462 cfqq->cfqd->rq_queued--;
1463 blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq),
1464 rq_is_sync(rq));
1448 if (rq_is_meta(rq)) { 1465 if (rq_is_meta(rq)) {
1449 WARN_ON(!cfqq->meta_pending); 1466 WARN_ON(!cfqq->meta_pending);
1450 cfqq->meta_pending--; 1467 cfqq->meta_pending--;
@@ -1476,6 +1493,13 @@ static void cfq_merged_request(struct request_queue *q, struct request *req,
1476 } 1493 }
1477} 1494}
1478 1495
1496static void cfq_bio_merged(struct request_queue *q, struct request *req,
1497 struct bio *bio)
1498{
1499 blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, bio_data_dir(bio),
1500 cfq_bio_sync(bio));
1501}
1502
1479static void 1503static void
1480cfq_merged_requests(struct request_queue *q, struct request *rq, 1504cfq_merged_requests(struct request_queue *q, struct request *rq,
1481 struct request *next) 1505 struct request *next)
@@ -1493,6 +1517,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
1493 if (cfqq->next_rq == next) 1517 if (cfqq->next_rq == next)
1494 cfqq->next_rq = rq; 1518 cfqq->next_rq = rq;
1495 cfq_remove_request(next); 1519 cfq_remove_request(next);
1520 blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(next),
1521 rq_is_sync(next));
1496} 1522}
1497 1523
1498static int cfq_allow_merge(struct request_queue *q, struct request *rq, 1524static int cfq_allow_merge(struct request_queue *q, struct request *rq,
@@ -1520,18 +1546,24 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
1520 return cfqq == RQ_CFQQ(rq); 1546 return cfqq == RQ_CFQQ(rq);
1521} 1547}
1522 1548
1549static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1550{
1551 del_timer(&cfqd->idle_slice_timer);
1552 blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
1553}
1554
1523static void __cfq_set_active_queue(struct cfq_data *cfqd, 1555static void __cfq_set_active_queue(struct cfq_data *cfqd,
1524 struct cfq_queue *cfqq) 1556 struct cfq_queue *cfqq)
1525{ 1557{
1526 if (cfqq) { 1558 if (cfqq) {
1527 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", 1559 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
1528 cfqd->serving_prio, cfqd->serving_type); 1560 cfqd->serving_prio, cfqd->serving_type);
1561 blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
1529 cfqq->slice_start = 0; 1562 cfqq->slice_start = 0;
1530 cfqq->dispatch_start = jiffies; 1563 cfqq->dispatch_start = jiffies;
1531 cfqq->allocated_slice = 0; 1564 cfqq->allocated_slice = 0;
1532 cfqq->slice_end = 0; 1565 cfqq->slice_end = 0;
1533 cfqq->slice_dispatch = 0; 1566 cfqq->slice_dispatch = 0;
1534 cfqq->nr_sectors = 0;
1535 1567
1536 cfq_clear_cfqq_wait_request(cfqq); 1568 cfq_clear_cfqq_wait_request(cfqq);
1537 cfq_clear_cfqq_must_dispatch(cfqq); 1569 cfq_clear_cfqq_must_dispatch(cfqq);
@@ -1539,7 +1571,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
1539 cfq_clear_cfqq_fifo_expire(cfqq); 1571 cfq_clear_cfqq_fifo_expire(cfqq);
1540 cfq_mark_cfqq_slice_new(cfqq); 1572 cfq_mark_cfqq_slice_new(cfqq);
1541 1573
1542 del_timer(&cfqd->idle_slice_timer); 1574 cfq_del_timer(cfqd, cfqq);
1543 } 1575 }
1544 1576
1545 cfqd->active_queue = cfqq; 1577 cfqd->active_queue = cfqq;
@@ -1555,7 +1587,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1555 cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); 1587 cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
1556 1588
1557 if (cfq_cfqq_wait_request(cfqq)) 1589 if (cfq_cfqq_wait_request(cfqq))
1558 del_timer(&cfqd->idle_slice_timer); 1590 cfq_del_timer(cfqd, cfqq);
1559 1591
1560 cfq_clear_cfqq_wait_request(cfqq); 1592 cfq_clear_cfqq_wait_request(cfqq);
1561 cfq_clear_cfqq_wait_busy(cfqq); 1593 cfq_clear_cfqq_wait_busy(cfqq);
@@ -1857,6 +1889,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1857 sl = cfqd->cfq_slice_idle; 1889 sl = cfqd->cfq_slice_idle;
1858 1890
1859 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 1891 mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
1892 blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
1860 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); 1893 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
1861} 1894}
1862 1895
@@ -1876,7 +1909,8 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
1876 elv_dispatch_sort(q, rq); 1909 elv_dispatch_sort(q, rq);
1877 1910
1878 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; 1911 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
1879 cfqq->nr_sectors += blk_rq_sectors(rq); 1912 blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
1913 rq_data_dir(rq), rq_is_sync(rq));
1880} 1914}
1881 1915
1882/* 1916/*
@@ -3185,11 +3219,14 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3185 if (cfq_cfqq_wait_request(cfqq)) { 3219 if (cfq_cfqq_wait_request(cfqq)) {
3186 if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || 3220 if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
3187 cfqd->busy_queues > 1) { 3221 cfqd->busy_queues > 1) {
3188 del_timer(&cfqd->idle_slice_timer); 3222 cfq_del_timer(cfqd, cfqq);
3189 cfq_clear_cfqq_wait_request(cfqq); 3223 cfq_clear_cfqq_wait_request(cfqq);
3190 __blk_run_queue(cfqd->queue); 3224 __blk_run_queue(cfqd->queue);
3191 } else 3225 } else {
3226 blkiocg_update_idle_time_stats(
3227 &cfqq->cfqg->blkg);
3192 cfq_mark_cfqq_must_dispatch(cfqq); 3228 cfq_mark_cfqq_must_dispatch(cfqq);
3229 }
3193 } 3230 }
3194 } else if (cfq_should_preempt(cfqd, cfqq, rq)) { 3231 } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
3195 /* 3232 /*
@@ -3214,7 +3251,9 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
3214 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); 3251 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
3215 list_add_tail(&rq->queuelist, &cfqq->fifo); 3252 list_add_tail(&rq->queuelist, &cfqq->fifo);
3216 cfq_add_rq_rb(rq); 3253 cfq_add_rq_rb(rq);
3217 3254 blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
3255 &cfqd->serving_group->blkg, rq_data_dir(rq),
3256 rq_is_sync(rq));
3218 cfq_rq_enqueued(cfqd, cfqq, rq); 3257 cfq_rq_enqueued(cfqd, cfqq, rq);
3219} 3258}
3220 3259
@@ -3300,6 +3339,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
3300 WARN_ON(!cfqq->dispatched); 3339 WARN_ON(!cfqq->dispatched);
3301 cfqd->rq_in_driver--; 3340 cfqd->rq_in_driver--;
3302 cfqq->dispatched--; 3341 cfqq->dispatched--;
3342 blkiocg_update_completion_stats(&cfqq->cfqg->blkg, rq_start_time_ns(rq),
3343 rq_io_start_time_ns(rq), rq_data_dir(rq),
3344 rq_is_sync(rq));
3303 3345
3304 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; 3346 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3305 3347
@@ -3440,6 +3482,10 @@ static void cfq_put_request(struct request *rq)
3440 rq->elevator_private = NULL; 3482 rq->elevator_private = NULL;
3441 rq->elevator_private2 = NULL; 3483 rq->elevator_private2 = NULL;
3442 3484
3485 /* Put down rq reference on cfqg */
3486 cfq_put_cfqg(RQ_CFQG(rq));
3487 rq->elevator_private3 = NULL;
3488
3443 cfq_put_queue(cfqq); 3489 cfq_put_queue(cfqq);
3444 } 3490 }
3445} 3491}
@@ -3528,6 +3574,7 @@ new_queue:
3528 3574
3529 rq->elevator_private = cic; 3575 rq->elevator_private = cic;
3530 rq->elevator_private2 = cfqq; 3576 rq->elevator_private2 = cfqq;
3577 rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
3531 return 0; 3578 return 0;
3532 3579
3533queue_fail: 3580queue_fail:
@@ -3743,7 +3790,6 @@ static void *cfq_init_queue(struct request_queue *q)
3743 * second, in order to have larger depth for async operations. 3790 * second, in order to have larger depth for async operations.
3744 */ 3791 */
3745 cfqd->last_delayed_sync = jiffies - HZ; 3792 cfqd->last_delayed_sync = jiffies - HZ;
3746 INIT_RCU_HEAD(&cfqd->rcu);
3747 return cfqd; 3793 return cfqd;
3748} 3794}
3749 3795
@@ -3872,6 +3918,7 @@ static struct elevator_type iosched_cfq = {
3872 .elevator_merged_fn = cfq_merged_request, 3918 .elevator_merged_fn = cfq_merged_request,
3873 .elevator_merge_req_fn = cfq_merged_requests, 3919 .elevator_merge_req_fn = cfq_merged_requests,
3874 .elevator_allow_merge_fn = cfq_allow_merge, 3920 .elevator_allow_merge_fn = cfq_allow_merge,
3921 .elevator_bio_merged_fn = cfq_bio_merged,
3875 .elevator_dispatch_fn = cfq_dispatch_requests, 3922 .elevator_dispatch_fn = cfq_dispatch_requests,
3876 .elevator_add_req_fn = cfq_insert_request, 3923 .elevator_add_req_fn = cfq_insert_request,
3877 .elevator_activate_req_fn = cfq_activate_request, 3924 .elevator_activate_req_fn = cfq_activate_request,
diff --git a/block/elevator.c b/block/elevator.c
index 76e3702d5381..6df2b5056b51 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -539,6 +539,15 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
539 q->last_merge = rq; 539 q->last_merge = rq;
540} 540}
541 541
542void elv_bio_merged(struct request_queue *q, struct request *rq,
543 struct bio *bio)
544{
545 struct elevator_queue *e = q->elevator;
546
547 if (e->ops->elevator_bio_merged_fn)
548 e->ops->elevator_bio_merged_fn(q, rq, bio);
549}
550
542void elv_requeue_request(struct request_queue *q, struct request *rq) 551void elv_requeue_request(struct request_queue *q, struct request *rq)
543{ 552{
544 /* 553 /*
@@ -921,6 +930,7 @@ int elv_register_queue(struct request_queue *q)
921 } 930 }
922 return error; 931 return error;
923} 932}
933EXPORT_SYMBOL(elv_register_queue);
924 934
925static void __elv_unregister_queue(struct elevator_queue *e) 935static void __elv_unregister_queue(struct elevator_queue *e)
926{ 936{
@@ -933,6 +943,7 @@ void elv_unregister_queue(struct request_queue *q)
933 if (q) 943 if (q)
934 __elv_unregister_queue(q->elevator); 944 __elv_unregister_queue(q->elevator);
935} 945}
946EXPORT_SYMBOL(elv_unregister_queue);
936 947
937void elv_register(struct elevator_type *e) 948void elv_register(struct elevator_type *e)
938{ 949{
diff --git a/block/genhd.c b/block/genhd.c
index d13ba76a169c..59a2db6fecef 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -596,6 +596,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno)
596 596
597 return disk; 597 return disk;
598} 598}
599EXPORT_SYMBOL(get_gendisk);
599 600
600/** 601/**
601 * bdget_disk - do bdget() by gendisk and partition number 602 * bdget_disk - do bdget() by gendisk and partition number
@@ -987,7 +988,6 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno)
987 if (!new_ptbl) 988 if (!new_ptbl)
988 return -ENOMEM; 989 return -ENOMEM;
989 990
990 INIT_RCU_HEAD(&new_ptbl->rcu_head);
991 new_ptbl->len = target; 991 new_ptbl->len = target;
992 992
993 for (i = 0; i < len; i++) 993 for (i = 0; i < len; i++)
diff --git a/block/ioctl.c b/block/ioctl.c
index 8905d2a2a717..e8eb679f2f9b 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -126,7 +126,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
126 if (start + len > (bdev->bd_inode->i_size >> 9)) 126 if (start + len > (bdev->bd_inode->i_size >> 9))
127 return -EINVAL; 127 return -EINVAL;
128 return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, 128 return blkdev_issue_discard(bdev, start, len, GFP_KERNEL,
129 DISCARD_FL_WAIT); 129 BLKDEV_IFL_WAIT);
130} 130}
131 131
132static int put_ushort(unsigned long arg, unsigned short val) 132static int put_ushort(unsigned long arg, unsigned short val)
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 77bfce52e9ca..de277689da61 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -76,6 +76,17 @@ config BLK_DEV_XD
76 76
77 It's pretty unlikely that you have one of these: say N. 77 It's pretty unlikely that you have one of these: say N.
78 78
79config GDROM
80 tristate "SEGA Dreamcast GD-ROM drive"
81 depends on SH_DREAMCAST
82 help
83 A standard SEGA Dreamcast comes with a modified CD ROM drive called a
84 "GD-ROM" by SEGA to signify it is capable of reading special disks
85 with up to 1 GB of data. This drive will also read standard CD ROM
86 disks. Select this option to access any disks in your GD ROM drive.
87 Most users will want to say "Y" here.
88 You can also build this as a module which will be called gdrom.
89
79config PARIDE 90config PARIDE
80 tristate "Parallel port IDE device support" 91 tristate "Parallel port IDE device support"
81 depends on PARPORT_PC 92 depends on PARPORT_PC
@@ -103,17 +114,6 @@ config PARIDE
103 "MicroSolutions backpack protocol", "DataStor Commuter protocol" 114 "MicroSolutions backpack protocol", "DataStor Commuter protocol"
104 etc.). 115 etc.).
105 116
106config GDROM
107 tristate "SEGA Dreamcast GD-ROM drive"
108 depends on SH_DREAMCAST
109 help
110 A standard SEGA Dreamcast comes with a modified CD ROM drive called a
111 "GD-ROM" by SEGA to signify it is capable of reading special disks
112 with up to 1 GB of data. This drive will also read standard CD ROM
113 disks. Select this option to access any disks in your GD ROM drive.
114 Most users will want to say "Y" here.
115 You can also build this as a module which will be called gdrom.
116
117source "drivers/block/paride/Kconfig" 117source "drivers/block/paride/Kconfig"
118 118
119config BLK_CPQ_DA 119config BLK_CPQ_DA
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 3390716898d5..e3f88d6e1412 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -84,6 +84,9 @@ struct drbd_bitmap {
84#define BM_MD_IO_ERROR 1 84#define BM_MD_IO_ERROR 1
85#define BM_P_VMALLOCED 2 85#define BM_P_VMALLOCED 2
86 86
87static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
88 unsigned long e, int val, const enum km_type km);
89
87static int bm_is_locked(struct drbd_bitmap *b) 90static int bm_is_locked(struct drbd_bitmap *b)
88{ 91{
89 return test_bit(BM_LOCKED, &b->bm_flags); 92 return test_bit(BM_LOCKED, &b->bm_flags);
@@ -441,7 +444,7 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
441 * In case this is actually a resize, we copy the old bitmap into the new one. 444 * In case this is actually a resize, we copy the old bitmap into the new one.
442 * Otherwise, the bitmap is initialized to all bits set. 445 * Otherwise, the bitmap is initialized to all bits set.
443 */ 446 */
444int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity) 447int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
445{ 448{
446 struct drbd_bitmap *b = mdev->bitmap; 449 struct drbd_bitmap *b = mdev->bitmap;
447 unsigned long bits, words, owords, obits, *p_addr, *bm; 450 unsigned long bits, words, owords, obits, *p_addr, *bm;
@@ -516,7 +519,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity)
516 obits = b->bm_bits; 519 obits = b->bm_bits;
517 520
518 growing = bits > obits; 521 growing = bits > obits;
519 if (opages) 522 if (opages && growing && set_new_bits)
520 bm_set_surplus(b); 523 bm_set_surplus(b);
521 524
522 b->bm_pages = npages; 525 b->bm_pages = npages;
@@ -526,8 +529,12 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity)
526 b->bm_dev_capacity = capacity; 529 b->bm_dev_capacity = capacity;
527 530
528 if (growing) { 531 if (growing) {
529 bm_memset(b, owords, 0xff, words-owords); 532 if (set_new_bits) {
530 b->bm_set += bits - obits; 533 bm_memset(b, owords, 0xff, words-owords);
534 b->bm_set += bits - obits;
535 } else
536 bm_memset(b, owords, 0x00, words-owords);
537
531 } 538 }
532 539
533 if (want < have) { 540 if (want < have) {
@@ -773,7 +780,7 @@ static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int
773 /* nothing to do, on disk == in memory */ 780 /* nothing to do, on disk == in memory */
774# define bm_cpu_to_lel(x) ((void)0) 781# define bm_cpu_to_lel(x) ((void)0)
775# else 782# else
776void bm_cpu_to_lel(struct drbd_bitmap *b) 783static void bm_cpu_to_lel(struct drbd_bitmap *b)
777{ 784{
778 /* need to cpu_to_lel all the pages ... 785 /* need to cpu_to_lel all the pages ...
779 * this may be optimized by using 786 * this may be optimized by using
@@ -1015,7 +1022,7 @@ unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_f
1015 * wants bitnr, not sector. 1022 * wants bitnr, not sector.
1016 * expected to be called for only a few bits (e - s about BITS_PER_LONG). 1023 * expected to be called for only a few bits (e - s about BITS_PER_LONG).
1017 * Must hold bitmap lock already. */ 1024 * Must hold bitmap lock already. */
1018int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, 1025static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
1019 unsigned long e, int val, const enum km_type km) 1026 unsigned long e, int val, const enum km_type km)
1020{ 1027{
1021 struct drbd_bitmap *b = mdev->bitmap; 1028 struct drbd_bitmap *b = mdev->bitmap;
@@ -1053,7 +1060,7 @@ int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
1053 * for val != 0, we change 0 -> 1, return code positive 1060 * for val != 0, we change 0 -> 1, return code positive
1054 * for val == 0, we change 1 -> 0, return code negative 1061 * for val == 0, we change 1 -> 0, return code negative
1055 * wants bitnr, not sector */ 1062 * wants bitnr, not sector */
1056int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, 1063static int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
1057 const unsigned long e, int val) 1064 const unsigned long e, int val)
1058{ 1065{
1059 unsigned long flags; 1066 unsigned long flags;
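A minimal sketch (not part of the patch) of the new drbd_bm_resize() calling convention: the added set_new_bits argument decides whether the bits covering a grown area are set (marking the new blocks out-of-sync) or cleared. The wrapper below is hypothetical.

static int example_grow_bitmap(struct drbd_conf *mdev, sector_t new_capacity,
			       int mark_new_area_out_of_sync)
{
	/* pass 1 to set all bits for the grown area (forces a resync of it),
	 * 0 to leave the new area clean */
	return drbd_bm_resize(mdev, new_capacity, mark_new_area_out_of_sync);
}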
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index e5e86a781820..e9654c8d5b62 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -132,6 +132,7 @@ enum {
132 DRBD_FAULT_DT_RA = 6, /* data read ahead */ 132 DRBD_FAULT_DT_RA = 6, /* data read ahead */
133 DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */ 133 DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */
134 DRBD_FAULT_AL_EE = 8, /* alloc ee */ 134 DRBD_FAULT_AL_EE = 8, /* alloc ee */
135 DRBD_FAULT_RECEIVE = 9, /* Changes some bytes upon receiving a [rs]data block */
135 136
136 DRBD_FAULT_MAX, 137 DRBD_FAULT_MAX,
137}; 138};
@@ -208,8 +209,11 @@ enum drbd_packets {
208 P_RS_IS_IN_SYNC = 0x22, /* meta socket */ 209 P_RS_IS_IN_SYNC = 0x22, /* meta socket */
209 P_SYNC_PARAM89 = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */ 210 P_SYNC_PARAM89 = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */
210 P_COMPRESSED_BITMAP = 0x24, /* compressed or otherwise encoded bitmap transfer */ 211 P_COMPRESSED_BITMAP = 0x24, /* compressed or otherwise encoded bitmap transfer */
212 /* P_CKPT_FENCE_REQ = 0x25, * currently reserved for protocol D */
213 /* P_CKPT_DISABLE_REQ = 0x26, * currently reserved for protocol D */
214 P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */
211 215
212 P_MAX_CMD = 0x25, 216 P_MAX_CMD = 0x28,
213 P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ 217 P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
214 P_MAX_OPT_CMD = 0x101, 218 P_MAX_OPT_CMD = 0x101,
215 219
@@ -264,6 +268,7 @@ static inline const char *cmdname(enum drbd_packets cmd)
264 [P_CSUM_RS_REQUEST] = "CsumRSRequest", 268 [P_CSUM_RS_REQUEST] = "CsumRSRequest",
265 [P_RS_IS_IN_SYNC] = "CsumRSIsInSync", 269 [P_RS_IS_IN_SYNC] = "CsumRSIsInSync",
266 [P_COMPRESSED_BITMAP] = "CBitmap", 270 [P_COMPRESSED_BITMAP] = "CBitmap",
271 [P_DELAY_PROBE] = "DelayProbe",
267 [P_MAX_CMD] = NULL, 272 [P_MAX_CMD] = NULL,
268 }; 273 };
269 274
@@ -481,7 +486,8 @@ struct p_sizes {
481 u64 u_size; /* user requested size */ 486 u64 u_size; /* user requested size */
482 u64 c_size; /* current exported size */ 487 u64 c_size; /* current exported size */
483 u32 max_segment_size; /* Maximal size of a BIO */ 488 u32 max_segment_size; /* Maximal size of a BIO */
484 u32 queue_order_type; 489 u16 queue_order_type; /* not yet implemented in DRBD*/
490 u16 dds_flags; /* use enum dds_flags here. */
485} __packed; 491} __packed;
486 492
487struct p_state { 493struct p_state {
@@ -538,6 +544,18 @@ struct p_compressed_bm {
538 u8 code[0]; 544 u8 code[0];
539} __packed; 545} __packed;
540 546
547struct p_delay_probe {
548 struct p_header head;
549 u32 seq_num; /* sequence number to match the two probe packets */
550 u32 offset; /* usecs the probe got sent after the reference time point */
551} __packed;
552
553struct delay_probe {
554 struct list_head list;
555 unsigned int seq_num;
556 struct timeval time;
557};
558
541/* DCBP: Drbd Compressed Bitmap Packet ... */ 559/* DCBP: Drbd Compressed Bitmap Packet ... */
542static inline enum drbd_bitmap_code 560static inline enum drbd_bitmap_code
543DCBP_get_code(struct p_compressed_bm *p) 561DCBP_get_code(struct p_compressed_bm *p)
@@ -722,22 +740,6 @@ enum epoch_event {
722 EV_CLEANUP = 32, /* used as flag */ 740 EV_CLEANUP = 32, /* used as flag */
723}; 741};
724 742
725struct drbd_epoch_entry {
726 struct drbd_work w;
727 struct drbd_conf *mdev;
728 struct bio *private_bio;
729 struct hlist_node colision;
730 sector_t sector;
731 unsigned int size;
732 struct drbd_epoch *epoch;
733
734 /* up to here, the struct layout is identical to drbd_request;
735 * we might be able to use that to our advantage... */
736
737 unsigned int flags;
738 u64 block_id;
739};
740
741struct drbd_wq_barrier { 743struct drbd_wq_barrier {
742 struct drbd_work w; 744 struct drbd_work w;
743 struct completion done; 745 struct completion done;
@@ -748,17 +750,49 @@ struct digest_info {
748 void *digest; 750 void *digest;
749}; 751};
750 752
751/* ee flag bits */ 753struct drbd_epoch_entry {
754 struct drbd_work w;
755 struct hlist_node colision;
756 struct drbd_epoch *epoch;
757 struct drbd_conf *mdev;
758 struct page *pages;
759 atomic_t pending_bios;
760 unsigned int size;
761 /* see comments on ee flag bits below */
762 unsigned long flags;
763 sector_t sector;
764 u64 block_id;
765};
766
767/* ee flag bits.
768 * While corresponding bios are in flight, the only modification will be
769 * set_bit WAS_ERROR, which has to be atomic.
770 * If no bios are in flight yet, or all have been completed,
771 * non-atomic modification to ee->flags is ok.
772 */
752enum { 773enum {
753 __EE_CALL_AL_COMPLETE_IO, 774 __EE_CALL_AL_COMPLETE_IO,
754 __EE_CONFLICT_PENDING,
755 __EE_MAY_SET_IN_SYNC, 775 __EE_MAY_SET_IN_SYNC,
776
777 /* This epoch entry closes an epoch using a barrier.
778 * On successful completion, the epoch is released,
779 * and the P_BARRIER_ACK sent. */
756 __EE_IS_BARRIER, 780 __EE_IS_BARRIER,
781
782 /* In case a barrier failed,
783 * we need to resubmit without the barrier flag. */
784 __EE_RESUBMITTED,
785
786 /* we may have several bios per epoch entry.
787 * if any of those fail, we set this flag atomically
788 * from the endio callback */
789 __EE_WAS_ERROR,
757}; 790};
758#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) 791#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
759#define EE_CONFLICT_PENDING (1<<__EE_CONFLICT_PENDING)
760#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) 792#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
761#define EE_IS_BARRIER (1<<__EE_IS_BARRIER) 793#define EE_IS_BARRIER (1<<__EE_IS_BARRIER)
794#define EE_RESUBMITTED (1<<__EE_RESUBMITTED)
795#define EE_WAS_ERROR (1<<__EE_WAS_ERROR)
762 796
763/* global flag bits */ 797/* global flag bits */
764enum { 798enum {
@@ -908,9 +942,12 @@ struct drbd_conf {
908 unsigned int ko_count; 942 unsigned int ko_count;
909 struct drbd_work resync_work, 943 struct drbd_work resync_work,
910 unplug_work, 944 unplug_work,
911 md_sync_work; 945 md_sync_work,
946 delay_probe_work,
947 uuid_work;
912 struct timer_list resync_timer; 948 struct timer_list resync_timer;
913 struct timer_list md_sync_timer; 949 struct timer_list md_sync_timer;
950 struct timer_list delay_probe_timer;
914 951
915 /* Used after attach while negotiating new disk state. */ 952 /* Used after attach while negotiating new disk state. */
916 union drbd_state new_state_tmp; 953 union drbd_state new_state_tmp;
@@ -1026,6 +1063,13 @@ struct drbd_conf {
1026 u64 ed_uuid; /* UUID of the exposed data */ 1063 u64 ed_uuid; /* UUID of the exposed data */
1027 struct mutex state_mutex; 1064 struct mutex state_mutex;
1028 char congestion_reason; /* Why we were congested... */ 1065 char congestion_reason; /* Why we were congested... */
1066 struct list_head delay_probes; /* protected by peer_seq_lock */
1067 int data_delay; /* Delay of packets on the data-sock behind meta-sock */
1068 unsigned int delay_seq; /* To generate sequence numbers of delay probes */
1069 struct timeval dps_time; /* delay-probes-start-time */
1070 unsigned int dp_volume_last; /* send_cnt of last delay probe */
1071 int c_sync_rate; /* current resync rate after delay_probe magic */
1072 atomic_t new_c_uuid;
1029}; 1073};
1030 1074
1031static inline struct drbd_conf *minor_to_mdev(unsigned int minor) 1075static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
@@ -1081,6 +1125,11 @@ enum chg_state_flags {
1081 CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE, 1125 CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE,
1082}; 1126};
1083 1127
1128enum dds_flags {
1129 DDSF_FORCED = 1,
1130 DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */
1131};
1132
1084extern void drbd_init_set_defaults(struct drbd_conf *mdev); 1133extern void drbd_init_set_defaults(struct drbd_conf *mdev);
1085extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, 1134extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
1086 union drbd_state mask, union drbd_state val); 1135 union drbd_state mask, union drbd_state val);
@@ -1113,7 +1162,7 @@ extern int drbd_send_protocol(struct drbd_conf *mdev);
1113extern int drbd_send_uuids(struct drbd_conf *mdev); 1162extern int drbd_send_uuids(struct drbd_conf *mdev);
1114extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); 1163extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
1115extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val); 1164extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val);
1116extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply); 1165extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags);
1117extern int _drbd_send_state(struct drbd_conf *mdev); 1166extern int _drbd_send_state(struct drbd_conf *mdev);
1118extern int drbd_send_state(struct drbd_conf *mdev); 1167extern int drbd_send_state(struct drbd_conf *mdev);
1119extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, 1168extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
@@ -1311,7 +1360,7 @@ struct bm_extent {
1311#define APP_R_HSIZE 15 1360#define APP_R_HSIZE 15
1312 1361
1313extern int drbd_bm_init(struct drbd_conf *mdev); 1362extern int drbd_bm_init(struct drbd_conf *mdev);
1314extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors); 1363extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new_bits);
1315extern void drbd_bm_cleanup(struct drbd_conf *mdev); 1364extern void drbd_bm_cleanup(struct drbd_conf *mdev);
1316extern void drbd_bm_set_all(struct drbd_conf *mdev); 1365extern void drbd_bm_set_all(struct drbd_conf *mdev);
1317extern void drbd_bm_clear_all(struct drbd_conf *mdev); 1366extern void drbd_bm_clear_all(struct drbd_conf *mdev);
@@ -1383,7 +1432,7 @@ extern void drbd_resume_io(struct drbd_conf *mdev);
1383extern char *ppsize(char *buf, unsigned long long size); 1432extern char *ppsize(char *buf, unsigned long long size);
1384extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int); 1433extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int);
1385enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; 1434enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
1386extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, int force) __must_hold(local); 1435extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local);
1387extern void resync_after_online_grow(struct drbd_conf *); 1436extern void resync_after_online_grow(struct drbd_conf *);
1388extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local); 1437extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local);
1389extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, 1438extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role,
@@ -1414,7 +1463,8 @@ static inline void ov_oos_print(struct drbd_conf *mdev)
1414} 1463}
1415 1464
1416 1465
1417extern void drbd_csum(struct drbd_conf *, struct crypto_hash *, struct bio *, void *); 1466extern void drbd_csum_bio(struct drbd_conf *, struct crypto_hash *, struct bio *, void *);
1467extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *, struct drbd_epoch_entry *, void *);
1418/* worker callbacks */ 1468/* worker callbacks */
1419extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int); 1469extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int);
1420extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int); 1470extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int);
@@ -1438,6 +1488,8 @@ extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
1438extern void resync_timer_fn(unsigned long data); 1488extern void resync_timer_fn(unsigned long data);
1439 1489
1440/* drbd_receiver.c */ 1490/* drbd_receiver.c */
1491extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
1492 const unsigned rw, const int fault_type);
1441extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list); 1493extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
1442extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, 1494extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
1443 u64 id, 1495 u64 id,
@@ -1593,6 +1645,41 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
1593 * inline helper functions 1645 * inline helper functions
1594 *************************/ 1646 *************************/
1595 1647
1648/* see also page_chain_add and friends in drbd_receiver.c */
1649static inline struct page *page_chain_next(struct page *page)
1650{
1651 return (struct page *)page_private(page);
1652}
1653#define page_chain_for_each(page) \
1654 for (; page && ({ prefetch(page_chain_next(page)); 1; }); \
1655 page = page_chain_next(page))
1656#define page_chain_for_each_safe(page, n) \
1657 for (; page && ({ n = page_chain_next(page); 1; }); page = n)
1658
1659static inline int drbd_bio_has_active_page(struct bio *bio)
1660{
1661 struct bio_vec *bvec;
1662 int i;
1663
1664 __bio_for_each_segment(bvec, bio, i, 0) {
1665 if (page_count(bvec->bv_page) > 1)
1666 return 1;
1667 }
1668
1669 return 0;
1670}
1671
1672static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e)
1673{
1674 struct page *page = e->pages;
1675 page_chain_for_each(page) {
1676 if (page_count(page) > 1)
1677 return 1;
1678 }
1679 return 0;
1680}
1681
1682
1596static inline void drbd_state_lock(struct drbd_conf *mdev) 1683static inline void drbd_state_lock(struct drbd_conf *mdev)
1597{ 1684{
1598 wait_event(mdev->misc_wait, 1685 wait_event(mdev->misc_wait,
@@ -2132,13 +2219,15 @@ static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
2132 return 0; 2219 return 0;
2133 if (test_bit(BITMAP_IO, &mdev->flags)) 2220 if (test_bit(BITMAP_IO, &mdev->flags))
2134 return 0; 2221 return 0;
2222 if (atomic_read(&mdev->new_c_uuid))
2223 return 0;
2135 return 1; 2224 return 1;
2136} 2225}
2137 2226
2138/* I'd like to use wait_event_lock_irq, 2227/* I'd like to use wait_event_lock_irq,
2139 * but I'm not sure when it got introduced, 2228 * but I'm not sure when it got introduced,
2140 * and not sure when it has 3 or 4 arguments */ 2229 * and not sure when it has 3 or 4 arguments */
2141static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two) 2230static inline void inc_ap_bio(struct drbd_conf *mdev, int count)
2142{ 2231{
2143 /* compare with after_state_ch, 2232 /* compare with after_state_ch,
2144 * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */ 2233 * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */
@@ -2152,6 +2241,9 @@ static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two)
2152 * to avoid races with the reconnect code, 2241 * to avoid races with the reconnect code,
2153 * we need to atomic_inc within the spinlock. */ 2242 * we need to atomic_inc within the spinlock. */
2154 2243
2244 if (atomic_read(&mdev->new_c_uuid) && atomic_add_unless(&mdev->new_c_uuid, -1, 1))
2245 drbd_queue_work_front(&mdev->data.work, &mdev->uuid_work);
2246
2155 spin_lock_irq(&mdev->req_lock); 2247 spin_lock_irq(&mdev->req_lock);
2156 while (!__inc_ap_bio_cond(mdev)) { 2248 while (!__inc_ap_bio_cond(mdev)) {
2157 prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE); 2249 prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
@@ -2160,7 +2252,7 @@ static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two)
2160 finish_wait(&mdev->misc_wait, &wait); 2252 finish_wait(&mdev->misc_wait, &wait);
2161 spin_lock_irq(&mdev->req_lock); 2253 spin_lock_irq(&mdev->req_lock);
2162 } 2254 }
2163 atomic_add(one_or_two, &mdev->ap_bio_cnt); 2255 atomic_add(count, &mdev->ap_bio_cnt);
2164 spin_unlock_irq(&mdev->req_lock); 2256 spin_unlock_irq(&mdev->req_lock);
2165} 2257}
2166 2258
@@ -2251,7 +2343,8 @@ static inline void drbd_md_flush(struct drbd_conf *mdev)
2251 if (test_bit(MD_NO_BARRIER, &mdev->flags)) 2343 if (test_bit(MD_NO_BARRIER, &mdev->flags))
2252 return; 2344 return;
2253 2345
2254 r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL); 2346 r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL,
2347 BLKDEV_IFL_WAIT);
2255 if (r) { 2348 if (r) {
2256 set_bit(MD_NO_BARRIER, &mdev->flags); 2349 set_bit(MD_NO_BARRIER, &mdev->flags);
2257 dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); 2350 dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
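
The drbd_int.h hunks above replace the per-epoch-entry bio with a singly linked chain of pages, using page_private() as the "next" pointer and page_chain_for_each() to walk it. A minimal standalone sketch of that iteration pattern, using a plain userspace struct in place of struct page (illustrative only, not part of the patch):

    #include <stdio.h>
    #include <stddef.h>

    /* stand-in for struct page; 'next' plays the role of page_private(page),
     * 'refcount' plays the role of page_count(page) */
    struct fake_page {
        struct fake_page *next;
        int refcount;
    };

    #define chain_for_each(p) \
        for (; (p) != NULL; (p) = (p)->next)

    /* analogous to drbd_ee_has_active_page(): is any page still referenced elsewhere? */
    static int chain_has_active_page(struct fake_page *p)
    {
        chain_for_each(p)
            if (p->refcount > 1)
                return 1;
        return 0;
    }

    int main(void)
    {
        struct fake_page c = { NULL, 1 }, b = { &c, 2 }, a = { &b, 1 };
        printf("active: %d\n", chain_has_active_page(&a)); /* prints 1 */
        return 0;
    }

The refcount test mirrors what reclaim_net_ee() relies on: as long as one page of the chain is still referenced by the network layer, the whole entry stays on the net_ee list.
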
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 93d1f9b469d4..be2d2da9cdba 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -684,6 +684,9 @@ static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
684 else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) 684 else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
685 rv = SS_NO_REMOTE_DISK; 685 rv = SS_NO_REMOTE_DISK;
686 686
687 else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
688 rv = SS_NO_UP_TO_DATE_DISK;
689
687 else if ((ns.conn == C_CONNECTED || 690 else if ((ns.conn == C_CONNECTED ||
688 ns.conn == C_WF_BITMAP_S || 691 ns.conn == C_WF_BITMAP_S ||
689 ns.conn == C_SYNC_SOURCE || 692 ns.conn == C_SYNC_SOURCE ||
@@ -840,7 +843,12 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
840 break; 843 break;
841 case C_WF_BITMAP_S: 844 case C_WF_BITMAP_S:
842 case C_PAUSED_SYNC_S: 845 case C_PAUSED_SYNC_S:
843 ns.pdsk = D_OUTDATED; 846 /* remap any consistent state to D_OUTDATED,
847 * but disallow "upgrade" of not even consistent states.
848 */
849 ns.pdsk =
850 (D_DISKLESS < os.pdsk && os.pdsk < D_OUTDATED)
851 ? os.pdsk : D_OUTDATED;
844 break; 852 break;
845 case C_SYNC_SOURCE: 853 case C_SYNC_SOURCE:
846 ns.pdsk = D_INCONSISTENT; 854 ns.pdsk = D_INCONSISTENT;
@@ -1205,21 +1213,20 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1205 && (ns.pdsk < D_INCONSISTENT || 1213 && (ns.pdsk < D_INCONSISTENT ||
1206 ns.pdsk == D_UNKNOWN || 1214 ns.pdsk == D_UNKNOWN ||
1207 ns.pdsk == D_OUTDATED)) { 1215 ns.pdsk == D_OUTDATED)) {
1208 kfree(mdev->p_uuid);
1209 mdev->p_uuid = NULL;
1210 if (get_ldev(mdev)) { 1216 if (get_ldev(mdev)) {
1211 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && 1217 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1212 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { 1218 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE &&
1213 drbd_uuid_new_current(mdev); 1219 !atomic_read(&mdev->new_c_uuid))
1214 drbd_send_uuids(mdev); 1220 atomic_set(&mdev->new_c_uuid, 2);
1215 }
1216 put_ldev(mdev); 1221 put_ldev(mdev);
1217 } 1222 }
1218 } 1223 }
1219 1224
1220 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { 1225 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1221 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) 1226 /* Diskless peer becomes primary or got connected to a diskless, primary peer. */
1222 drbd_uuid_new_current(mdev); 1227 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0 &&
1228 !atomic_read(&mdev->new_c_uuid))
1229 atomic_set(&mdev->new_c_uuid, 2);
1223 1230
1224 /* D_DISKLESS Peer becomes secondary */ 1231 /* D_DISKLESS Peer becomes secondary */
1225 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) 1232 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
@@ -1232,7 +1239,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1232 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { 1239 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1233 kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */ 1240 kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */
1234 mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */ 1241 mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */
1235 drbd_send_sizes(mdev, 0); /* to start sync... */ 1242 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
1236 drbd_send_uuids(mdev); 1243 drbd_send_uuids(mdev);
1237 drbd_send_state(mdev); 1244 drbd_send_state(mdev);
1238 } 1245 }
@@ -1343,6 +1350,24 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1343 drbd_md_sync(mdev); 1350 drbd_md_sync(mdev);
1344} 1351}
1345 1352
1353static int w_new_current_uuid(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1354{
1355 if (get_ldev(mdev)) {
1356 if (mdev->ldev->md.uuid[UI_BITMAP] == 0) {
1357 drbd_uuid_new_current(mdev);
1358 if (get_net_conf(mdev)) {
1359 drbd_send_uuids(mdev);
1360 put_net_conf(mdev);
1361 }
1362 drbd_md_sync(mdev);
1363 }
1364 put_ldev(mdev);
1365 }
1366 atomic_dec(&mdev->new_c_uuid);
1367 wake_up(&mdev->misc_wait);
1368
1369 return 1;
1370}
1346 1371
1347static int drbd_thread_setup(void *arg) 1372static int drbd_thread_setup(void *arg)
1348{ 1373{
@@ -1755,7 +1780,7 @@ int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
1755 (struct p_header *)&p, sizeof(p)); 1780 (struct p_header *)&p, sizeof(p));
1756} 1781}
1757 1782
1758int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply) 1783int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
1759{ 1784{
1760 struct p_sizes p; 1785 struct p_sizes p;
1761 sector_t d_size, u_size; 1786 sector_t d_size, u_size;
@@ -1767,7 +1792,6 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply)
1767 d_size = drbd_get_max_capacity(mdev->ldev); 1792 d_size = drbd_get_max_capacity(mdev->ldev);
1768 u_size = mdev->ldev->dc.disk_size; 1793 u_size = mdev->ldev->dc.disk_size;
1769 q_order_type = drbd_queue_order_type(mdev); 1794 q_order_type = drbd_queue_order_type(mdev);
1770 p.queue_order_type = cpu_to_be32(drbd_queue_order_type(mdev));
1771 put_ldev(mdev); 1795 put_ldev(mdev);
1772 } else { 1796 } else {
1773 d_size = 0; 1797 d_size = 0;
@@ -1779,7 +1803,8 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply)
1779 p.u_size = cpu_to_be64(u_size); 1803 p.u_size = cpu_to_be64(u_size);
1780 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); 1804 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
1781 p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue)); 1805 p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue));
1782 p.queue_order_type = cpu_to_be32(q_order_type); 1806 p.queue_order_type = cpu_to_be16(q_order_type);
1807 p.dds_flags = cpu_to_be16(flags);
1783 1808
1784 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, 1809 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
1785 (struct p_header *)&p, sizeof(p)); 1810 (struct p_header *)&p, sizeof(p));
@@ -2180,6 +2205,43 @@ int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2180 return ok; 2205 return ok;
2181} 2206}
2182 2207
2208static int drbd_send_delay_probe(struct drbd_conf *mdev, struct drbd_socket *ds)
2209{
2210 struct p_delay_probe dp;
2211 int offset, ok = 0;
2212 struct timeval now;
2213
2214 mutex_lock(&ds->mutex);
2215 if (likely(ds->socket)) {
2216 do_gettimeofday(&now);
2217 offset = now.tv_usec - mdev->dps_time.tv_usec +
2218 (now.tv_sec - mdev->dps_time.tv_sec) * 1000000;
2219 dp.seq_num = cpu_to_be32(mdev->delay_seq);
2220 dp.offset = cpu_to_be32(offset);
2221
2222 ok = _drbd_send_cmd(mdev, ds->socket, P_DELAY_PROBE,
2223 (struct p_header *)&dp, sizeof(dp), 0);
2224 }
2225 mutex_unlock(&ds->mutex);
2226
2227 return ok;
2228}
2229
2230static int drbd_send_delay_probes(struct drbd_conf *mdev)
2231{
2232 int ok;
2233
2234 mdev->delay_seq++;
2235 do_gettimeofday(&mdev->dps_time);
2236 ok = drbd_send_delay_probe(mdev, &mdev->meta);
2237 ok = ok && drbd_send_delay_probe(mdev, &mdev->data);
2238
2239 mdev->dp_volume_last = mdev->send_cnt;
2240 mod_timer(&mdev->delay_probe_timer, jiffies + mdev->sync_conf.dp_interval * HZ / 10);
2241
2242 return ok;
2243}
2244
2183/* called on sndtimeo 2245/* called on sndtimeo
2184 * returns FALSE if we should retry, 2246 * returns FALSE if we should retry,
2185 * TRUE if we think connection is dead 2247 * TRUE if we think connection is dead
@@ -2309,6 +2371,44 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2309 return 1; 2371 return 1;
2310} 2372}
2311 2373
2374static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2375{
2376 struct page *page = e->pages;
2377 unsigned len = e->size;
2378 page_chain_for_each(page) {
2379 unsigned l = min_t(unsigned, len, PAGE_SIZE);
2380 if (!_drbd_send_page(mdev, page, 0, l))
2381 return 0;
2382 len -= l;
2383 }
2384 return 1;
2385}
2386
2387static void consider_delay_probes(struct drbd_conf *mdev)
2388{
2389 if (mdev->state.conn != C_SYNC_SOURCE || mdev->agreed_pro_version < 93)
2390 return;
2391
2392 if (mdev->dp_volume_last + mdev->sync_conf.dp_volume * 2 < mdev->send_cnt)
2393 drbd_send_delay_probes(mdev);
2394}
2395
2396static int w_delay_probes(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
2397{
2398 if (!cancel && mdev->state.conn == C_SYNC_SOURCE)
2399 drbd_send_delay_probes(mdev);
2400
2401 return 1;
2402}
2403
2404static void delay_probe_timer_fn(unsigned long data)
2405{
2406 struct drbd_conf *mdev = (struct drbd_conf *) data;
2407
2408 if (list_empty(&mdev->delay_probe_work.list))
2409 drbd_queue_work(&mdev->data.work, &mdev->delay_probe_work);
2410}
2411
2312/* Used to send write requests 2412/* Used to send write requests
2313 * R_PRIMARY -> Peer (P_DATA) 2413 * R_PRIMARY -> Peer (P_DATA)
2314 */ 2414 */
@@ -2360,7 +2460,7 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2360 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE)); 2460 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE));
2361 if (ok && dgs) { 2461 if (ok && dgs) {
2362 dgb = mdev->int_dig_out; 2462 dgb = mdev->int_dig_out;
2363 drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb); 2463 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2364 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); 2464 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
2365 } 2465 }
2366 if (ok) { 2466 if (ok) {
@@ -2371,6 +2471,10 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2371 } 2471 }
2372 2472
2373 drbd_put_data_sock(mdev); 2473 drbd_put_data_sock(mdev);
2474
2475 if (ok)
2476 consider_delay_probes(mdev);
2477
2374 return ok; 2478 return ok;
2375} 2479}
2376 2480
@@ -2409,13 +2513,17 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2409 sizeof(p), MSG_MORE); 2513 sizeof(p), MSG_MORE);
2410 if (ok && dgs) { 2514 if (ok && dgs) {
2411 dgb = mdev->int_dig_out; 2515 dgb = mdev->int_dig_out;
2412 drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb); 2516 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
2413 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); 2517 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
2414 } 2518 }
2415 if (ok) 2519 if (ok)
2416 ok = _drbd_send_zc_bio(mdev, e->private_bio); 2520 ok = _drbd_send_zc_ee(mdev, e);
2417 2521
2418 drbd_put_data_sock(mdev); 2522 drbd_put_data_sock(mdev);
2523
2524 if (ok)
2525 consider_delay_probes(mdev);
2526
2419 return ok; 2527 return ok;
2420} 2528}
2421 2529
@@ -2600,6 +2708,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
2600 atomic_set(&mdev->net_cnt, 0); 2708 atomic_set(&mdev->net_cnt, 0);
2601 atomic_set(&mdev->packet_seq, 0); 2709 atomic_set(&mdev->packet_seq, 0);
2602 atomic_set(&mdev->pp_in_use, 0); 2710 atomic_set(&mdev->pp_in_use, 0);
2711 atomic_set(&mdev->new_c_uuid, 0);
2603 2712
2604 mutex_init(&mdev->md_io_mutex); 2713 mutex_init(&mdev->md_io_mutex);
2605 mutex_init(&mdev->data.mutex); 2714 mutex_init(&mdev->data.mutex);
@@ -2628,16 +2737,26 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
2628 INIT_LIST_HEAD(&mdev->unplug_work.list); 2737 INIT_LIST_HEAD(&mdev->unplug_work.list);
2629 INIT_LIST_HEAD(&mdev->md_sync_work.list); 2738 INIT_LIST_HEAD(&mdev->md_sync_work.list);
2630 INIT_LIST_HEAD(&mdev->bm_io_work.w.list); 2739 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
2740 INIT_LIST_HEAD(&mdev->delay_probes);
2741 INIT_LIST_HEAD(&mdev->delay_probe_work.list);
2742 INIT_LIST_HEAD(&mdev->uuid_work.list);
2743
2631 mdev->resync_work.cb = w_resync_inactive; 2744 mdev->resync_work.cb = w_resync_inactive;
2632 mdev->unplug_work.cb = w_send_write_hint; 2745 mdev->unplug_work.cb = w_send_write_hint;
2633 mdev->md_sync_work.cb = w_md_sync; 2746 mdev->md_sync_work.cb = w_md_sync;
2634 mdev->bm_io_work.w.cb = w_bitmap_io; 2747 mdev->bm_io_work.w.cb = w_bitmap_io;
2748 mdev->delay_probe_work.cb = w_delay_probes;
2749 mdev->uuid_work.cb = w_new_current_uuid;
2635 init_timer(&mdev->resync_timer); 2750 init_timer(&mdev->resync_timer);
2636 init_timer(&mdev->md_sync_timer); 2751 init_timer(&mdev->md_sync_timer);
2752 init_timer(&mdev->delay_probe_timer);
2637 mdev->resync_timer.function = resync_timer_fn; 2753 mdev->resync_timer.function = resync_timer_fn;
2638 mdev->resync_timer.data = (unsigned long) mdev; 2754 mdev->resync_timer.data = (unsigned long) mdev;
2639 mdev->md_sync_timer.function = md_sync_timer_fn; 2755 mdev->md_sync_timer.function = md_sync_timer_fn;
2640 mdev->md_sync_timer.data = (unsigned long) mdev; 2756 mdev->md_sync_timer.data = (unsigned long) mdev;
2757 mdev->delay_probe_timer.function = delay_probe_timer_fn;
2758 mdev->delay_probe_timer.data = (unsigned long) mdev;
2759
2641 2760
2642 init_waitqueue_head(&mdev->misc_wait); 2761 init_waitqueue_head(&mdev->misc_wait);
2643 init_waitqueue_head(&mdev->state_wait); 2762 init_waitqueue_head(&mdev->state_wait);
@@ -2680,7 +2799,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
2680 drbd_set_my_capacity(mdev, 0); 2799 drbd_set_my_capacity(mdev, 0);
2681 if (mdev->bitmap) { 2800 if (mdev->bitmap) {
2682 /* maybe never allocated. */ 2801 /* maybe never allocated. */
2683 drbd_bm_resize(mdev, 0); 2802 drbd_bm_resize(mdev, 0, 1);
2684 drbd_bm_cleanup(mdev); 2803 drbd_bm_cleanup(mdev);
2685 } 2804 }
2686 2805
@@ -3129,7 +3248,7 @@ int __init drbd_init(void)
3129 if (err) 3248 if (err)
3130 goto Enomem; 3249 goto Enomem;
3131 3250
3132 drbd_proc = proc_create("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops); 3251 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
3133 if (!drbd_proc) { 3252 if (!drbd_proc) {
3134 printk(KERN_ERR "drbd: unable to register proc file\n"); 3253 printk(KERN_ERR "drbd: unable to register proc file\n");
3135 goto Enomem; 3254 goto Enomem;
@@ -3660,7 +3779,8 @@ _drbd_fault_str(unsigned int type) {
3660 [DRBD_FAULT_DT_RD] = "Data read", 3779 [DRBD_FAULT_DT_RD] = "Data read",
3661 [DRBD_FAULT_DT_RA] = "Data read ahead", 3780 [DRBD_FAULT_DT_RA] = "Data read ahead",
3662 [DRBD_FAULT_BM_ALLOC] = "BM allocation", 3781 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
3663 [DRBD_FAULT_AL_EE] = "EE allocation" 3782 [DRBD_FAULT_AL_EE] = "EE allocation",
3783 [DRBD_FAULT_RECEIVE] = "receive data corruption",
3664 }; 3784 };
3665 3785
3666 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**"; 3786 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
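
The delay-probe support added above timestamps each probe with do_gettimeofday() and derives the elapsed time as the difference of two struct timeval samples in microseconds. A hypothetical userspace sketch of that arithmetic (the helper name is made up; drbd_send_delay_probe() does the same computation inline):

    #include <stdio.h>
    #include <sys/time.h>

    /* elapsed microseconds between two gettimeofday() samples,
     * matching the 'offset' computed in drbd_send_delay_probe() */
    static long usec_between(const struct timeval *start, const struct timeval *now)
    {
        return (now->tv_sec - start->tv_sec) * 1000000L
               + (now->tv_usec - start->tv_usec);
    }

    int main(void)
    {
        struct timeval start, now;

        gettimeofday(&start, NULL);
        gettimeofday(&now, NULL);
        printf("offset: %ld us\n", usec_between(&start, &now));
        return 0;
    }

consider_delay_probes() then only sends a new probe pair once enough data has gone out since the last one (dp_volume_last + dp_volume * 2 < send_cnt), so probes stay cheap relative to the resync traffic they measure.
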
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 6429d2b19e06..632e3245d1bb 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -510,7 +510,7 @@ void drbd_resume_io(struct drbd_conf *mdev)
510 * Returns 0 on success, negative return values indicate errors. 510 * Returns 0 on success, negative return values indicate errors.
511 * You should call drbd_md_sync() after calling this function. 511 * You should call drbd_md_sync() after calling this function.
512 */ 512 */
513enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, int force) __must_hold(local) 513enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local)
514{ 514{
515 sector_t prev_first_sect, prev_size; /* previous meta location */ 515 sector_t prev_first_sect, prev_size; /* previous meta location */
516 sector_t la_size; 516 sector_t la_size;
@@ -541,12 +541,12 @@ enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, int force
541 /* TODO: should only be some assert here, not (re)init... */ 541 /* TODO: should only be some assert here, not (re)init... */
542 drbd_md_set_sector_offsets(mdev, mdev->ldev); 542 drbd_md_set_sector_offsets(mdev, mdev->ldev);
543 543
544 size = drbd_new_dev_size(mdev, mdev->ldev, force); 544 size = drbd_new_dev_size(mdev, mdev->ldev, flags & DDSF_FORCED);
545 545
546 if (drbd_get_capacity(mdev->this_bdev) != size || 546 if (drbd_get_capacity(mdev->this_bdev) != size ||
547 drbd_bm_capacity(mdev) != size) { 547 drbd_bm_capacity(mdev) != size) {
548 int err; 548 int err;
549 err = drbd_bm_resize(mdev, size); 549 err = drbd_bm_resize(mdev, size, !(flags & DDSF_NO_RESYNC));
550 if (unlikely(err)) { 550 if (unlikely(err)) {
551 /* currently there is only one error: ENOMEM! */ 551 /* currently there is only one error: ENOMEM! */
552 size = drbd_bm_capacity(mdev)>>1; 552 size = drbd_bm_capacity(mdev)>>1;
@@ -704,9 +704,6 @@ void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __mu
704 struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; 704 struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
705 int max_segments = mdev->ldev->dc.max_bio_bvecs; 705 int max_segments = mdev->ldev->dc.max_bio_bvecs;
706 706
707 if (b->merge_bvec_fn && !mdev->ldev->dc.use_bmbv)
708 max_seg_s = PAGE_SIZE;
709
710 max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s); 707 max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s);
711 708
712 blk_queue_max_hw_sectors(q, max_seg_s >> 9); 709 blk_queue_max_hw_sectors(q, max_seg_s >> 9);
@@ -1199,13 +1196,12 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1199 } 1196 }
1200 1197
1201 /* allocation not in the IO path, cqueue thread context */ 1198 /* allocation not in the IO path, cqueue thread context */
1202 new_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL); 1199 new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
1203 if (!new_conf) { 1200 if (!new_conf) {
1204 retcode = ERR_NOMEM; 1201 retcode = ERR_NOMEM;
1205 goto fail; 1202 goto fail;
1206 } 1203 }
1207 1204
1208 memset(new_conf, 0, sizeof(struct net_conf));
1209 new_conf->timeout = DRBD_TIMEOUT_DEF; 1205 new_conf->timeout = DRBD_TIMEOUT_DEF;
1210 new_conf->try_connect_int = DRBD_CONNECT_INT_DEF; 1206 new_conf->try_connect_int = DRBD_CONNECT_INT_DEF;
1211 new_conf->ping_int = DRBD_PING_INT_DEF; 1207 new_conf->ping_int = DRBD_PING_INT_DEF;
@@ -1477,8 +1473,8 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1477{ 1473{
1478 struct resize rs; 1474 struct resize rs;
1479 int retcode = NO_ERROR; 1475 int retcode = NO_ERROR;
1480 int ldsc = 0; /* local disk size changed */
1481 enum determine_dev_size dd; 1476 enum determine_dev_size dd;
1477 enum dds_flags ddsf;
1482 1478
1483 memset(&rs, 0, sizeof(struct resize)); 1479 memset(&rs, 0, sizeof(struct resize));
1484 if (!resize_from_tags(mdev, nlp->tag_list, &rs)) { 1480 if (!resize_from_tags(mdev, nlp->tag_list, &rs)) {
@@ -1502,13 +1498,17 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1502 goto fail; 1498 goto fail;
1503 } 1499 }
1504 1500
1505 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { 1501 if (rs.no_resync && mdev->agreed_pro_version < 93) {
1506 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); 1502 retcode = ERR_NEED_APV_93;
1507 ldsc = 1; 1503 goto fail;
1508 } 1504 }
1509 1505
1506 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev))
1507 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
1508
1510 mdev->ldev->dc.disk_size = (sector_t)rs.resize_size; 1509 mdev->ldev->dc.disk_size = (sector_t)rs.resize_size;
1511 dd = drbd_determin_dev_size(mdev, rs.resize_force); 1510 ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
1511 dd = drbd_determin_dev_size(mdev, ddsf);
1512 drbd_md_sync(mdev); 1512 drbd_md_sync(mdev);
1513 put_ldev(mdev); 1513 put_ldev(mdev);
1514 if (dd == dev_size_error) { 1514 if (dd == dev_size_error) {
@@ -1516,12 +1516,12 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1516 goto fail; 1516 goto fail;
1517 } 1517 }
1518 1518
1519 if (mdev->state.conn == C_CONNECTED && (dd != unchanged || ldsc)) { 1519 if (mdev->state.conn == C_CONNECTED) {
1520 if (dd == grew) 1520 if (dd == grew)
1521 set_bit(RESIZE_PENDING, &mdev->flags); 1521 set_bit(RESIZE_PENDING, &mdev->flags);
1522 1522
1523 drbd_send_uuids(mdev); 1523 drbd_send_uuids(mdev);
1524 drbd_send_sizes(mdev, 1); 1524 drbd_send_sizes(mdev, 1, ddsf);
1525 } 1525 }
1526 1526
1527 fail: 1527 fail:
@@ -1551,6 +1551,10 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
1551 sc.rate = DRBD_RATE_DEF; 1551 sc.rate = DRBD_RATE_DEF;
1552 sc.after = DRBD_AFTER_DEF; 1552 sc.after = DRBD_AFTER_DEF;
1553 sc.al_extents = DRBD_AL_EXTENTS_DEF; 1553 sc.al_extents = DRBD_AL_EXTENTS_DEF;
1554 sc.dp_volume = DRBD_DP_VOLUME_DEF;
1555 sc.dp_interval = DRBD_DP_INTERVAL_DEF;
1556 sc.throttle_th = DRBD_RS_THROTTLE_TH_DEF;
1557 sc.hold_off_th = DRBD_RS_HOLD_OFF_TH_DEF;
1554 } else 1558 } else
1555 memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf)); 1559 memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
1556 1560
@@ -2207,9 +2211,9 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
2207{ 2211{
2208 struct cn_msg *cn_reply; 2212 struct cn_msg *cn_reply;
2209 struct drbd_nl_cfg_reply *reply; 2213 struct drbd_nl_cfg_reply *reply;
2210 struct bio_vec *bvec;
2211 unsigned short *tl; 2214 unsigned short *tl;
2212 int i; 2215 struct page *page;
2216 unsigned len;
2213 2217
2214 if (!e) 2218 if (!e)
2215 return; 2219 return;
@@ -2247,11 +2251,15 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
2247 put_unaligned(T_ee_data, tl++); 2251 put_unaligned(T_ee_data, tl++);
2248 put_unaligned(e->size, tl++); 2252 put_unaligned(e->size, tl++);
2249 2253
2250 __bio_for_each_segment(bvec, e->private_bio, i, 0) { 2254 len = e->size;
2251 void *d = kmap(bvec->bv_page); 2255 page = e->pages;
2252 memcpy(tl, d + bvec->bv_offset, bvec->bv_len); 2256 page_chain_for_each(page) {
2253 kunmap(bvec->bv_page); 2257 void *d = kmap_atomic(page, KM_USER0);
2254 tl=(unsigned short*)((char*)tl + bvec->bv_len); 2258 unsigned l = min_t(unsigned, len, PAGE_SIZE);
2259 memcpy(tl, d, l);
2260 kunmap_atomic(d, KM_USER0);
2261 tl = (unsigned short*)((char*)tl + l);
2262 len -= l;
2255 } 2263 }
2256 put_unaligned(TT_END, tl++); /* Close the tag list */ 2264 put_unaligned(TT_END, tl++); /* Close the tag list */
2257 2265
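
drbd_bcast_ee() now walks e->pages with page_chain_for_each(), mapping one page at a time with kmap_atomic() and copying at most PAGE_SIZE bytes per step until e->size is exhausted. A small userspace sketch of the same chunked copy over a linked chain of fixed-size buffers (names and sizes are invented for illustration):

    #include <stdio.h>
    #include <string.h>

    #define CHUNK 8   /* stand-in for PAGE_SIZE */

    struct chunk {
        struct chunk *next;
        char data[CHUNK];
    };

    /* copy 'len' bytes out of a chunk chain, clamping each copy to CHUNK bytes,
     * the same way drbd_bcast_ee() clamps to PAGE_SIZE above */
    static void chain_copy(char *dst, const struct chunk *c, size_t len)
    {
        for (; c && len; c = c->next) {
            size_t l = len < CHUNK ? len : CHUNK;
            memcpy(dst, c->data, l);
            dst += l;
            len -= l;
        }
    }

    int main(void)
    {
        struct chunk b = { NULL, "89AB" };
        struct chunk a = { &b, "01234567" };
        char out[16] = { 0 };

        chain_copy(out, &a, 12);
        printf("%s\n", out);   /* 0123456789AB */
        return 0;
    }
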
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index be3374b68460..d0f1767ea4c3 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -73,14 +73,21 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
73 seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10); 73 seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10);
74 /* if more than 1 GB display in MB */ 74 /* if more than 1 GB display in MB */
75 if (mdev->rs_total > 0x100000L) 75 if (mdev->rs_total > 0x100000L)
76 seq_printf(seq, "(%lu/%lu)M\n\t", 76 seq_printf(seq, "(%lu/%lu)M",
77 (unsigned long) Bit2KB(rs_left >> 10), 77 (unsigned long) Bit2KB(rs_left >> 10),
78 (unsigned long) Bit2KB(mdev->rs_total >> 10)); 78 (unsigned long) Bit2KB(mdev->rs_total >> 10));
79 else 79 else
80 seq_printf(seq, "(%lu/%lu)K\n\t", 80 seq_printf(seq, "(%lu/%lu)K",
81 (unsigned long) Bit2KB(rs_left), 81 (unsigned long) Bit2KB(rs_left),
82 (unsigned long) Bit2KB(mdev->rs_total)); 82 (unsigned long) Bit2KB(mdev->rs_total));
83 83
84 if (mdev->state.conn == C_SYNC_TARGET)
85 seq_printf(seq, " queue_delay: %d.%d ms\n\t",
86 mdev->data_delay / 1000,
87 (mdev->data_delay % 1000) / 100);
88 else if (mdev->state.conn == C_SYNC_SOURCE)
89 seq_printf(seq, " delay_probe: %u\n\t", mdev->delay_seq);
90
84 /* see drivers/md/md.c 91 /* see drivers/md/md.c
85 * We do not want to overflow, so the order of operands and 92 * We do not want to overflow, so the order of operands and
86 * the * 100 / 100 trick are important. We do a +1 to be 93 * the * 100 / 100 trick are important. We do a +1 to be
@@ -128,6 +135,14 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
128 else 135 else
129 seq_printf(seq, " (%ld)", dbdt); 136 seq_printf(seq, " (%ld)", dbdt);
130 137
138 if (mdev->state.conn == C_SYNC_TARGET) {
139 if (mdev->c_sync_rate > 1000)
140 seq_printf(seq, " want: %d,%03d",
141 mdev->c_sync_rate / 1000, mdev->c_sync_rate % 1000);
142 else
143 seq_printf(seq, " want: %d", mdev->c_sync_rate);
144 }
145
131 seq_printf(seq, " K/sec\n"); 146 seq_printf(seq, " K/sec\n");
132} 147}
133 148
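
The /proc output above prints the measured queue delay, which is kept in microseconds, as milliseconds with one decimal digit using only integer arithmetic. A minimal sketch of that formatting, assuming a plain unsigned microsecond value:

    #include <stdio.h>

    /* print a microsecond value as "X.Y ms" without floating point,
     * mirroring the seq_printf() in drbd_syncer_progress() */
    static void print_delay_ms(unsigned int usec)
    {
        printf("queue_delay: %u.%u ms\n", usec / 1000, (usec % 1000) / 100);
    }

    int main(void)
    {
        print_delay_ms(12345);   /* prints "queue_delay: 12.3 ms" */
        return 0;
    }
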
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 3f096e7959b4..bc9ab7fb2cc7 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -80,30 +80,128 @@ static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epo
80 80
81#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) 81#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
82 82
83static struct page *drbd_pp_first_page_or_try_alloc(struct drbd_conf *mdev) 83/*
84 * some helper functions to deal with single linked page lists,
85 * page->private being our "next" pointer.
86 */
87
88/* If at least n pages are linked at head, get n pages off.
89 * Otherwise, don't modify head, and return NULL.
90 * Locking is the responsibility of the caller.
91 */
92static struct page *page_chain_del(struct page **head, int n)
93{
94 struct page *page;
95 struct page *tmp;
96
97 BUG_ON(!n);
98 BUG_ON(!head);
99
100 page = *head;
101
102 if (!page)
103 return NULL;
104
105 while (page) {
106 tmp = page_chain_next(page);
107 if (--n == 0)
108 break; /* found sufficient pages */
109 if (tmp == NULL)
110 /* insufficient pages, don't use any of them. */
111 return NULL;
112 page = tmp;
113 }
114
115 /* add end of list marker for the returned list */
116 set_page_private(page, 0);
117 /* actual return value, and adjustment of head */
118 page = *head;
119 *head = tmp;
120 return page;
121}
122
123/* may be used outside of locks to find the tail of a (usually short)
124 * "private" page chain, before adding it back to a global chain head
125 * with page_chain_add() under a spinlock. */
126static struct page *page_chain_tail(struct page *page, int *len)
127{
128 struct page *tmp;
129 int i = 1;
130 while ((tmp = page_chain_next(page)))
131 ++i, page = tmp;
132 if (len)
133 *len = i;
134 return page;
135}
136
137static int page_chain_free(struct page *page)
138{
139 struct page *tmp;
140 int i = 0;
141 page_chain_for_each_safe(page, tmp) {
142 put_page(page);
143 ++i;
144 }
145 return i;
146}
147
148static void page_chain_add(struct page **head,
149 struct page *chain_first, struct page *chain_last)
150{
151#if 1
152 struct page *tmp;
153 tmp = page_chain_tail(chain_first, NULL);
154 BUG_ON(tmp != chain_last);
155#endif
156
157 /* add chain to head */
158 set_page_private(chain_last, (unsigned long)*head);
159 *head = chain_first;
160}
161
162static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int number)
84{ 163{
85 struct page *page = NULL; 164 struct page *page = NULL;
165 struct page *tmp = NULL;
166 int i = 0;
86 167
87 /* Yes, testing drbd_pp_vacant outside the lock is racy. 168 /* Yes, testing drbd_pp_vacant outside the lock is racy.
88 * So what. It saves a spin_lock. */ 169 * So what. It saves a spin_lock. */
89 if (drbd_pp_vacant > 0) { 170 if (drbd_pp_vacant >= number) {
90 spin_lock(&drbd_pp_lock); 171 spin_lock(&drbd_pp_lock);
91 page = drbd_pp_pool; 172 page = page_chain_del(&drbd_pp_pool, number);
92 if (page) { 173 if (page)
93 drbd_pp_pool = (struct page *)page_private(page); 174 drbd_pp_vacant -= number;
94 set_page_private(page, 0); /* just to be polite */
95 drbd_pp_vacant--;
96 }
97 spin_unlock(&drbd_pp_lock); 175 spin_unlock(&drbd_pp_lock);
176 if (page)
177 return page;
98 } 178 }
179
99 /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD 180 /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
100 * "criss-cross" setup, that might cause write-out on some other DRBD, 181 * "criss-cross" setup, that might cause write-out on some other DRBD,
101 * which in turn might block on the other node at this very place. */ 182 * which in turn might block on the other node at this very place. */
102 if (!page) 183 for (i = 0; i < number; i++) {
103 page = alloc_page(GFP_TRY); 184 tmp = alloc_page(GFP_TRY);
104 if (page) 185 if (!tmp)
105 atomic_inc(&mdev->pp_in_use); 186 break;
106 return page; 187 set_page_private(tmp, (unsigned long)page);
188 page = tmp;
189 }
190
191 if (i == number)
192 return page;
193
194 /* Not enough pages immediately available this time.
195 * No need to jump around here, drbd_pp_alloc will retry this
196 * function "soon". */
197 if (page) {
198 tmp = page_chain_tail(page, NULL);
199 spin_lock(&drbd_pp_lock);
200 page_chain_add(&drbd_pp_pool, page, tmp);
201 drbd_pp_vacant += i;
202 spin_unlock(&drbd_pp_lock);
203 }
204 return NULL;
107} 205}
108 206
109/* kick lower level device, if we have more than (arbitrary number) 207/* kick lower level device, if we have more than (arbitrary number)
@@ -127,7 +225,7 @@ static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed
127 225
128 list_for_each_safe(le, tle, &mdev->net_ee) { 226 list_for_each_safe(le, tle, &mdev->net_ee) {
129 e = list_entry(le, struct drbd_epoch_entry, w.list); 227 e = list_entry(le, struct drbd_epoch_entry, w.list);
130 if (drbd_bio_has_active_page(e->private_bio)) 228 if (drbd_ee_has_active_page(e))
131 break; 229 break;
132 list_move(le, to_be_freed); 230 list_move(le, to_be_freed);
133 } 231 }
@@ -148,32 +246,34 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
148} 246}
149 247
150/** 248/**
151 * drbd_pp_alloc() - Returns a page, fails only if a signal comes in 249 * drbd_pp_alloc() - Returns @number pages, retries forever (or until signalled)
152 * @mdev: DRBD device. 250 * @mdev: DRBD device.
153 * @retry: whether or not to retry allocation forever (or until signalled) 251 * @number: number of pages requested
252 * @retry: whether to retry, if not enough pages are available right now
253 *
254 * Tries to allocate number pages, first from our own page pool, then from
255 * the kernel, unless this allocation would exceed the max_buffers setting.
256 * Possibly retry until DRBD frees sufficient pages somewhere else.
154 * 257 *
155 * Tries to allocate a page, first from our own page pool, then from the 258 * Returns a page chain linked via page->private.
156 * kernel, unless this allocation would exceed the max_buffers setting.
157 * If @retry is non-zero, retry until DRBD frees a page somewhere else.
158 */ 259 */
159static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry) 260static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool retry)
160{ 261{
161 struct page *page = NULL; 262 struct page *page = NULL;
162 DEFINE_WAIT(wait); 263 DEFINE_WAIT(wait);
163 264
164 if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { 265 /* Yes, we may run up to @number over max_buffers. If we
165 page = drbd_pp_first_page_or_try_alloc(mdev); 266 * follow it strictly, the admin will get it wrong anyways. */
166 if (page) 267 if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers)
167 return page; 268 page = drbd_pp_first_pages_or_try_alloc(mdev, number);
168 }
169 269
170 for (;;) { 270 while (page == NULL) {
171 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); 271 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
172 272
173 drbd_kick_lo_and_reclaim_net(mdev); 273 drbd_kick_lo_and_reclaim_net(mdev);
174 274
175 if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { 275 if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
176 page = drbd_pp_first_page_or_try_alloc(mdev); 276 page = drbd_pp_first_pages_or_try_alloc(mdev, number);
177 if (page) 277 if (page)
178 break; 278 break;
179 } 279 }
@@ -190,62 +290,32 @@ static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry)
190 } 290 }
191 finish_wait(&drbd_pp_wait, &wait); 291 finish_wait(&drbd_pp_wait, &wait);
192 292
293 if (page)
294 atomic_add(number, &mdev->pp_in_use);
193 return page; 295 return page;
194} 296}
195 297
196/* Must not be used from irq, as that may deadlock: see drbd_pp_alloc. 298/* Must not be used from irq, as that may deadlock: see drbd_pp_alloc.
197 * Is also used from inside another spin_lock_irq(&mdev->req_lock) */ 299 * Is also used from inside another spin_lock_irq(&mdev->req_lock);
300 * Either links the page chain back to the global pool,
301 * or returns all pages to the system. */
198static void drbd_pp_free(struct drbd_conf *mdev, struct page *page) 302static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
199{ 303{
200 int free_it;
201
202 spin_lock(&drbd_pp_lock);
203 if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) {
204 free_it = 1;
205 } else {
206 set_page_private(page, (unsigned long)drbd_pp_pool);
207 drbd_pp_pool = page;
208 drbd_pp_vacant++;
209 free_it = 0;
210 }
211 spin_unlock(&drbd_pp_lock);
212
213 atomic_dec(&mdev->pp_in_use);
214
215 if (free_it)
216 __free_page(page);
217
218 wake_up(&drbd_pp_wait);
219}
220
221static void drbd_pp_free_bio_pages(struct drbd_conf *mdev, struct bio *bio)
222{
223 struct page *p_to_be_freed = NULL;
224 struct page *page;
225 struct bio_vec *bvec;
226 int i; 304 int i;
227 305 if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count)
228 spin_lock(&drbd_pp_lock); 306 i = page_chain_free(page);
229 __bio_for_each_segment(bvec, bio, i, 0) { 307 else {
230 if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) { 308 struct page *tmp;
231 set_page_private(bvec->bv_page, (unsigned long)p_to_be_freed); 309 tmp = page_chain_tail(page, &i);
232 p_to_be_freed = bvec->bv_page; 310 spin_lock(&drbd_pp_lock);
233 } else { 311 page_chain_add(&drbd_pp_pool, page, tmp);
234 set_page_private(bvec->bv_page, (unsigned long)drbd_pp_pool); 312 drbd_pp_vacant += i;
235 drbd_pp_pool = bvec->bv_page; 313 spin_unlock(&drbd_pp_lock);
236 drbd_pp_vacant++;
237 }
238 }
239 spin_unlock(&drbd_pp_lock);
240 atomic_sub(bio->bi_vcnt, &mdev->pp_in_use);
241
242 while (p_to_be_freed) {
243 page = p_to_be_freed;
244 p_to_be_freed = (struct page *)page_private(page);
245 set_page_private(page, 0); /* just to be polite */
246 put_page(page);
247 } 314 }
248 315 atomic_sub(i, &mdev->pp_in_use);
316 i = atomic_read(&mdev->pp_in_use);
317 if (i < 0)
318 dev_warn(DEV, "ASSERTION FAILED: pp_in_use: %d < 0\n", i);
249 wake_up(&drbd_pp_wait); 319 wake_up(&drbd_pp_wait);
250} 320}
251 321
@@ -270,11 +340,9 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
270 unsigned int data_size, 340 unsigned int data_size,
271 gfp_t gfp_mask) __must_hold(local) 341 gfp_t gfp_mask) __must_hold(local)
272{ 342{
273 struct request_queue *q;
274 struct drbd_epoch_entry *e; 343 struct drbd_epoch_entry *e;
275 struct page *page; 344 struct page *page;
276 struct bio *bio; 345 unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
277 unsigned int ds;
278 346
279 if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE)) 347 if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE))
280 return NULL; 348 return NULL;
@@ -286,84 +354,32 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
286 return NULL; 354 return NULL;
287 } 355 }
288 356
289 bio = bio_alloc(gfp_mask & ~__GFP_HIGHMEM, div_ceil(data_size, PAGE_SIZE)); 357 page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT));
290 if (!bio) { 358 if (!page)
291 if (!(gfp_mask & __GFP_NOWARN)) 359 goto fail;
292 dev_err(DEV, "alloc_ee: Allocation of a bio failed\n");
293 goto fail1;
294 }
295
296 bio->bi_bdev = mdev->ldev->backing_bdev;
297 bio->bi_sector = sector;
298
299 ds = data_size;
300 while (ds) {
301 page = drbd_pp_alloc(mdev, (gfp_mask & __GFP_WAIT));
302 if (!page) {
303 if (!(gfp_mask & __GFP_NOWARN))
304 dev_err(DEV, "alloc_ee: Allocation of a page failed\n");
305 goto fail2;
306 }
307 if (!bio_add_page(bio, page, min_t(int, ds, PAGE_SIZE), 0)) {
308 drbd_pp_free(mdev, page);
309 dev_err(DEV, "alloc_ee: bio_add_page(s=%llu,"
310 "data_size=%u,ds=%u) failed\n",
311 (unsigned long long)sector, data_size, ds);
312
313 q = bdev_get_queue(bio->bi_bdev);
314 if (q->merge_bvec_fn) {
315 struct bvec_merge_data bvm = {
316 .bi_bdev = bio->bi_bdev,
317 .bi_sector = bio->bi_sector,
318 .bi_size = bio->bi_size,
319 .bi_rw = bio->bi_rw,
320 };
321 int l = q->merge_bvec_fn(q, &bvm,
322 &bio->bi_io_vec[bio->bi_vcnt]);
323 dev_err(DEV, "merge_bvec_fn() = %d\n", l);
324 }
325
326 /* dump more of the bio. */
327 dev_err(DEV, "bio->bi_max_vecs = %d\n", bio->bi_max_vecs);
328 dev_err(DEV, "bio->bi_vcnt = %d\n", bio->bi_vcnt);
329 dev_err(DEV, "bio->bi_size = %d\n", bio->bi_size);
330 dev_err(DEV, "bio->bi_phys_segments = %d\n", bio->bi_phys_segments);
331
332 goto fail2;
333 break;
334 }
335 ds -= min_t(int, ds, PAGE_SIZE);
336 }
337
338 D_ASSERT(data_size == bio->bi_size);
339
340 bio->bi_private = e;
341 e->mdev = mdev;
342 e->sector = sector;
343 e->size = bio->bi_size;
344 360
345 e->private_bio = bio;
346 e->block_id = id;
347 INIT_HLIST_NODE(&e->colision); 361 INIT_HLIST_NODE(&e->colision);
348 e->epoch = NULL; 362 e->epoch = NULL;
363 e->mdev = mdev;
364 e->pages = page;
365 atomic_set(&e->pending_bios, 0);
366 e->size = data_size;
349 e->flags = 0; 367 e->flags = 0;
368 e->sector = sector;
369 e->sector = sector;
370 e->block_id = id;
350 371
351 return e; 372 return e;
352 373
353 fail2: 374 fail:
354 drbd_pp_free_bio_pages(mdev, bio);
355 bio_put(bio);
356 fail1:
357 mempool_free(e, drbd_ee_mempool); 375 mempool_free(e, drbd_ee_mempool);
358
359 return NULL; 376 return NULL;
360} 377}
361 378
362void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) 379void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
363{ 380{
364 struct bio *bio = e->private_bio; 381 drbd_pp_free(mdev, e->pages);
365 drbd_pp_free_bio_pages(mdev, bio); 382 D_ASSERT(atomic_read(&e->pending_bios) == 0);
366 bio_put(bio);
367 D_ASSERT(hlist_unhashed(&e->colision)); 383 D_ASSERT(hlist_unhashed(&e->colision));
368 mempool_free(e, drbd_ee_mempool); 384 mempool_free(e, drbd_ee_mempool);
369} 385}
@@ -902,7 +918,7 @@ retry:
902 if (!drbd_send_protocol(mdev)) 918 if (!drbd_send_protocol(mdev))
903 return -1; 919 return -1;
904 drbd_send_sync_param(mdev, &mdev->sync_conf); 920 drbd_send_sync_param(mdev, &mdev->sync_conf);
905 drbd_send_sizes(mdev, 0); 921 drbd_send_sizes(mdev, 0, 0);
906 drbd_send_uuids(mdev); 922 drbd_send_uuids(mdev);
907 drbd_send_state(mdev); 923 drbd_send_state(mdev);
908 clear_bit(USE_DEGR_WFC_T, &mdev->flags); 924 clear_bit(USE_DEGR_WFC_T, &mdev->flags);
@@ -946,7 +962,8 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d
946 int rv; 962 int rv;
947 963
948 if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) { 964 if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
949 rv = blkdev_issue_flush(mdev->ldev->backing_bdev, NULL); 965 rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
966 NULL, BLKDEV_IFL_WAIT);
950 if (rv) { 967 if (rv) {
951 dev_err(DEV, "local disk flush failed with status %d\n", rv); 968 dev_err(DEV, "local disk flush failed with status %d\n", rv);
952 /* would rather check on EOPNOTSUPP, but that is not reliable. 969 /* would rather check on EOPNOTSUPP, but that is not reliable.
@@ -1120,6 +1137,101 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo)
1120} 1137}
1121 1138
1122/** 1139/**
1140 * drbd_submit_ee()
1141 * @mdev: DRBD device.
1142 * @e: epoch entry
1143 * @rw: flag field, see bio->bi_rw
1144 */
1145/* TODO allocate from our own bio_set. */
1146int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
1147 const unsigned rw, const int fault_type)
1148{
1149 struct bio *bios = NULL;
1150 struct bio *bio;
1151 struct page *page = e->pages;
1152 sector_t sector = e->sector;
1153 unsigned ds = e->size;
1154 unsigned n_bios = 0;
1155 unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
1156
1157 if (atomic_read(&mdev->new_c_uuid)) {
1158 if (atomic_add_unless(&mdev->new_c_uuid, -1, 1)) {
1159 drbd_uuid_new_current(mdev);
1160 drbd_md_sync(mdev);
1161
1162 atomic_dec(&mdev->new_c_uuid);
1163 wake_up(&mdev->misc_wait);
1164 }
1165 wait_event(mdev->misc_wait, !atomic_read(&mdev->new_c_uuid));
1166 }
1167
1168 /* In most cases, we will only need one bio. But in case the lower
1169 * level restrictions happen to be different at this offset on this
1170 * side than those of the sending peer, we may need to submit the
1171 * request in more than one bio. */
1172next_bio:
1173 bio = bio_alloc(GFP_NOIO, nr_pages);
1174 if (!bio) {
1175 dev_err(DEV, "submit_ee: Allocation of a bio failed\n");
1176 goto fail;
1177 }
1178 /* > e->sector, unless this is the first bio */
1179 bio->bi_sector = sector;
1180 bio->bi_bdev = mdev->ldev->backing_bdev;
1181 /* we special case some flags in the multi-bio case, see below
1182 * (BIO_RW_UNPLUG, BIO_RW_BARRIER) */
1183 bio->bi_rw = rw;
1184 bio->bi_private = e;
1185 bio->bi_end_io = drbd_endio_sec;
1186
1187 bio->bi_next = bios;
1188 bios = bio;
1189 ++n_bios;
1190
1191 page_chain_for_each(page) {
1192 unsigned len = min_t(unsigned, ds, PAGE_SIZE);
1193 if (!bio_add_page(bio, page, len, 0)) {
1194 /* a single page must always be possible! */
1195 BUG_ON(bio->bi_vcnt == 0);
1196 goto next_bio;
1197 }
1198 ds -= len;
1199 sector += len >> 9;
1200 --nr_pages;
1201 }
1202 D_ASSERT(page == NULL);
1203 D_ASSERT(ds == 0);
1204
1205 atomic_set(&e->pending_bios, n_bios);
1206 do {
1207 bio = bios;
1208 bios = bios->bi_next;
1209 bio->bi_next = NULL;
1210
1211 /* strip off BIO_RW_UNPLUG unless it is the last bio */
1212 if (bios)
1213 bio->bi_rw &= ~(1<<BIO_RW_UNPLUG);
1214
1215 drbd_generic_make_request(mdev, fault_type, bio);
1216
1217 /* strip off BIO_RW_BARRIER,
1218 * unless it is the first or last bio */
1219 if (bios && bios->bi_next)
1220 bios->bi_rw &= ~(1<<BIO_RW_BARRIER);
1221 } while (bios);
1222 maybe_kick_lo(mdev);
1223 return 0;
1224
1225fail:
1226 while (bios) {
1227 bio = bios;
1228 bios = bios->bi_next;
1229 bio_put(bio);
1230 }
1231 return -ENOMEM;
1232}
1233
1234/**
1123 * w_e_reissue() - Worker callback; Resubmit a bio, without BIO_RW_BARRIER set 1235 * w_e_reissue() - Worker callback; Resubmit a bio, without BIO_RW_BARRIER set
1124 * @mdev: DRBD device. 1236 * @mdev: DRBD device.
1125 * @w: work object. 1237 * @w: work object.
@@ -1128,8 +1240,6 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo)
1128int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local) 1240int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local)
1129{ 1241{
1130 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; 1242 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1131 struct bio *bio = e->private_bio;
1132
1133 /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place, 1243 /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place,
1134 (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch) 1244 (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
1135 so that we can finish that epoch in drbd_may_finish_epoch(). 1245 so that we can finish that epoch in drbd_may_finish_epoch().
@@ -1143,33 +1253,17 @@ int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __relea
1143 if (previous_epoch(mdev, e->epoch)) 1253 if (previous_epoch(mdev, e->epoch))
1144 dev_warn(DEV, "Write ordering was not enforced (one time event)\n"); 1254 dev_warn(DEV, "Write ordering was not enforced (one time event)\n");
1145 1255
1146 /* prepare bio for re-submit,
1147 * re-init volatile members */
1148 /* we still have a local reference, 1256 /* we still have a local reference,
1149 * get_ldev was done in receive_Data. */ 1257 * get_ldev was done in receive_Data. */
1150 bio->bi_bdev = mdev->ldev->backing_bdev;
1151 bio->bi_sector = e->sector;
1152 bio->bi_size = e->size;
1153 bio->bi_idx = 0;
1154
1155 bio->bi_flags &= ~(BIO_POOL_MASK - 1);
1156 bio->bi_flags |= 1 << BIO_UPTODATE;
1157
1158 /* don't know whether this is necessary: */
1159 bio->bi_phys_segments = 0;
1160 bio->bi_next = NULL;
1161
1162 /* these should be unchanged: */
1163 /* bio->bi_end_io = drbd_endio_write_sec; */
1164 /* bio->bi_vcnt = whatever; */
1165 1258
1166 e->w.cb = e_end_block; 1259 e->w.cb = e_end_block;
1167 1260 if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_DT_WR) != 0) {
1168 /* This is no longer a barrier request. */ 1261 /* drbd_submit_ee fails for one reason only:
1169 bio->bi_rw &= ~(1UL << BIO_RW_BARRIER); 1262 * if it was not able to allocate sufficient bios.
1170 1263 * requeue, try again later. */
1171 drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, bio); 1264 e->w.cb = w_e_reissue;
1172 1265 drbd_queue_work(&mdev->data.work, &e->w);
1266 }
1173 return 1; 1267 return 1;
1174} 1268}
1175 1269
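
drbd_submit_ee() above packs the page chain into as few bios as possible: whenever bio_add_page() refuses another page because of lower-level restrictions, it opens a new bio at the current sector and continues. A rough userspace sketch of that splitting loop, with an invented per-bio page limit standing in for the real queue limits:

    #include <stdio.h>

    #define PAGE_SIZE 4096
    #define MAX_PAGES_PER_BIO 3   /* stand-in for a lower-level restriction */

    int main(void)
    {
        unsigned pages = 8, sector = 1000, n_bios = 0, in_bio = MAX_PAGES_PER_BIO;

        for (unsigned i = 0; i < pages; i++) {
            if (in_bio == MAX_PAGES_PER_BIO) {     /* current bio refuses the page */
                n_bios++;                          /* analogous to bio_alloc() at 'sector' */
                printf("bio %u starts at sector %u\n", n_bios, sector);
                in_bio = 0;
            }
            in_bio++;                              /* analogous to bio_add_page() succeeding */
            sector += PAGE_SIZE >> 9;              /* advance by one page, in 512-byte sectors */
        }
        printf("%u pages split into %u bios\n", pages, n_bios);
        return 0;
    }
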
@@ -1261,13 +1355,13 @@ static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h)
1261static struct drbd_epoch_entry * 1355static struct drbd_epoch_entry *
1262read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local) 1356read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local)
1263{ 1357{
1358 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
1264 struct drbd_epoch_entry *e; 1359 struct drbd_epoch_entry *e;
1265 struct bio_vec *bvec;
1266 struct page *page; 1360 struct page *page;
1267 struct bio *bio; 1361 int dgs, ds, rr;
1268 int dgs, ds, i, rr;
1269 void *dig_in = mdev->int_dig_in; 1362 void *dig_in = mdev->int_dig_in;
1270 void *dig_vv = mdev->int_dig_vv; 1363 void *dig_vv = mdev->int_dig_vv;
1364 unsigned long *data;
1271 1365
1272 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? 1366 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
1273 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; 1367 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
@@ -1286,29 +1380,44 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __
1286 ERR_IF(data_size & 0x1ff) return NULL; 1380 ERR_IF(data_size & 0x1ff) return NULL;
1287 ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return NULL; 1381 ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return NULL;
1288 1382
1383 /* even though we trust our peer,
1384 * we sometimes have to double check. */
1385 if (sector + (data_size>>9) > capacity) {
1386 dev_err(DEV, "capacity: %llus < sector: %llus + size: %u\n",
1387 (unsigned long long)capacity,
1388 (unsigned long long)sector, data_size);
1389 return NULL;
1390 }
1391
1289 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD 1392 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1290 * "criss-cross" setup, that might cause write-out on some other DRBD, 1393 * "criss-cross" setup, that might cause write-out on some other DRBD,
1291 * which in turn might block on the other node at this very place. */ 1394 * which in turn might block on the other node at this very place. */
1292 e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO); 1395 e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO);
1293 if (!e) 1396 if (!e)
1294 return NULL; 1397 return NULL;
1295 bio = e->private_bio; 1398
1296 ds = data_size; 1399 ds = data_size;
1297 bio_for_each_segment(bvec, bio, i) { 1400 page = e->pages;
1298 page = bvec->bv_page; 1401 page_chain_for_each(page) {
1299 rr = drbd_recv(mdev, kmap(page), min_t(int, ds, PAGE_SIZE)); 1402 unsigned len = min_t(int, ds, PAGE_SIZE);
1403 data = kmap(page);
1404 rr = drbd_recv(mdev, data, len);
1405 if (FAULT_ACTIVE(mdev, DRBD_FAULT_RECEIVE)) {
1406 dev_err(DEV, "Fault injection: Corrupting data on receive\n");
1407 data[0] = data[0] ^ (unsigned long)-1;
1408 }
1300 kunmap(page); 1409 kunmap(page);
1301 if (rr != min_t(int, ds, PAGE_SIZE)) { 1410 if (rr != len) {
1302 drbd_free_ee(mdev, e); 1411 drbd_free_ee(mdev, e);
1303 dev_warn(DEV, "short read receiving data: read %d expected %d\n", 1412 dev_warn(DEV, "short read receiving data: read %d expected %d\n",
1304 rr, min_t(int, ds, PAGE_SIZE)); 1413 rr, len);
1305 return NULL; 1414 return NULL;
1306 } 1415 }
1307 ds -= rr; 1416 ds -= rr;
1308 } 1417 }
1309 1418
1310 if (dgs) { 1419 if (dgs) {
1311 drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv); 1420 drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv);
1312 if (memcmp(dig_in, dig_vv, dgs)) { 1421 if (memcmp(dig_in, dig_vv, dgs)) {
1313 dev_err(DEV, "Digest integrity check FAILED.\n"); 1422 dev_err(DEV, "Digest integrity check FAILED.\n");
1314 drbd_bcast_ee(mdev, "digest failed", 1423 drbd_bcast_ee(mdev, "digest failed",
@@ -1330,7 +1439,10 @@ static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
1330 int rr, rv = 1; 1439 int rr, rv = 1;
1331 void *data; 1440 void *data;
1332 1441
1333 page = drbd_pp_alloc(mdev, 1); 1442 if (!data_size)
1443 return TRUE;
1444
1445 page = drbd_pp_alloc(mdev, 1, 1);
1334 1446
1335 data = kmap(page); 1447 data = kmap(page);
1336 while (data_size) { 1448 while (data_size) {
@@ -1394,7 +1506,7 @@ static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
1394 } 1506 }
1395 1507
1396 if (dgs) { 1508 if (dgs) {
1397 drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv); 1509 drbd_csum_bio(mdev, mdev->integrity_r_tfm, bio, dig_vv);
1398 if (memcmp(dig_in, dig_vv, dgs)) { 1510 if (memcmp(dig_in, dig_vv, dgs)) {
1399 dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n"); 1511 dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
1400 return 0; 1512 return 0;
@@ -1415,7 +1527,7 @@ static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int u
1415 1527
1416 D_ASSERT(hlist_unhashed(&e->colision)); 1528 D_ASSERT(hlist_unhashed(&e->colision));
1417 1529
1418 if (likely(drbd_bio_uptodate(e->private_bio))) { 1530 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
1419 drbd_set_in_sync(mdev, sector, e->size); 1531 drbd_set_in_sync(mdev, sector, e->size);
1420 ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e); 1532 ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e);
1421 } else { 1533 } else {
@@ -1434,30 +1546,28 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si
1434 struct drbd_epoch_entry *e; 1546 struct drbd_epoch_entry *e;
1435 1547
1436 e = read_in_block(mdev, ID_SYNCER, sector, data_size); 1548 e = read_in_block(mdev, ID_SYNCER, sector, data_size);
1437 if (!e) { 1549 if (!e)
1438 put_ldev(mdev); 1550 goto fail;
1439 return FALSE;
1440 }
1441 1551
1442 dec_rs_pending(mdev); 1552 dec_rs_pending(mdev);
1443 1553
1444 e->private_bio->bi_end_io = drbd_endio_write_sec;
1445 e->private_bio->bi_rw = WRITE;
1446 e->w.cb = e_end_resync_block;
1447
1448 inc_unacked(mdev); 1554 inc_unacked(mdev);
1449 /* corresponding dec_unacked() in e_end_resync_block() 1555 /* corresponding dec_unacked() in e_end_resync_block()
1450 * respective _drbd_clear_done_ee */ 1556 * respective _drbd_clear_done_ee */
1451 1557
1558 e->w.cb = e_end_resync_block;
1559
1452 spin_lock_irq(&mdev->req_lock); 1560 spin_lock_irq(&mdev->req_lock);
1453 list_add(&e->w.list, &mdev->sync_ee); 1561 list_add(&e->w.list, &mdev->sync_ee);
1454 spin_unlock_irq(&mdev->req_lock); 1562 spin_unlock_irq(&mdev->req_lock);
1455 1563
1456 drbd_generic_make_request(mdev, DRBD_FAULT_RS_WR, e->private_bio); 1564 if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
1457 /* accounting done in endio */ 1565 return TRUE;
1458 1566
1459 maybe_kick_lo(mdev); 1567 drbd_free_ee(mdev, e);
1460 return TRUE; 1568fail:
1569 put_ldev(mdev);
1570 return FALSE;
1461} 1571}
1462 1572
1463static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h) 1573static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
@@ -1552,7 +1662,7 @@ static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1552 } 1662 }
1553 1663
1554 if (mdev->net_conf->wire_protocol == DRBD_PROT_C) { 1664 if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
1555 if (likely(drbd_bio_uptodate(e->private_bio))) { 1665 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
1556 pcmd = (mdev->state.conn >= C_SYNC_SOURCE && 1666 pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
1557 mdev->state.conn <= C_PAUSED_SYNC_T && 1667 mdev->state.conn <= C_PAUSED_SYNC_T &&
1558 e->flags & EE_MAY_SET_IN_SYNC) ? 1668 e->flags & EE_MAY_SET_IN_SYNC) ?
@@ -1698,7 +1808,6 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
1698 return FALSE; 1808 return FALSE;
1699 } 1809 }
1700 1810
1701 e->private_bio->bi_end_io = drbd_endio_write_sec;
1702 e->w.cb = e_end_block; 1811 e->w.cb = e_end_block;
1703 1812
1704 spin_lock(&mdev->epoch_lock); 1813 spin_lock(&mdev->epoch_lock);
@@ -1894,12 +2003,8 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
1894 drbd_al_begin_io(mdev, e->sector); 2003 drbd_al_begin_io(mdev, e->sector);
1895 } 2004 }
1896 2005
1897 e->private_bio->bi_rw = rw; 2006 if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
1898 drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, e->private_bio); 2007 return TRUE;
1899 /* accounting done in endio */
1900
1901 maybe_kick_lo(mdev);
1902 return TRUE;
1903 2008
1904out_interrupted: 2009out_interrupted:
1905 /* yes, the epoch_size now is imbalanced. 2010 /* yes, the epoch_size now is imbalanced.
@@ -1945,7 +2050,7 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
1945 "no local data.\n"); 2050 "no local data.\n");
1946 drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY : 2051 drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY :
1947 P_NEG_RS_DREPLY , p); 2052 P_NEG_RS_DREPLY , p);
1948 return TRUE; 2053 return drbd_drain_block(mdev, h->length - brps);
1949 } 2054 }
1950 2055
1951 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD 2056 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
@@ -1957,9 +2062,6 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
1957 return FALSE; 2062 return FALSE;
1958 } 2063 }
1959 2064
1960 e->private_bio->bi_rw = READ;
1961 e->private_bio->bi_end_io = drbd_endio_read_sec;
1962
1963 switch (h->command) { 2065 switch (h->command) {
1964 case P_DATA_REQUEST: 2066 case P_DATA_REQUEST:
1965 e->w.cb = w_e_end_data_req; 2067 e->w.cb = w_e_end_data_req;
@@ -2053,10 +2155,8 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
2053 2155
2054 inc_unacked(mdev); 2156 inc_unacked(mdev);
2055 2157
2056 drbd_generic_make_request(mdev, fault_type, e->private_bio); 2158 if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
2057 maybe_kick_lo(mdev); 2159 return TRUE;
2058
2059 return TRUE;
2060 2160
2061out_free_e: 2161out_free_e:
2062 kfree(di); 2162 kfree(di);
@@ -2473,6 +2573,9 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
2473 hg > 0 ? "source" : "target"); 2573 hg > 0 ? "source" : "target");
2474 } 2574 }
2475 2575
2576 if (abs(hg) == 100)
2577 drbd_khelper(mdev, "initial-split-brain");
2578
2476 if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) { 2579 if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) {
2477 int pcount = (mdev->state.role == R_PRIMARY) 2580 int pcount = (mdev->state.role == R_PRIMARY)
2478 + (peer_role == R_PRIMARY); 2581 + (peer_role == R_PRIMARY);
@@ -2518,7 +2621,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
2518 * after an attempted attach on a diskless node. 2621 * after an attempted attach on a diskless node.
2519 * We just refuse to attach -- well, we drop the "connection" 2622 * We just refuse to attach -- well, we drop the "connection"
2520 * to that disk, in a way... */ 2623 * to that disk, in a way... */
2521 dev_alert(DEV, "Split-Brain detected, dropping connection!\n"); 2624 dev_alert(DEV, "Split-Brain detected but unresolved, dropping connection!\n");
2522 drbd_khelper(mdev, "split-brain"); 2625 drbd_khelper(mdev, "split-brain");
2523 return C_MASK; 2626 return C_MASK;
2524 } 2627 }
@@ -2849,7 +2952,7 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
2849 unsigned int max_seg_s; 2952 unsigned int max_seg_s;
2850 sector_t p_size, p_usize, my_usize; 2953 sector_t p_size, p_usize, my_usize;
2851 int ldsc = 0; /* local disk size changed */ 2954 int ldsc = 0; /* local disk size changed */
2852 enum drbd_conns nconn; 2955 enum dds_flags ddsf;
2853 2956
2854 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; 2957 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
2855 if (drbd_recv(mdev, h->payload, h->length) != h->length) 2958 if (drbd_recv(mdev, h->payload, h->length) != h->length)
@@ -2905,8 +3008,9 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
2905 } 3008 }
2906#undef min_not_zero 3009#undef min_not_zero
2907 3010
3011 ddsf = be16_to_cpu(p->dds_flags);
2908 if (get_ldev(mdev)) { 3012 if (get_ldev(mdev)) {
2909 dd = drbd_determin_dev_size(mdev, 0); 3013 dd = drbd_determin_dev_size(mdev, ddsf);
2910 put_ldev(mdev); 3014 put_ldev(mdev);
2911 if (dd == dev_size_error) 3015 if (dd == dev_size_error)
2912 return FALSE; 3016 return FALSE;
@@ -2916,33 +3020,21 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
2916 drbd_set_my_capacity(mdev, p_size); 3020 drbd_set_my_capacity(mdev, p_size);
2917 } 3021 }
2918 3022
2919 if (mdev->p_uuid && mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
2920 nconn = drbd_sync_handshake(mdev,
2921 mdev->state.peer, mdev->state.pdsk);
2922 put_ldev(mdev);
2923
2924 if (nconn == C_MASK) {
2925 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2926 return FALSE;
2927 }
2928
2929 if (drbd_request_state(mdev, NS(conn, nconn)) < SS_SUCCESS) {
2930 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2931 return FALSE;
2932 }
2933 }
2934
2935 if (get_ldev(mdev)) { 3023 if (get_ldev(mdev)) {
2936 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { 3024 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
2937 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); 3025 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
2938 ldsc = 1; 3026 ldsc = 1;
2939 } 3027 }
2940 3028
2941 max_seg_s = be32_to_cpu(p->max_segment_size); 3029 if (mdev->agreed_pro_version < 94)
3030 max_seg_s = be32_to_cpu(p->max_segment_size);
3031 else /* drbd 8.3.8 onwards */
3032 max_seg_s = DRBD_MAX_SEGMENT_SIZE;
3033
2942 if (max_seg_s != queue_max_segment_size(mdev->rq_queue)) 3034 if (max_seg_s != queue_max_segment_size(mdev->rq_queue))
2943 drbd_setup_queue_param(mdev, max_seg_s); 3035 drbd_setup_queue_param(mdev, max_seg_s);
2944 3036
2945 drbd_setup_order_type(mdev, be32_to_cpu(p->queue_order_type)); 3037 drbd_setup_order_type(mdev, be16_to_cpu(p->queue_order_type));
2946 put_ldev(mdev); 3038 put_ldev(mdev);
2947 } 3039 }
2948 3040
@@ -2951,14 +3043,17 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
2951 drbd_get_capacity(mdev->this_bdev) || ldsc) { 3043 drbd_get_capacity(mdev->this_bdev) || ldsc) {
2952 /* we have different sizes, probably peer 3044 /* we have different sizes, probably peer
2953 * needs to know my new size... */ 3045 * needs to know my new size... */
2954 drbd_send_sizes(mdev, 0); 3046 drbd_send_sizes(mdev, 0, ddsf);
2955 } 3047 }
2956 if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) || 3048 if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
2957 (dd == grew && mdev->state.conn == C_CONNECTED)) { 3049 (dd == grew && mdev->state.conn == C_CONNECTED)) {
2958 if (mdev->state.pdsk >= D_INCONSISTENT && 3050 if (mdev->state.pdsk >= D_INCONSISTENT &&
2959 mdev->state.disk >= D_INCONSISTENT) 3051 mdev->state.disk >= D_INCONSISTENT) {
2960 resync_after_online_grow(mdev); 3052 if (ddsf & DDSF_NO_RESYNC)
2961 else 3053 dev_info(DEV, "Resync of new storage suppressed with --assume-clean\n");
3054 else
3055 resync_after_online_grow(mdev);
3056 } else
2962 set_bit(RESYNC_AFTER_NEG, &mdev->flags); 3057 set_bit(RESYNC_AFTER_NEG, &mdev->flags);
2963 } 3058 }
2964 } 3059 }
@@ -3490,6 +3585,92 @@ static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
3490 return TRUE; 3585 return TRUE;
3491} 3586}
3492 3587
3588static void timeval_sub_us(struct timeval* tv, unsigned int us)
3589{
3590 tv->tv_sec -= us / 1000000;
3591 us = us % 1000000;
3592 if (tv->tv_usec > us) {
3593 tv->tv_usec += 1000000;
3594 tv->tv_sec--;
3595 }
3596 tv->tv_usec -= us;
3597}
3598
3599static void got_delay_probe(struct drbd_conf *mdev, int from, struct p_delay_probe *p)
3600{
3601 struct delay_probe *dp;
3602 struct list_head *le;
3603 struct timeval now;
3604 int seq_num;
3605 int offset;
3606 int data_delay;
3607
3608 seq_num = be32_to_cpu(p->seq_num);
3609 offset = be32_to_cpu(p->offset);
3610
3611 spin_lock(&mdev->peer_seq_lock);
3612 if (!list_empty(&mdev->delay_probes)) {
3613 if (from == USE_DATA_SOCKET)
3614 le = mdev->delay_probes.next;
3615 else
3616 le = mdev->delay_probes.prev;
3617
3618 dp = list_entry(le, struct delay_probe, list);
3619
3620 if (dp->seq_num == seq_num) {
3621 list_del(le);
3622 spin_unlock(&mdev->peer_seq_lock);
3623 do_gettimeofday(&now);
3624 timeval_sub_us(&now, offset);
3625 data_delay =
3626 now.tv_usec - dp->time.tv_usec +
3627 (now.tv_sec - dp->time.tv_sec) * 1000000;
3628
3629 if (data_delay > 0)
3630 mdev->data_delay = data_delay;
3631
3632 kfree(dp);
3633 return;
3634 }
3635
3636 if (dp->seq_num > seq_num) {
3637 spin_unlock(&mdev->peer_seq_lock);
3638 dev_warn(DEV, "Previous allocation failure of struct delay_probe?\n");
3639 return; /* Do not allocate a struct delay_probe.... */
3640 }
3641 }
3642 spin_unlock(&mdev->peer_seq_lock);
3643
3644 dp = kmalloc(sizeof(struct delay_probe), GFP_NOIO);
3645 if (!dp) {
3646 dev_warn(DEV, "Failed to allocate a struct delay_probe, do not worry.\n");
3647 return;
3648 }
3649
3650 dp->seq_num = seq_num;
3651 do_gettimeofday(&dp->time);
3652 timeval_sub_us(&dp->time, offset);
3653
3654 spin_lock(&mdev->peer_seq_lock);
3655 if (from == USE_DATA_SOCKET)
3656 list_add(&dp->list, &mdev->delay_probes);
3657 else
3658 list_add_tail(&dp->list, &mdev->delay_probes);
3659 spin_unlock(&mdev->peer_seq_lock);
3660}
3661
3662static int receive_delay_probe(struct drbd_conf *mdev, struct p_header *h)
3663{
3664 struct p_delay_probe *p = (struct p_delay_probe *)h;
3665
3666 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
3667 if (drbd_recv(mdev, h->payload, h->length) != h->length)
3668 return FALSE;
3669
3670 got_delay_probe(mdev, USE_DATA_SOCKET, p);
3671 return TRUE;
3672}
3673
3493typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *); 3674typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *);
3494 3675
3495static drbd_cmd_handler_f drbd_default_handler[] = { 3676static drbd_cmd_handler_f drbd_default_handler[] = {
@@ -3513,6 +3694,7 @@ static drbd_cmd_handler_f drbd_default_handler[] = {
3513 [P_OV_REQUEST] = receive_DataRequest, 3694 [P_OV_REQUEST] = receive_DataRequest,
3514 [P_OV_REPLY] = receive_DataRequest, 3695 [P_OV_REPLY] = receive_DataRequest,
3515 [P_CSUM_RS_REQUEST] = receive_DataRequest, 3696 [P_CSUM_RS_REQUEST] = receive_DataRequest,
3697 [P_DELAY_PROBE] = receive_delay_probe,
3516 /* anything missing from this table is in 3698 /* anything missing from this table is in
3517 * the asender_tbl, see get_asender_cmd */ 3699 * the asender_tbl, see get_asender_cmd */
3518 [P_MAX_CMD] = NULL, 3700 [P_MAX_CMD] = NULL,
@@ -3739,7 +3921,7 @@ static void drbd_disconnect(struct drbd_conf *mdev)
3739 dev_info(DEV, "net_ee not empty, killed %u entries\n", i); 3921 dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
3740 i = atomic_read(&mdev->pp_in_use); 3922 i = atomic_read(&mdev->pp_in_use);
3741 if (i) 3923 if (i)
3742 dev_info(DEV, "pp_in_use = %u, expected 0\n", i); 3924 dev_info(DEV, "pp_in_use = %d, expected 0\n", i);
3743 3925
3744 D_ASSERT(list_empty(&mdev->read_ee)); 3926 D_ASSERT(list_empty(&mdev->read_ee));
3745 D_ASSERT(list_empty(&mdev->active_ee)); 3927 D_ASSERT(list_empty(&mdev->active_ee));
@@ -4232,7 +4414,6 @@ static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h)
4232 4414
4233 sector = be64_to_cpu(p->sector); 4415 sector = be64_to_cpu(p->sector);
4234 size = be32_to_cpu(p->blksize); 4416 size = be32_to_cpu(p->blksize);
4235 D_ASSERT(p->block_id == ID_SYNCER);
4236 4417
4237 update_peer_seq(mdev, be32_to_cpu(p->seq_num)); 4418 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4238 4419
@@ -4290,6 +4471,14 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
4290 return TRUE; 4471 return TRUE;
4291} 4472}
4292 4473
4474static int got_delay_probe_m(struct drbd_conf *mdev, struct p_header *h)
4475{
4476 struct p_delay_probe *p = (struct p_delay_probe *)h;
4477
4478 got_delay_probe(mdev, USE_META_SOCKET, p);
4479 return TRUE;
4480}
4481
4293struct asender_cmd { 4482struct asender_cmd {
4294 size_t pkt_size; 4483 size_t pkt_size;
4295 int (*process)(struct drbd_conf *mdev, struct p_header *h); 4484 int (*process)(struct drbd_conf *mdev, struct p_header *h);
@@ -4314,6 +4503,7 @@ static struct asender_cmd *get_asender_cmd(int cmd)
4314 [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, 4503 [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck },
4315 [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, 4504 [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
4316 [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, 4505 [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
4506 [P_DELAY_PROBE] = { sizeof(struct p_delay_probe), got_delay_probe_m },
4317 [P_MAX_CMD] = { 0, NULL }, 4507 [P_MAX_CMD] = { 0, NULL },
4318 }; 4508 };
4319 if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL) 4509 if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
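
The delay-probe bookkeeping added above reconstructs the send time by subtracting the peer-supplied microsecond offset (timeval_sub_us) and then takes data_delay as "now" minus that send time, in microseconds. A minimal userspace sketch of the arithmetic, with gettimeofday standing in for do_gettimeofday; note that the borrow in the subtraction is taken when tv_usec is smaller than the amount being subtracted (the hunk above tests the opposite direction, which looks inverted):

	#include <stdio.h>
	#include <sys/time.h>

	static void timeval_sub_us(struct timeval *tv, long us)
	{
		tv->tv_sec -= us / 1000000;
		us = us % 1000000;
		if (tv->tv_usec < us) {		/* borrow from the seconds field */
			tv->tv_usec += 1000000;
			tv->tv_sec--;
		}
		tv->tv_usec -= us;
	}

	int main(void)
	{
		struct timeval sent, now;
		long data_delay;

		gettimeofday(&sent, NULL);
		timeval_sub_us(&sent, 1500);	/* pretend the probe left 1.5 ms ago */
		gettimeofday(&now, NULL);

		data_delay = (now.tv_usec - sent.tv_usec) +
			     (now.tv_sec  - sent.tv_sec) * 1000000L;
		printf("data_delay = %ld us\n", data_delay);
		return 0;
	}
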
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index de81ab7b4627..3397f11d0ba9 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -722,6 +722,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
722 struct drbd_request *req; 722 struct drbd_request *req;
723 int local, remote; 723 int local, remote;
724 int err = -EIO; 724 int err = -EIO;
725 int ret = 0;
725 726
726 /* allocate outside of all locks; */ 727 /* allocate outside of all locks; */
727 req = drbd_req_new(mdev, bio); 728 req = drbd_req_new(mdev, bio);
@@ -784,7 +785,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
784 (mdev->state.pdsk == D_INCONSISTENT && 785 (mdev->state.pdsk == D_INCONSISTENT &&
785 mdev->state.conn >= C_CONNECTED)); 786 mdev->state.conn >= C_CONNECTED));
786 787
787 if (!(local || remote)) { 788 if (!(local || remote) && !mdev->state.susp) {
788 dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); 789 dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
789 goto fail_free_complete; 790 goto fail_free_complete;
790 } 791 }
@@ -810,6 +811,16 @@ allocate_barrier:
810 /* GOOD, everything prepared, grab the spin_lock */ 811 /* GOOD, everything prepared, grab the spin_lock */
811 spin_lock_irq(&mdev->req_lock); 812 spin_lock_irq(&mdev->req_lock);
812 813
814 if (mdev->state.susp) {
815 /* If we got suspended, use the retry mechanism of
816 generic_make_request() to restart processing of this
817 bio. In the next call to drbd_make_request_26
818 we sleep in inc_ap_bio() */
819 ret = 1;
820 spin_unlock_irq(&mdev->req_lock);
821 goto fail_free_complete;
822 }
823
813 if (remote) { 824 if (remote) {
814 remote = (mdev->state.pdsk == D_UP_TO_DATE || 825 remote = (mdev->state.pdsk == D_UP_TO_DATE ||
815 (mdev->state.pdsk == D_INCONSISTENT && 826 (mdev->state.pdsk == D_INCONSISTENT &&
@@ -947,12 +958,14 @@ fail_and_free_req:
947 req->private_bio = NULL; 958 req->private_bio = NULL;
948 put_ldev(mdev); 959 put_ldev(mdev);
949 } 960 }
950 bio_endio(bio, err); 961 if (!ret)
962 bio_endio(bio, err);
963
951 drbd_req_free(req); 964 drbd_req_free(req);
952 dec_ap_bio(mdev); 965 dec_ap_bio(mdev);
953 kfree(b); 966 kfree(b);
954 967
955 return 0; 968 return ret;
956} 969}
957 970
958/* helper function for drbd_make_request 971/* helper function for drbd_make_request
@@ -962,11 +975,6 @@ fail_and_free_req:
962 */ 975 */
963static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write) 976static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
964{ 977{
965 /* Unconfigured */
966 if (mdev->state.conn == C_DISCONNECTING &&
967 mdev->state.disk == D_DISKLESS)
968 return 1;
969
970 if (mdev->state.role != R_PRIMARY && 978 if (mdev->state.role != R_PRIMARY &&
971 (!allow_oos || is_write)) { 979 (!allow_oos || is_write)) {
972 if (__ratelimit(&drbd_ratelimit_state)) { 980 if (__ratelimit(&drbd_ratelimit_state)) {
@@ -1070,15 +1078,21 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
1070 1078
1071 /* we need to get a "reference count" (ap_bio_cnt) 1079 /* we need to get a "reference count" (ap_bio_cnt)
1072 * to avoid races with the disconnect/reconnect/suspend code. 1080 * to avoid races with the disconnect/reconnect/suspend code.
1073 * In case we need to split the bio here, we need to get two references 1081 * In case we need to split the bio here, we need to get three references
1074 * atomically, otherwise we might deadlock when trying to submit the 1082 * atomically, otherwise we might deadlock when trying to submit the
1075 * second one! */ 1083 * second one! */
1076 inc_ap_bio(mdev, 2); 1084 inc_ap_bio(mdev, 3);
1077 1085
1078 D_ASSERT(e_enr == s_enr + 1); 1086 D_ASSERT(e_enr == s_enr + 1);
1079 1087
1080 drbd_make_request_common(mdev, &bp->bio1); 1088 while (drbd_make_request_common(mdev, &bp->bio1))
1081 drbd_make_request_common(mdev, &bp->bio2); 1089 inc_ap_bio(mdev, 1);
1090
1091 while (drbd_make_request_common(mdev, &bp->bio2))
1092 inc_ap_bio(mdev, 1);
1093
1094 dec_ap_bio(mdev);
1095
1082 bio_pair_release(bp); 1096 bio_pair_release(bp);
1083 } 1097 }
1084 return 0; 1098 return 0;
@@ -1115,7 +1129,7 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct
1115 } else if (limit && get_ldev(mdev)) { 1129 } else if (limit && get_ldev(mdev)) {
1116 struct request_queue * const b = 1130 struct request_queue * const b =
1117 mdev->ldev->backing_bdev->bd_disk->queue; 1131 mdev->ldev->backing_bdev->bd_disk->queue;
1118 if (b->merge_bvec_fn && mdev->ldev->dc.use_bmbv) { 1132 if (b->merge_bvec_fn) {
1119 backing_limit = b->merge_bvec_fn(b, bvm, bvec); 1133 backing_limit = b->merge_bvec_fn(b, bvm, bvec);
1120 limit = min(limit, backing_limit); 1134 limit = min(limit, backing_limit);
1121 } 1135 }
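
The drbd_req.c changes make drbd_make_request_common() return nonzero when the device is suspended, instead of completing the bio with an error; the split-bio path therefore loops, taking a fresh ap_bio reference before each retry, and drops the extra guard reference at the end. A standalone model of that reference counting (the names and the plain counter are illustrative, not the kernel API):

	#include <stdio.h>

	static int ap_bio_cnt;
	static int suspended = 1;

	static void inc_ap_bio(int n) { ap_bio_cnt += n; }
	static void dec_ap_bio(void)  { ap_bio_cnt -= 1; }

	/* returns nonzero while the device is "suspended"; consumes one ref */
	static int submit_half(const char *name)
	{
		dec_ap_bio();
		if (suspended) {
			printf("%s deferred\n", name);
			suspended = 0;		/* pretend IO gets resumed */
			return 1;
		}
		printf("%s submitted\n", name);
		return 0;
	}

	int main(void)
	{
		inc_ap_bio(3);			/* two halves + one guard ref */
		while (submit_half("bio1"))	/* retry with a fresh reference */
			inc_ap_bio(1);
		while (submit_half("bio2"))
			inc_ap_bio(1);
		dec_ap_bio();			/* drop the guard */
		printf("ap_bio_cnt = %d\n", ap_bio_cnt);
		return 0;
	}
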
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c
index 76863e3f05be..85179e1fb50a 100644
--- a/drivers/block/drbd/drbd_strings.c
+++ b/drivers/block/drbd/drbd_strings.c
@@ -70,7 +70,7 @@ static const char *drbd_disk_s_names[] = {
70 70
71static const char *drbd_state_sw_errors[] = { 71static const char *drbd_state_sw_errors[] = {
72 [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config", 72 [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config",
73 [-SS_NO_UP_TO_DATE_DISK] = "Refusing to be Primary without at least one UpToDate disk", 73 [-SS_NO_UP_TO_DATE_DISK] = "Need access to UpToDate data",
74 [-SS_NO_LOCAL_DISK] = "Can not resync without local disk", 74 [-SS_NO_LOCAL_DISK] = "Can not resync without local disk",
75 [-SS_NO_REMOTE_DISK] = "Can not resync without remote disk", 75 [-SS_NO_REMOTE_DISK] = "Can not resync without remote disk",
76 [-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected", 76 [-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected",
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index d48a1dfd7b24..727ff6339754 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -47,8 +47,7 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca
47 47
48/* defined here: 48/* defined here:
49 drbd_md_io_complete 49 drbd_md_io_complete
50 drbd_endio_write_sec 50 drbd_endio_sec
51 drbd_endio_read_sec
52 drbd_endio_pri 51 drbd_endio_pri
53 52
54 * more endio handlers: 53 * more endio handlers:
@@ -85,27 +84,10 @@ void drbd_md_io_complete(struct bio *bio, int error)
85/* reads on behalf of the partner, 84/* reads on behalf of the partner,
86 * "submitted" by the receiver 85 * "submitted" by the receiver
87 */ 86 */
88void drbd_endio_read_sec(struct bio *bio, int error) __releases(local) 87void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
89{ 88{
90 unsigned long flags = 0; 89 unsigned long flags = 0;
91 struct drbd_epoch_entry *e = NULL; 90 struct drbd_conf *mdev = e->mdev;
92 struct drbd_conf *mdev;
93 int uptodate = bio_flagged(bio, BIO_UPTODATE);
94
95 e = bio->bi_private;
96 mdev = e->mdev;
97
98 if (error)
99 dev_warn(DEV, "read: error=%d s=%llus\n", error,
100 (unsigned long long)e->sector);
101 if (!error && !uptodate) {
102 dev_warn(DEV, "read: setting error to -EIO s=%llus\n",
103 (unsigned long long)e->sector);
104 /* strange behavior of some lower level drivers...
105 * fail the request by clearing the uptodate flag,
106 * but do not return any error?! */
107 error = -EIO;
108 }
109 91
110 D_ASSERT(e->block_id != ID_VACANT); 92 D_ASSERT(e->block_id != ID_VACANT);
111 93
@@ -114,49 +96,38 @@ void drbd_endio_read_sec(struct bio *bio, int error) __releases(local)
114 list_del(&e->w.list); 96 list_del(&e->w.list);
115 if (list_empty(&mdev->read_ee)) 97 if (list_empty(&mdev->read_ee))
116 wake_up(&mdev->ee_wait); 98 wake_up(&mdev->ee_wait);
99 if (test_bit(__EE_WAS_ERROR, &e->flags))
100 __drbd_chk_io_error(mdev, FALSE);
117 spin_unlock_irqrestore(&mdev->req_lock, flags); 101 spin_unlock_irqrestore(&mdev->req_lock, flags);
118 102
119 drbd_chk_io_error(mdev, error, FALSE);
120 drbd_queue_work(&mdev->data.work, &e->w); 103 drbd_queue_work(&mdev->data.work, &e->w);
121 put_ldev(mdev); 104 put_ldev(mdev);
122} 105}
123 106
107static int is_failed_barrier(int ee_flags)
108{
109 return (ee_flags & (EE_IS_BARRIER|EE_WAS_ERROR|EE_RESUBMITTED))
110 == (EE_IS_BARRIER|EE_WAS_ERROR);
111}
112
124/* writes on behalf of the partner, or resync writes, 113/* writes on behalf of the partner, or resync writes,
125 * "submitted" by the receiver. 114 * "submitted" by the receiver, final stage. */
126 */ 115static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local)
127void drbd_endio_write_sec(struct bio *bio, int error) __releases(local)
128{ 116{
129 unsigned long flags = 0; 117 unsigned long flags = 0;
130 struct drbd_epoch_entry *e = NULL; 118 struct drbd_conf *mdev = e->mdev;
131 struct drbd_conf *mdev;
132 sector_t e_sector; 119 sector_t e_sector;
133 int do_wake; 120 int do_wake;
134 int is_syncer_req; 121 int is_syncer_req;
135 int do_al_complete_io; 122 int do_al_complete_io;
136 int uptodate = bio_flagged(bio, BIO_UPTODATE);
137 int is_barrier = bio_rw_flagged(bio, BIO_RW_BARRIER);
138
139 e = bio->bi_private;
140 mdev = e->mdev;
141 123
142 if (error) 124 /* if this is a failed barrier request, disable use of barriers,
143 dev_warn(DEV, "write: error=%d s=%llus\n", error, 125 * and schedule for resubmission */
144 (unsigned long long)e->sector); 126 if (is_failed_barrier(e->flags)) {
145 if (!error && !uptodate) {
146 dev_warn(DEV, "write: setting error to -EIO s=%llus\n",
147 (unsigned long long)e->sector);
148 /* strange behavior of some lower level drivers...
149 * fail the request by clearing the uptodate flag,
150 * but do not return any error?! */
151 error = -EIO;
152 }
153
154 /* error == -ENOTSUPP would be a better test,
155 * alas it is not reliable */
156 if (error && is_barrier && e->flags & EE_IS_BARRIER) {
157 drbd_bump_write_ordering(mdev, WO_bdev_flush); 127 drbd_bump_write_ordering(mdev, WO_bdev_flush);
158 spin_lock_irqsave(&mdev->req_lock, flags); 128 spin_lock_irqsave(&mdev->req_lock, flags);
159 list_del(&e->w.list); 129 list_del(&e->w.list);
130 e->flags = (e->flags & ~EE_WAS_ERROR) | EE_RESUBMITTED;
160 e->w.cb = w_e_reissue; 131 e->w.cb = w_e_reissue;
161 /* put_ldev actually happens below, once we come here again. */ 132 /* put_ldev actually happens below, once we come here again. */
162 __release(local); 133 __release(local);
@@ -167,17 +138,16 @@ void drbd_endio_write_sec(struct bio *bio, int error) __releases(local)
167 138
168 D_ASSERT(e->block_id != ID_VACANT); 139 D_ASSERT(e->block_id != ID_VACANT);
169 140
170 spin_lock_irqsave(&mdev->req_lock, flags);
171 mdev->writ_cnt += e->size >> 9;
172 is_syncer_req = is_syncer_block_id(e->block_id);
173
174 /* after we moved e to done_ee, 141 /* after we moved e to done_ee,
175 * we may no longer access it, 142 * we may no longer access it,
176 * it may be freed/reused already! 143 * it may be freed/reused already!
177 * (as soon as we release the req_lock) */ 144 * (as soon as we release the req_lock) */
178 e_sector = e->sector; 145 e_sector = e->sector;
179 do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO; 146 do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
147 is_syncer_req = is_syncer_block_id(e->block_id);
180 148
149 spin_lock_irqsave(&mdev->req_lock, flags);
150 mdev->writ_cnt += e->size >> 9;
181 list_del(&e->w.list); /* has been on active_ee or sync_ee */ 151 list_del(&e->w.list); /* has been on active_ee or sync_ee */
182 list_add_tail(&e->w.list, &mdev->done_ee); 152 list_add_tail(&e->w.list, &mdev->done_ee);
183 153
@@ -190,7 +160,7 @@ void drbd_endio_write_sec(struct bio *bio, int error) __releases(local)
190 ? list_empty(&mdev->sync_ee) 160 ? list_empty(&mdev->sync_ee)
191 : list_empty(&mdev->active_ee); 161 : list_empty(&mdev->active_ee);
192 162
193 if (error) 163 if (test_bit(__EE_WAS_ERROR, &e->flags))
194 __drbd_chk_io_error(mdev, FALSE); 164 __drbd_chk_io_error(mdev, FALSE);
195 spin_unlock_irqrestore(&mdev->req_lock, flags); 165 spin_unlock_irqrestore(&mdev->req_lock, flags);
196 166
@@ -205,7 +175,42 @@ void drbd_endio_write_sec(struct bio *bio, int error) __releases(local)
205 175
206 wake_asender(mdev); 176 wake_asender(mdev);
207 put_ldev(mdev); 177 put_ldev(mdev);
178}
179
180/* writes on behalf of the partner, or resync writes,
181 * "submitted" by the receiver.
182 */
183void drbd_endio_sec(struct bio *bio, int error)
184{
185 struct drbd_epoch_entry *e = bio->bi_private;
186 struct drbd_conf *mdev = e->mdev;
187 int uptodate = bio_flagged(bio, BIO_UPTODATE);
188 int is_write = bio_data_dir(bio) == WRITE;
189
190 if (error)
191 dev_warn(DEV, "%s: error=%d s=%llus\n",
192 is_write ? "write" : "read", error,
193 (unsigned long long)e->sector);
194 if (!error && !uptodate) {
195 dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
196 is_write ? "write" : "read",
197 (unsigned long long)e->sector);
198 /* strange behavior of some lower level drivers...
199 * fail the request by clearing the uptodate flag,
200 * but do not return any error?! */
201 error = -EIO;
202 }
203
204 if (error)
205 set_bit(__EE_WAS_ERROR, &e->flags);
208 206
207 bio_put(bio); /* no need for the bio anymore */
208 if (atomic_dec_and_test(&e->pending_bios)) {
209 if (is_write)
210 drbd_endio_write_sec_final(e);
211 else
212 drbd_endio_read_sec_final(e);
213 }
209} 214}
210 215
211/* read, readA or write requests on R_PRIMARY coming from drbd_make_request 216/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
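
With drbd_endio_sec() above, reads and writes share one completion handler: each sub-bio of an epoch entry records errors in the entry's flags and drops pending_bios, and only the completion that brings the count to zero runs the read or write final stage. A standalone model of that dispatch (C11 atomics stand in for atomic_dec_and_test; the real code records errors with set_bit):

	#include <stdio.h>
	#include <stdatomic.h>
	#include <stdbool.h>

	struct entry {
		atomic_int pending;
		bool is_write;
		bool was_error;
	};

	static void final_stage(struct entry *e)
	{
		printf("%s final stage, error=%d\n",
		       e->is_write ? "write" : "read", e->was_error);
	}

	static void endio(struct entry *e, int error)
	{
		if (error)
			e->was_error = true;		/* remembered, not acted on yet */
		if (atomic_fetch_sub(&e->pending, 1) == 1)
			final_stage(e);			/* last completion wins */
	}

	int main(void)
	{
		struct entry e = { .is_write = true };

		atomic_init(&e.pending, 3);
		endio(&e, 0);
		endio(&e, -5);				/* one sub-IO fails */
		endio(&e, 0);				/* last one runs the final stage */
		return 0;
	}
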
@@ -295,7 +300,34 @@ int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
295 return 1; /* Simply ignore this! */ 300 return 1; /* Simply ignore this! */
296} 301}
297 302
298void drbd_csum(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest) 303void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest)
304{
305 struct hash_desc desc;
306 struct scatterlist sg;
307 struct page *page = e->pages;
308 struct page *tmp;
309 unsigned len;
310
311 desc.tfm = tfm;
312 desc.flags = 0;
313
314 sg_init_table(&sg, 1);
315 crypto_hash_init(&desc);
316
317 while ((tmp = page_chain_next(page))) {
318 /* all but the last page will be fully used */
319 sg_set_page(&sg, page, PAGE_SIZE, 0);
320 crypto_hash_update(&desc, &sg, sg.length);
321 page = tmp;
322 }
323 /* and now the last, possibly only partially used page */
324 len = e->size & (PAGE_SIZE - 1);
325 sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
326 crypto_hash_update(&desc, &sg, sg.length);
327 crypto_hash_final(&desc, digest);
328}
329
330void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
299{ 331{
300 struct hash_desc desc; 332 struct hash_desc desc;
301 struct scatterlist sg; 333 struct scatterlist sg;
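
drbd_csum_ee() above hashes the chained pages of an epoch entry: every page but the last is fed in full, and the last page contributes e->size & (PAGE_SIZE - 1) bytes, with zero meaning "exactly one full page" (the len ?: PAGE_SIZE idiom). A standalone check of that length computation:

	#include <stdio.h>

	#define PAGE_SIZE 4096u

	static unsigned last_page_len(unsigned size)
	{
		unsigned len = size & (PAGE_SIZE - 1);
		return len ? len : PAGE_SIZE;	/* the "len ?: PAGE_SIZE" idiom */
	}

	int main(void)
	{
		printf("%u\n", last_page_len(4096));	/* 4096: one full page   */
		printf("%u\n", last_page_len(6144));	/* 2048: page and a half */
		printf("%u\n", last_page_len(8192));	/* 4096: two full pages  */
		return 0;
	}
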
@@ -329,11 +361,11 @@ static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel
329 return 1; 361 return 1;
330 } 362 }
331 363
332 if (likely(drbd_bio_uptodate(e->private_bio))) { 364 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
333 digest_size = crypto_hash_digestsize(mdev->csums_tfm); 365 digest_size = crypto_hash_digestsize(mdev->csums_tfm);
334 digest = kmalloc(digest_size, GFP_NOIO); 366 digest = kmalloc(digest_size, GFP_NOIO);
335 if (digest) { 367 if (digest) {
336 drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest); 368 drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
337 369
338 inc_rs_pending(mdev); 370 inc_rs_pending(mdev);
339 ok = drbd_send_drequest_csum(mdev, 371 ok = drbd_send_drequest_csum(mdev,
@@ -369,23 +401,21 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
369 /* GFP_TRY, because if there is no memory available right now, this may 401 /* GFP_TRY, because if there is no memory available right now, this may
370 * be rescheduled for later. It is "only" background resync, after all. */ 402 * be rescheduled for later. It is "only" background resync, after all. */
371 e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY); 403 e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
372 if (!e) { 404 if (!e)
373 put_ldev(mdev); 405 goto fail;
374 return 2;
375 }
376 406
377 spin_lock_irq(&mdev->req_lock); 407 spin_lock_irq(&mdev->req_lock);
378 list_add(&e->w.list, &mdev->read_ee); 408 list_add(&e->w.list, &mdev->read_ee);
379 spin_unlock_irq(&mdev->req_lock); 409 spin_unlock_irq(&mdev->req_lock);
380 410
381 e->private_bio->bi_end_io = drbd_endio_read_sec;
382 e->private_bio->bi_rw = READ;
383 e->w.cb = w_e_send_csum; 411 e->w.cb = w_e_send_csum;
412 if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
413 return 1;
384 414
385 mdev->read_cnt += size >> 9; 415 drbd_free_ee(mdev, e);
386 drbd_generic_make_request(mdev, DRBD_FAULT_RS_RD, e->private_bio); 416fail:
387 417 put_ldev(mdev);
388 return 1; 418 return 2;
389} 419}
390 420
391void resync_timer_fn(unsigned long data) 421void resync_timer_fn(unsigned long data)
@@ -414,13 +444,25 @@ void resync_timer_fn(unsigned long data)
414 drbd_queue_work(&mdev->data.work, &mdev->resync_work); 444 drbd_queue_work(&mdev->data.work, &mdev->resync_work);
415} 445}
416 446
447static int calc_resync_rate(struct drbd_conf *mdev)
448{
449 int d = mdev->data_delay / 1000; /* us -> ms */
450 int td = mdev->sync_conf.throttle_th * 100; /* 0.1s -> ms */
451 int hd = mdev->sync_conf.hold_off_th * 100; /* 0.1s -> ms */
452 int cr = mdev->sync_conf.rate;
453
454 return d <= td ? cr :
455 d >= hd ? 0 :
456 cr + (cr * (td - d) / (hd - td));
457}
458
417int w_make_resync_request(struct drbd_conf *mdev, 459int w_make_resync_request(struct drbd_conf *mdev,
418 struct drbd_work *w, int cancel) 460 struct drbd_work *w, int cancel)
419{ 461{
420 unsigned long bit; 462 unsigned long bit;
421 sector_t sector; 463 sector_t sector;
422 const sector_t capacity = drbd_get_capacity(mdev->this_bdev); 464 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
423 int max_segment_size = queue_max_segment_size(mdev->rq_queue); 465 int max_segment_size;
424 int number, i, size, pe, mx; 466 int number, i, size, pe, mx;
425 int align, queued, sndbuf; 467 int align, queued, sndbuf;
426 468
@@ -446,7 +488,13 @@ int w_make_resync_request(struct drbd_conf *mdev,
446 return 1; 488 return 1;
447 } 489 }
448 490
449 number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ); 491 /* starting with drbd 8.3.8, we can handle multi-bio EEs,
492 * if it should be necessary */
493 max_segment_size = mdev->agreed_pro_version < 94 ?
494 queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE;
495
496 mdev->c_sync_rate = calc_resync_rate(mdev);
497 number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
450 pe = atomic_read(&mdev->rs_pending_cnt); 498 pe = atomic_read(&mdev->rs_pending_cnt);
451 499
452 mutex_lock(&mdev->data.mutex); 500 mutex_lock(&mdev->data.mutex);
@@ -509,12 +557,6 @@ next_sector:
509 * 557 *
510 * Additionally always align bigger requests, in order to 558 * Additionally always align bigger requests, in order to
511 * be prepared for all stripe sizes of software RAIDs. 559 * be prepared for all stripe sizes of software RAIDs.
512 *
513 * we _do_ care about the agreed-upon q->max_segment_size
514 * here, as splitting up the requests on the other side is more
515 * difficult. the consequence is, that on lvm and md and other
516 * "indirect" devices, this is dead code, since
517 * q->max_segment_size will be PAGE_SIZE.
518 */ 560 */
519 align = 1; 561 align = 1;
520 for (;;) { 562 for (;;) {
@@ -806,7 +848,7 @@ out:
806/* helper */ 848/* helper */
807static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e) 849static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
808{ 850{
809 if (drbd_bio_has_active_page(e->private_bio)) { 851 if (drbd_ee_has_active_page(e)) {
810 /* This might happen if sendpage() has not finished */ 852 /* This might happen if sendpage() has not finished */
811 spin_lock_irq(&mdev->req_lock); 853 spin_lock_irq(&mdev->req_lock);
812 list_add_tail(&e->w.list, &mdev->net_ee); 854 list_add_tail(&e->w.list, &mdev->net_ee);
@@ -832,7 +874,7 @@ int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
832 return 1; 874 return 1;
833 } 875 }
834 876
835 if (likely(drbd_bio_uptodate(e->private_bio))) { 877 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
836 ok = drbd_send_block(mdev, P_DATA_REPLY, e); 878 ok = drbd_send_block(mdev, P_DATA_REPLY, e);
837 } else { 879 } else {
838 if (__ratelimit(&drbd_ratelimit_state)) 880 if (__ratelimit(&drbd_ratelimit_state))
@@ -873,7 +915,7 @@ int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
873 put_ldev(mdev); 915 put_ldev(mdev);
874 } 916 }
875 917
876 if (likely(drbd_bio_uptodate(e->private_bio))) { 918 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
877 if (likely(mdev->state.pdsk >= D_INCONSISTENT)) { 919 if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
878 inc_rs_pending(mdev); 920 inc_rs_pending(mdev);
879 ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); 921 ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
@@ -921,7 +963,7 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
921 963
922 di = (struct digest_info *)(unsigned long)e->block_id; 964 di = (struct digest_info *)(unsigned long)e->block_id;
923 965
924 if (likely(drbd_bio_uptodate(e->private_bio))) { 966 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
925 /* quick hack to try to avoid a race against reconfiguration. 967 /* quick hack to try to avoid a race against reconfiguration.
926 * a real fix would be much more involved, 968 * a real fix would be much more involved,
927 * introducing more locking mechanisms */ 969 * introducing more locking mechanisms */
@@ -931,7 +973,7 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
931 digest = kmalloc(digest_size, GFP_NOIO); 973 digest = kmalloc(digest_size, GFP_NOIO);
932 } 974 }
933 if (digest) { 975 if (digest) {
934 drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest); 976 drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
935 eq = !memcmp(digest, di->digest, digest_size); 977 eq = !memcmp(digest, di->digest, digest_size);
936 kfree(digest); 978 kfree(digest);
937 } 979 }
@@ -973,14 +1015,14 @@ int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
973 if (unlikely(cancel)) 1015 if (unlikely(cancel))
974 goto out; 1016 goto out;
975 1017
976 if (unlikely(!drbd_bio_uptodate(e->private_bio))) 1018 if (unlikely((e->flags & EE_WAS_ERROR) != 0))
977 goto out; 1019 goto out;
978 1020
979 digest_size = crypto_hash_digestsize(mdev->verify_tfm); 1021 digest_size = crypto_hash_digestsize(mdev->verify_tfm);
980 /* FIXME if this allocation fails, online verify will not terminate! */ 1022 /* FIXME if this allocation fails, online verify will not terminate! */
981 digest = kmalloc(digest_size, GFP_NOIO); 1023 digest = kmalloc(digest_size, GFP_NOIO);
982 if (digest) { 1024 if (digest) {
983 drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest); 1025 drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
984 inc_rs_pending(mdev); 1026 inc_rs_pending(mdev);
985 ok = drbd_send_drequest_csum(mdev, e->sector, e->size, 1027 ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
986 digest, digest_size, P_OV_REPLY); 1028 digest, digest_size, P_OV_REPLY);
@@ -1029,11 +1071,11 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1029 1071
1030 di = (struct digest_info *)(unsigned long)e->block_id; 1072 di = (struct digest_info *)(unsigned long)e->block_id;
1031 1073
1032 if (likely(drbd_bio_uptodate(e->private_bio))) { 1074 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
1033 digest_size = crypto_hash_digestsize(mdev->verify_tfm); 1075 digest_size = crypto_hash_digestsize(mdev->verify_tfm);
1034 digest = kmalloc(digest_size, GFP_NOIO); 1076 digest = kmalloc(digest_size, GFP_NOIO);
1035 if (digest) { 1077 if (digest) {
1036 drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest); 1078 drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
1037 1079
1038 D_ASSERT(digest_size == di->digest_size); 1080 D_ASSERT(digest_size == di->digest_size);
1039 eq = !memcmp(digest, di->digest, digest_size); 1081 eq = !memcmp(digest, di->digest, digest_size);
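
The new calc_resync_rate() keeps the configured rate while the measured data_delay is at or below the throttle threshold, drops to zero at or above the hold-off threshold, and interpolates linearly in between. A standalone copy of the formula with worked numbers (units as in the hunk above: milliseconds for the delay and both thresholds):

	#include <stdio.h>

	static int calc_resync_rate(int d, int td, int hd, int cr)
	{
		return d <= td ? cr :
		       d >= hd ? 0  :
		       cr + (cr * (td - d) / (hd - td));
	}

	int main(void)
	{
		/* throttle at 2 s, hold off at 10 s, configured rate 100 */
		printf("%d\n", calc_resync_rate(1000,  2000, 10000, 100)); /* 100 */
		printf("%d\n", calc_resync_rate(6000,  2000, 10000, 100)); /*  50 */
		printf("%d\n", calc_resync_rate(12000, 2000, 10000, 100)); /*   0 */
		return 0;
	}
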
diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h
index f93fa111ce50..defdb5013ea3 100644
--- a/drivers/block/drbd/drbd_wrappers.h
+++ b/drivers/block/drbd/drbd_wrappers.h
@@ -18,23 +18,9 @@ static inline void drbd_set_my_capacity(struct drbd_conf *mdev,
18 18
19#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE) 19#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE)
20 20
21static inline int drbd_bio_has_active_page(struct bio *bio)
22{
23 struct bio_vec *bvec;
24 int i;
25
26 __bio_for_each_segment(bvec, bio, i, 0) {
27 if (page_count(bvec->bv_page) > 1)
28 return 1;
29 }
30
31 return 0;
32}
33
34/* bi_end_io handlers */ 21/* bi_end_io handlers */
35extern void drbd_md_io_complete(struct bio *bio, int error); 22extern void drbd_md_io_complete(struct bio *bio, int error);
36extern void drbd_endio_read_sec(struct bio *bio, int error); 23extern void drbd_endio_sec(struct bio *bio, int error);
37extern void drbd_endio_write_sec(struct bio *bio, int error);
38extern void drbd_endio_pri(struct bio *bio, int error); 24extern void drbd_endio_pri(struct bio *bio, int error);
39 25
40/* 26/*
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 3b128dce9c3a..33d65039cce9 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -407,32 +407,24 @@ static int ide_disk_get_capacity(ide_drive_t *drive)
407 return 0; 407 return 0;
408} 408}
409 409
410static u64 ide_disk_set_capacity(ide_drive_t *drive, u64 capacity) 410static void ide_disk_unlock_native_capacity(ide_drive_t *drive)
411{ 411{
412 u64 set = min(capacity, drive->probed_capacity);
413 u16 *id = drive->id; 412 u16 *id = drive->id;
414 int lba48 = ata_id_lba48_enabled(id); 413 int lba48 = ata_id_lba48_enabled(id);
415 414
416 if ((drive->dev_flags & IDE_DFLAG_LBA) == 0 || 415 if ((drive->dev_flags & IDE_DFLAG_LBA) == 0 ||
417 ata_id_hpa_enabled(id) == 0) 416 ata_id_hpa_enabled(id) == 0)
418 goto out; 417 return;
419 418
420 /* 419 /*
421 * according to the spec the SET MAX ADDRESS command shall be 420 * according to the spec the SET MAX ADDRESS command shall be
422 * immediately preceded by a READ NATIVE MAX ADDRESS command 421 * immediately preceded by a READ NATIVE MAX ADDRESS command
423 */ 422 */
424 capacity = ide_disk_hpa_get_native_capacity(drive, lba48); 423 if (!ide_disk_hpa_get_native_capacity(drive, lba48))
425 if (capacity == 0) 424 return;
426 goto out; 425
427 426 if (ide_disk_hpa_set_capacity(drive, drive->probed_capacity, lba48))
428 set = ide_disk_hpa_set_capacity(drive, set, lba48); 427 drive->dev_flags |= IDE_DFLAG_NOHPA; /* disable HPA on resume */
429 if (set) {
430 /* needed for ->resume to disable HPA */
431 drive->dev_flags |= IDE_DFLAG_NOHPA;
432 return set;
433 }
434out:
435 return drive->capacity64;
436} 428}
437 429
438static void idedisk_prepare_flush(struct request_queue *q, struct request *rq) 430static void idedisk_prepare_flush(struct request_queue *q, struct request *rq)
@@ -783,13 +775,13 @@ static int ide_disk_set_doorlock(ide_drive_t *drive, struct gendisk *disk,
783} 775}
784 776
785const struct ide_disk_ops ide_ata_disk_ops = { 777const struct ide_disk_ops ide_ata_disk_ops = {
786 .check = ide_disk_check, 778 .check = ide_disk_check,
787 .set_capacity = ide_disk_set_capacity, 779 .unlock_native_capacity = ide_disk_unlock_native_capacity,
788 .get_capacity = ide_disk_get_capacity, 780 .get_capacity = ide_disk_get_capacity,
789 .setup = ide_disk_setup, 781 .setup = ide_disk_setup,
790 .flush = ide_disk_flush, 782 .flush = ide_disk_flush,
791 .init_media = ide_disk_init_media, 783 .init_media = ide_disk_init_media,
792 .set_doorlock = ide_disk_set_doorlock, 784 .set_doorlock = ide_disk_set_doorlock,
793 .do_request = ide_do_rw_disk, 785 .do_request = ide_do_rw_disk,
794 .ioctl = ide_disk_ioctl, 786 .ioctl = ide_disk_ioctl,
795}; 787};
diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c
index c32d83996ae1..c102d23d9b38 100644
--- a/drivers/ide/ide-gd.c
+++ b/drivers/ide/ide-gd.c
@@ -288,17 +288,14 @@ static int ide_gd_media_changed(struct gendisk *disk)
288 return ret; 288 return ret;
289} 289}
290 290
291static unsigned long long ide_gd_set_capacity(struct gendisk *disk, 291static void ide_gd_unlock_native_capacity(struct gendisk *disk)
292 unsigned long long capacity)
293{ 292{
294 struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj); 293 struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj);
295 ide_drive_t *drive = idkp->drive; 294 ide_drive_t *drive = idkp->drive;
296 const struct ide_disk_ops *disk_ops = drive->disk_ops; 295 const struct ide_disk_ops *disk_ops = drive->disk_ops;
297 296
298 if (disk_ops->set_capacity) 297 if (disk_ops->unlock_native_capacity)
299 return disk_ops->set_capacity(drive, capacity); 298 disk_ops->unlock_native_capacity(drive);
300
301 return drive->capacity64;
302} 299}
303 300
304static int ide_gd_revalidate_disk(struct gendisk *disk) 301static int ide_gd_revalidate_disk(struct gendisk *disk)
@@ -329,7 +326,7 @@ static const struct block_device_operations ide_gd_ops = {
329 .locked_ioctl = ide_gd_ioctl, 326 .locked_ioctl = ide_gd_ioctl,
330 .getgeo = ide_gd_getgeo, 327 .getgeo = ide_gd_getgeo,
331 .media_changed = ide_gd_media_changed, 328 .media_changed = ide_gd_media_changed,
332 .set_capacity = ide_gd_set_capacity, 329 .unlock_native_capacity = ide_gd_unlock_native_capacity,
333 .revalidate_disk = ide_gd_revalidate_disk 330 .revalidate_disk = ide_gd_revalidate_disk
334}; 331};
335 332
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6dcee88c2e5d..55dcb7884f4d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -417,7 +417,7 @@ int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
417 */ 417 */
418 mutex_unlock(&bd_inode->i_mutex); 418 mutex_unlock(&bd_inode->i_mutex);
419 419
420 error = blkdev_issue_flush(bdev, NULL); 420 error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
421 if (error == -EOPNOTSUPP) 421 if (error == -EOPNOTSUPP)
422 error = 0; 422 error = 0;
423 423
@@ -668,41 +668,209 @@ void bd_forget(struct inode *inode)
668 iput(bdev->bd_inode); 668 iput(bdev->bd_inode);
669} 669}
670 670
671int bd_claim(struct block_device *bdev, void *holder) 671/**
672 * bd_may_claim - test whether a block device can be claimed
673 * @bdev: block device of interest
674 * @whole: whole block device containing @bdev, may equal @bdev
675 * @holder: holder trying to claim @bdev
676 *
677 * Test whether @bdev can be claimed by @holder.
678 *
679 * CONTEXT:
680 * spin_lock(&bdev_lock).
681 *
682 * RETURNS:
683 * %true if @bdev can be claimed, %false otherwise.
684 */
685static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
686 void *holder)
672{ 687{
673 int res;
674 spin_lock(&bdev_lock);
675
676 /* first decide result */
677 if (bdev->bd_holder == holder) 688 if (bdev->bd_holder == holder)
678 res = 0; /* already a holder */ 689 return true; /* already a holder */
679 else if (bdev->bd_holder != NULL) 690 else if (bdev->bd_holder != NULL)
680 res = -EBUSY; /* held by someone else */ 691 return false; /* held by someone else */
681 else if (bdev->bd_contains == bdev) 692 else if (bdev->bd_contains == bdev)
682 res = 0; /* is a whole device which isn't held */ 693 return true; /* is a whole device which isn't held */
683 694
684 else if (bdev->bd_contains->bd_holder == bd_claim) 695 else if (whole->bd_holder == bd_claim)
685 res = 0; /* is a partition of a device that is being partitioned */ 696 return true; /* is a partition of a device that is being partitioned */
686 else if (bdev->bd_contains->bd_holder != NULL) 697 else if (whole->bd_holder != NULL)
687 res = -EBUSY; /* is a partition of a held device */ 698 return false; /* is a partition of a held device */
688 else 699 else
689 res = 0; /* is a partition of an un-held device */ 700 return true; /* is a partition of an un-held device */
701}
702
703/**
704 * bd_prepare_to_claim - prepare to claim a block device
705 * @bdev: block device of interest
706 * @whole: the whole device containing @bdev, may equal @bdev
707 * @holder: holder trying to claim @bdev
708 *
709 * Prepare to claim @bdev. This function fails if @bdev is already
710 * claimed by another holder and waits if another claiming is in
711 * progress. This function doesn't actually claim. On successful
712 * return, the caller has ownership of bd_claiming and bd_holder[s].
713 *
714 * CONTEXT:
715 * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab
716 * it multiple times.
717 *
718 * RETURNS:
719 * 0 if @bdev can be claimed, -EBUSY otherwise.
720 */
721static int bd_prepare_to_claim(struct block_device *bdev,
722 struct block_device *whole, void *holder)
723{
724retry:
725 /* if someone else claimed, fail */
726 if (!bd_may_claim(bdev, whole, holder))
727 return -EBUSY;
728
729 /* if someone else is claiming, wait for it to finish */
730 if (whole->bd_claiming && whole->bd_claiming != holder) {
731 wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
732 DEFINE_WAIT(wait);
733
734 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
735 spin_unlock(&bdev_lock);
736 schedule();
737 finish_wait(wq, &wait);
738 spin_lock(&bdev_lock);
739 goto retry;
740 }
741
742 /* yay, all mine */
743 return 0;
744}
745
746/**
747 * bd_start_claiming - start claiming a block device
748 * @bdev: block device of interest
749 * @holder: holder trying to claim @bdev
750 *
751 * @bdev is about to be opened exclusively. Check @bdev can be opened
752 * exclusively and mark that an exclusive open is in progress. Each
753 * successful call to this function must be matched with a call to
754 * either bd_claim() or bd_abort_claiming(). If this function
755 * succeeds, the matching bd_claim() is guaranteed to succeed.
756 *
757 * CONTEXT:
758 * Might sleep.
759 *
760 * RETURNS:
761 * Pointer to the block device containing @bdev on success, ERR_PTR()
762 * value on failure.
763 */
764static struct block_device *bd_start_claiming(struct block_device *bdev,
765 void *holder)
766{
767 struct gendisk *disk;
768 struct block_device *whole;
769 int partno, err;
770
771 might_sleep();
772
773 /*
774 * @bdev might not have been initialized properly yet, look up
775 * and grab the outer block device the hard way.
776 */
777 disk = get_gendisk(bdev->bd_dev, &partno);
778 if (!disk)
779 return ERR_PTR(-ENXIO);
780
781 whole = bdget_disk(disk, 0);
782 put_disk(disk);
783 if (!whole)
784 return ERR_PTR(-ENOMEM);
785
786 /* prepare to claim, if successful, mark claiming in progress */
787 spin_lock(&bdev_lock);
788
789 err = bd_prepare_to_claim(bdev, whole, holder);
790 if (err == 0) {
791 whole->bd_claiming = holder;
792 spin_unlock(&bdev_lock);
793 return whole;
794 } else {
795 spin_unlock(&bdev_lock);
796 bdput(whole);
797 return ERR_PTR(err);
798 }
799}
690 800
691 /* now impose change */ 801/* releases bdev_lock */
692 if (res==0) { 802static void __bd_abort_claiming(struct block_device *whole, void *holder)
803{
804 BUG_ON(whole->bd_claiming != holder);
805 whole->bd_claiming = NULL;
806 wake_up_bit(&whole->bd_claiming, 0);
807
808 spin_unlock(&bdev_lock);
809 bdput(whole);
810}
811
812/**
813 * bd_abort_claiming - abort claiming a block device
814 * @whole: whole block device returned by bd_start_claiming()
815 * @holder: holder trying to claim @bdev
816 *
817 * Abort the claiming started by bd_start_claiming(). Note that
818 * @whole is not the block device to be claimed but the whole device
819 * returned by bd_start_claiming().
820 *
821 * CONTEXT:
822 * Grabs and releases bdev_lock.
823 */
824static void bd_abort_claiming(struct block_device *whole, void *holder)
825{
826 spin_lock(&bdev_lock);
827 __bd_abort_claiming(whole, holder); /* releases bdev_lock */
828}
829
830/**
831 * bd_claim - claim a block device
832 * @bdev: block device to claim
833 * @holder: holder trying to claim @bdev
834 *
835 * Try to claim @bdev which must have been opened successfully. This
836 * function may be called with or without preceding
837 * bd_start_claiming(). In the former case, this function is always
838 * successful and terminates the claiming block.
839 *
840 * CONTEXT:
841 * Might sleep.
842 *
843 * RETURNS:
844 * 0 if successful, -EBUSY if @bdev is already claimed.
845 */
846int bd_claim(struct block_device *bdev, void *holder)
847{
848 struct block_device *whole = bdev->bd_contains;
849 int res;
850
851 might_sleep();
852
853 spin_lock(&bdev_lock);
854
855 res = bd_prepare_to_claim(bdev, whole, holder);
856 if (res == 0) {
693 /* note that for a whole device bd_holders 857 /* note that for a whole device bd_holders
694 * will be incremented twice, and bd_holder will 858 * will be incremented twice, and bd_holder will
695 * be set to bd_claim before being set to holder 859 * be set to bd_claim before being set to holder
696 */ 860 */
697 bdev->bd_contains->bd_holders ++; 861 whole->bd_holders++;
698 bdev->bd_contains->bd_holder = bd_claim; 862 whole->bd_holder = bd_claim;
699 bdev->bd_holders++; 863 bdev->bd_holders++;
700 bdev->bd_holder = holder; 864 bdev->bd_holder = holder;
701 } 865 }
702 spin_unlock(&bdev_lock); 866
867 if (whole->bd_claiming)
868 __bd_abort_claiming(whole, holder); /* releases bdev_lock */
869 else
870 spin_unlock(&bdev_lock);
871
703 return res; 872 return res;
704} 873}
705
706EXPORT_SYMBOL(bd_claim); 874EXPORT_SYMBOL(bd_claim);
707 875
708void bd_release(struct block_device *bdev) 876void bd_release(struct block_device *bdev)
@@ -1316,6 +1484,7 @@ EXPORT_SYMBOL(blkdev_get);
1316 1484
1317static int blkdev_open(struct inode * inode, struct file * filp) 1485static int blkdev_open(struct inode * inode, struct file * filp)
1318{ 1486{
1487 struct block_device *whole = NULL;
1319 struct block_device *bdev; 1488 struct block_device *bdev;
1320 int res; 1489 int res;
1321 1490
@@ -1338,22 +1507,25 @@ static int blkdev_open(struct inode * inode, struct file * filp)
1338 if (bdev == NULL) 1507 if (bdev == NULL)
1339 return -ENOMEM; 1508 return -ENOMEM;
1340 1509
1510 if (filp->f_mode & FMODE_EXCL) {
1511 whole = bd_start_claiming(bdev, filp);
1512 if (IS_ERR(whole)) {
1513 bdput(bdev);
1514 return PTR_ERR(whole);
1515 }
1516 }
1517
1341 filp->f_mapping = bdev->bd_inode->i_mapping; 1518 filp->f_mapping = bdev->bd_inode->i_mapping;
1342 1519
1343 res = blkdev_get(bdev, filp->f_mode); 1520 res = blkdev_get(bdev, filp->f_mode);
1344 if (res)
1345 return res;
1346 1521
1347 if (filp->f_mode & FMODE_EXCL) { 1522 if (whole) {
1348 res = bd_claim(bdev, filp); 1523 if (res == 0)
1349 if (res) 1524 BUG_ON(bd_claim(bdev, filp) != 0);
1350 goto out_blkdev_put; 1525 else
1526 bd_abort_claiming(whole, filp);
1351 } 1527 }
1352 1528
1353 return 0;
1354
1355 out_blkdev_put:
1356 blkdev_put(bdev, filp->f_mode);
1357 return res; 1529 return res;
1358} 1530}
1359 1531
@@ -1564,27 +1736,34 @@ EXPORT_SYMBOL(lookup_bdev);
1564 */ 1736 */
1565struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) 1737struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
1566{ 1738{
1567 struct block_device *bdev; 1739 struct block_device *bdev, *whole;
1568 int error = 0; 1740 int error;
1569 1741
1570 bdev = lookup_bdev(path); 1742 bdev = lookup_bdev(path);
1571 if (IS_ERR(bdev)) 1743 if (IS_ERR(bdev))
1572 return bdev; 1744 return bdev;
1573 1745
1746 whole = bd_start_claiming(bdev, holder);
1747 if (IS_ERR(whole)) {
1748 bdput(bdev);
1749 return whole;
1750 }
1751
1574 error = blkdev_get(bdev, mode); 1752 error = blkdev_get(bdev, mode);
1575 if (error) 1753 if (error)
1576 return ERR_PTR(error); 1754 goto out_abort_claiming;
1755
1577 error = -EACCES; 1756 error = -EACCES;
1578 if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) 1757 if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
1579 goto blkdev_put; 1758 goto out_blkdev_put;
1580 error = bd_claim(bdev, holder);
1581 if (error)
1582 goto blkdev_put;
1583 1759
1760 BUG_ON(bd_claim(bdev, holder) != 0);
1584 return bdev; 1761 return bdev;
1585 1762
1586blkdev_put: 1763out_blkdev_put:
1587 blkdev_put(bdev, mode); 1764 blkdev_put(bdev, mode);
1765out_abort_claiming:
1766 bd_abort_claiming(whole, holder);
1588 return ERR_PTR(error); 1767 return ERR_PTR(error);
1589} 1768}
1590 1769
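
The block_dev.c rework above splits an exclusive open into three steps: bd_start_claiming() marks a claim in progress (failing fast if the device is already held, sleeping on a bit waitqueue if another claim is pending), blkdev_get() opens the device, and then either bd_claim() completes the claim or bd_abort_claiming() rolls it back and wakes other claimers. A simplified userspace model of that handshake, with pthread primitives standing in for bdev_lock and the bit waitqueue, and the whole-device/partition distinction omitted:

	#include <stdio.h>
	#include <pthread.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t  wq   = PTHREAD_COND_INITIALIZER;
	static void *holder;			/* models bd_holder   */
	static void *claiming;			/* models bd_claiming */

	static int prepare_to_claim(void *who)
	{
		pthread_mutex_lock(&lock);
		for (;;) {
			if (holder && holder != who) {	/* already claimed */
				pthread_mutex_unlock(&lock);
				return -1;		/* -EBUSY */
			}
			if (!claiming || claiming == who)
				break;			/* our turn */
			pthread_cond_wait(&wq, &lock);	/* another claim pending */
		}
		claiming = who;				/* claim now in progress */
		pthread_mutex_unlock(&lock);
		return 0;
	}

	static void finish_claim(void *who, int abort)
	{
		pthread_mutex_lock(&lock);
		if (!abort)
			holder = who;			/* bd_claim() path */
		claiming = NULL;			/* end of claiming block */
		pthread_cond_broadcast(&wq);		/* wake other claimers */
		pthread_mutex_unlock(&lock);
	}

	int main(void)
	{
		int token;

		if (prepare_to_claim(&token) == 0)	/* bd_start_claiming() */
			finish_claim(&token, 0);	/* ...after a successful open */
		printf("claimed: %s\n", holder == &token ? "yes" : "no");
		return 0;
	}
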
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b34d32fdaaec..c6a4f459ad76 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1589,7 +1589,7 @@ static void btrfs_issue_discard(struct block_device *bdev,
1589 u64 start, u64 len) 1589 u64 start, u64 len)
1590{ 1590{
1591 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 1591 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
1592 DISCARD_FL_BARRIER); 1592 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
1593} 1593}
1594 1594
1595static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1595static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
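
The btrfs hunk only swaps DISCARD_FL_BARRIER for the new BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER flags; as before, the discard range is passed in 512-byte sectors, hence the >> 9 on both start and length. A standalone check of that conversion:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long start = 1048576, len = 65536;	/* bytes */

		printf("sector %llu, nr_sects %llu\n", start >> 9, len >> 9);
		/* prints: sector 2048, nr_sects 128 */
		return 0;
	}
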
diff --git a/fs/buffer.c b/fs/buffer.c
index c9c266db0624..08e422d56996 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -275,6 +275,7 @@ void invalidate_bdev(struct block_device *bdev)
275 return; 275 return;
276 276
277 invalidate_bh_lrus(); 277 invalidate_bh_lrus();
278 lru_add_drain_all(); /* make sure all lru add caches are flushed */
278 invalidate_mapping_pages(mapping, 0, -1); 279 invalidate_mapping_pages(mapping, 0, -1);
279} 280}
280EXPORT_SYMBOL(invalidate_bdev); 281EXPORT_SYMBOL(invalidate_bdev);
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 26289e8f4163..fcf7487734b6 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -90,6 +90,7 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
90 * storage 90 * storage
91 */ 91 */
92 if (needs_barrier) 92 if (needs_barrier)
93 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 93 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
94 BLKDEV_IFL_WAIT);
94 return ret; 95 return ret;
95} 96}
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 0d0c3239c1cd..ef3d980e67cb 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -100,9 +100,11 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
100 if (ext4_should_writeback_data(inode) && 100 if (ext4_should_writeback_data(inode) &&
101 (journal->j_fs_dev != journal->j_dev) && 101 (journal->j_fs_dev != journal->j_dev) &&
102 (journal->j_flags & JBD2_BARRIER)) 102 (journal->j_flags & JBD2_BARRIER))
103 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 103 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
104 NULL, BLKDEV_IFL_WAIT);
104 jbd2_log_wait_commit(journal, commit_tid); 105 jbd2_log_wait_commit(journal, commit_tid);
105 } else if (journal->j_flags & JBD2_BARRIER) 106 } else if (journal->j_flags & JBD2_BARRIER)
106 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 107 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
108 BLKDEV_IFL_WAIT);
107 return ret; 109 return ret;
108} 110}
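
blkdev_issue_flush() now takes a gfp mask and a flags word; BLKDEV_IFL_WAIT preserves the synchronous behaviour of the old two-argument form. An illustrative (not complete) call-site conversion matching the ext3/ext4 hunks above:

	/* Illustrative fragment only, not a full function. */
	static int flush_dev(struct super_block *sb)
	{
		/* old form:  blkdev_issue_flush(sb->s_bdev, NULL); */
		return blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL,
					  BLKDEV_IFL_WAIT);
	}
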
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 0a140741b39e..f74d270ba155 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -14,6 +14,7 @@
14#include <linux/dnotify.h> 14#include <linux/dnotify.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/pipe_fs_i.h>
17#include <linux/security.h> 18#include <linux/security.h>
18#include <linux/ptrace.h> 19#include <linux/ptrace.h>
19#include <linux/signal.h> 20#include <linux/signal.h>
@@ -412,6 +413,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
412 case F_NOTIFY: 413 case F_NOTIFY:
413 err = fcntl_dirnotify(fd, filp, arg); 414 err = fcntl_dirnotify(fd, filp, arg);
414 break; 415 break;
416 case F_SETPIPE_SZ:
417 case F_GETPIPE_SZ:
418 err = pipe_fcntl(filp, cmd, arg);
419 break;
415 default: 420 default:
416 break; 421 break;
417 } 422 }
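
The fcntl.c hook routes two new commands to pipe_fcntl(), letting applications query and resize a pipe's buffer. A small userspace example; the command values follow this series' include/linux/fcntl.h and are defined locally in case the libc headers do not carry them yet:

	#include <stdio.h>
	#include <unistd.h>
	#include <fcntl.h>

	#ifndef F_SETPIPE_SZ
	#define F_SETPIPE_SZ	(1024 + 7)	/* F_LINUX_SPECIFIC_BASE + 7 */
	#define F_GETPIPE_SZ	(1024 + 8)
	#endif

	int main(void)
	{
		int fds[2];

		if (pipe(fds))
			return 1;
		printf("default pipe size: %d\n", fcntl(fds[0], F_GETPIPE_SZ));
		if (fcntl(fds[0], F_SETPIPE_SZ, 1 << 20) < 0)	/* ask for 1 MiB */
			perror("F_SETPIPE_SZ");
		printf("new pipe size:     %d\n", fcntl(fds[0], F_GETPIPE_SZ));
		return 0;
	}
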
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 4b37f7cea4dd..437a7431b4ea 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -45,6 +45,7 @@ struct wb_writeback_args {
45 int for_kupdate:1; 45 int for_kupdate:1;
46 int range_cyclic:1; 46 int range_cyclic:1;
47 int for_background:1; 47 int for_background:1;
48 int sb_pinned:1;
48}; 49};
49 50
50/* 51/*
@@ -192,7 +193,8 @@ static void bdi_wait_on_work_clear(struct bdi_work *work)
192} 193}
193 194
194static void bdi_alloc_queue_work(struct backing_dev_info *bdi, 195static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
195 struct wb_writeback_args *args) 196 struct wb_writeback_args *args,
197 int wait)
196{ 198{
197 struct bdi_work *work; 199 struct bdi_work *work;
198 200
@@ -204,6 +206,8 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
204 if (work) { 206 if (work) {
205 bdi_work_init(work, args); 207 bdi_work_init(work, args);
206 bdi_queue_work(bdi, work); 208 bdi_queue_work(bdi, work);
209 if (wait)
210 bdi_wait_on_work_clear(work);
207 } else { 211 } else {
208 struct bdi_writeback *wb = &bdi->wb; 212 struct bdi_writeback *wb = &bdi->wb;
209 213
@@ -230,6 +234,11 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
230 .sync_mode = WB_SYNC_ALL, 234 .sync_mode = WB_SYNC_ALL,
231 .nr_pages = LONG_MAX, 235 .nr_pages = LONG_MAX,
232 .range_cyclic = 0, 236 .range_cyclic = 0,
237 /*
238 * Setting sb_pinned is not necessary for WB_SYNC_ALL, but
239 * let's make it explicit.
240 */
241 .sb_pinned = 1,
233 }; 242 };
234 struct bdi_work work; 243 struct bdi_work work;
235 244
@@ -245,21 +254,23 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
245 * @bdi: the backing device to write from 254 * @bdi: the backing device to write from
246 * @sb: write inodes from this super_block 255 * @sb: write inodes from this super_block
247 * @nr_pages: the number of pages to write 256 * @nr_pages: the number of pages to write
257 * @sb_locked: caller already holds sb umount sem.
248 * 258 *
249 * Description: 259 * Description:
250 * This does WB_SYNC_NONE opportunistic writeback. The IO is only 260 * This does WB_SYNC_NONE opportunistic writeback. The IO is only
251 * started when this function returns, we make no guarantees on 261 started when this function returns, we make no guarantees on
252 * completion. Caller need not hold sb s_umount semaphore. 262 * completion. Caller specifies whether sb umount sem is held already or not.
253 * 263 *
254 */ 264 */
255void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, 265void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
256 long nr_pages) 266 long nr_pages, int sb_locked)
257{ 267{
258 struct wb_writeback_args args = { 268 struct wb_writeback_args args = {
259 .sb = sb, 269 .sb = sb,
260 .sync_mode = WB_SYNC_NONE, 270 .sync_mode = WB_SYNC_NONE,
261 .nr_pages = nr_pages, 271 .nr_pages = nr_pages,
262 .range_cyclic = 1, 272 .range_cyclic = 1,
273 .sb_pinned = sb_locked,
263 }; 274 };
264 275
265 /* 276 /*
@@ -271,7 +282,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
271 args.for_background = 1; 282 args.for_background = 1;
272 } 283 }
273 284
274 bdi_alloc_queue_work(bdi, &args); 285 bdi_alloc_queue_work(bdi, &args, sb_locked);
275} 286}
276 287
277/* 288/*
@@ -452,11 +463,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
452 463
453 BUG_ON(inode->i_state & I_SYNC); 464 BUG_ON(inode->i_state & I_SYNC);
454 465
455 /* Set I_SYNC, reset I_DIRTY */ 466 /* Set I_SYNC, reset I_DIRTY_PAGES */
456 dirty = inode->i_state & I_DIRTY;
457 inode->i_state |= I_SYNC; 467 inode->i_state |= I_SYNC;
458 inode->i_state &= ~I_DIRTY; 468 inode->i_state &= ~I_DIRTY_PAGES;
459
460 spin_unlock(&inode_lock); 469 spin_unlock(&inode_lock);
461 470
462 ret = do_writepages(mapping, wbc); 471 ret = do_writepages(mapping, wbc);
@@ -472,6 +481,15 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
472 ret = err; 481 ret = err;
473 } 482 }
474 483
484 /*
 485 * Some filesystems may redirty the inode during writeback due to
 486 * delalloc; clear the dirty metadata flags right before calling
 487 * write_inode().
488 */
489 spin_lock(&inode_lock);
490 dirty = inode->i_state & I_DIRTY;
491 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
492 spin_unlock(&inode_lock);
475 /* Don't write the inode if only I_DIRTY_PAGES was set */ 493 /* Don't write the inode if only I_DIRTY_PAGES was set */
476 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 494 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
477 int err = write_inode(inode, wbc); 495 int err = write_inode(inode, wbc);
@@ -577,7 +595,7 @@ static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
577 /* 595 /*
578 * Caller must already hold the ref for this 596 * Caller must already hold the ref for this
579 */ 597 */
580 if (wbc->sync_mode == WB_SYNC_ALL) { 598 if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) {
581 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 599 WARN_ON(!rwsem_is_locked(&sb->s_umount));
582 return SB_NOT_PINNED; 600 return SB_NOT_PINNED;
583 } 601 }
@@ -751,6 +769,7 @@ static long wb_writeback(struct bdi_writeback *wb,
751 .for_kupdate = args->for_kupdate, 769 .for_kupdate = args->for_kupdate,
752 .for_background = args->for_background, 770 .for_background = args->for_background,
753 .range_cyclic = args->range_cyclic, 771 .range_cyclic = args->range_cyclic,
772 .sb_pinned = args->sb_pinned,
754 }; 773 };
755 unsigned long oldest_jif; 774 unsigned long oldest_jif;
756 long wrote = 0; 775 long wrote = 0;
@@ -852,6 +871,12 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
852 unsigned long expired; 871 unsigned long expired;
853 long nr_pages; 872 long nr_pages;
854 873
874 /*
 875 * When set to zero, periodic writeback is disabled
876 */
877 if (!dirty_writeback_interval)
878 return 0;
879
855 expired = wb->last_old_flush + 880 expired = wb->last_old_flush +
856 msecs_to_jiffies(dirty_writeback_interval * 10); 881 msecs_to_jiffies(dirty_writeback_interval * 10);
857 if (time_before(jiffies, expired)) 882 if (time_before(jiffies, expired))
@@ -887,6 +912,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
887 912
888 while ((work = get_next_work_item(bdi, wb)) != NULL) { 913 while ((work = get_next_work_item(bdi, wb)) != NULL) {
889 struct wb_writeback_args args = work->args; 914 struct wb_writeback_args args = work->args;
915 int post_clear;
890 916
891 /* 917 /*
892 * Override sync mode, in case we must wait for completion 918 * Override sync mode, in case we must wait for completion
@@ -894,11 +920,13 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
894 if (force_wait) 920 if (force_wait)
895 work->args.sync_mode = args.sync_mode = WB_SYNC_ALL; 921 work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
896 922
 923 post_clear = args.sync_mode == WB_SYNC_ALL || args.sb_pinned;
924
897 /* 925 /*
898 * If this isn't a data integrity operation, just notify 926 * If this isn't a data integrity operation, just notify
899 * that we have seen this work and we are now starting it. 927 * that we have seen this work and we are now starting it.
900 */ 928 */
901 if (args.sync_mode == WB_SYNC_NONE) 929 if (!post_clear)
902 wb_clear_pending(wb, work); 930 wb_clear_pending(wb, work);
903 931
904 wrote += wb_writeback(wb, &args); 932 wrote += wb_writeback(wb, &args);
@@ -907,7 +935,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
907 * This is a data integrity writeback, so only do the 935 * This is a data integrity writeback, so only do the
908 * notification when we have completed the work. 936 * notification when we have completed the work.
909 */ 937 */
910 if (args.sync_mode == WB_SYNC_ALL) 938 if (post_clear)
911 wb_clear_pending(wb, work); 939 wb_clear_pending(wb, work);
912 } 940 }
913 941
@@ -947,8 +975,17 @@ int bdi_writeback_task(struct bdi_writeback *wb)
947 break; 975 break;
948 } 976 }
949 977
950 wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); 978 if (dirty_writeback_interval) {
951 schedule_timeout_interruptible(wait_jiffies); 979 wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
980 schedule_timeout_interruptible(wait_jiffies);
981 } else {
982 set_current_state(TASK_INTERRUPTIBLE);
983 if (list_empty_careful(&wb->bdi->work_list) &&
984 !kthread_should_stop())
985 schedule();
986 __set_current_state(TASK_RUNNING);
987 }
988
952 try_to_freeze(); 989 try_to_freeze();
953 } 990 }
954 991
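With dirty_writeback_interval treated as an on/off switch in the two hunks above, writing 0 to the existing vm.dirty_writeback_centisecs sysctl now disables periodic (kupdate-style) writeback entirely; the flusher thread simply sleeps until explicit work is queued. A small user-space sketch of toggling it — the proc path is the existing sysctl, the helper name is illustrative:

#include <stdio.h>

/* write a new interval (in centiseconds); 0 disables periodic writeback */
static int set_dirty_writeback_centisecs(long centisecs)
{
        FILE *f = fopen("/proc/sys/vm/dirty_writeback_centisecs", "w");

        if (!f)
                return -1;
        fprintf(f, "%ld\n", centisecs);
        return fclose(f);
}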
@@ -974,7 +1011,7 @@ static void bdi_writeback_all(struct super_block *sb, long nr_pages)
974 if (!bdi_has_dirty_io(bdi)) 1011 if (!bdi_has_dirty_io(bdi))
975 continue; 1012 continue;
976 1013
977 bdi_alloc_queue_work(bdi, &args); 1014 bdi_alloc_queue_work(bdi, &args, 0);
978 } 1015 }
979 1016
980 rcu_read_unlock(); 1017 rcu_read_unlock();
@@ -1183,6 +1220,18 @@ static void wait_sb_inodes(struct super_block *sb)
1183 iput(old_inode); 1220 iput(old_inode);
1184} 1221}
1185 1222
1223static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
1224{
1225 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
1226 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
1227 long nr_to_write;
1228
1229 nr_to_write = nr_dirty + nr_unstable +
1230 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
1231
1232 bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked);
1233}
1234
1186/** 1235/**
1187 * writeback_inodes_sb - writeback dirty inodes from given super_block 1236 * writeback_inodes_sb - writeback dirty inodes from given super_block
1188 * @sb: the superblock 1237 * @sb: the superblock
@@ -1194,18 +1243,23 @@ static void wait_sb_inodes(struct super_block *sb)
1194 */ 1243 */
1195void writeback_inodes_sb(struct super_block *sb) 1244void writeback_inodes_sb(struct super_block *sb)
1196{ 1245{
1197 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); 1246 __writeback_inodes_sb(sb, 0);
1198 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
1199 long nr_to_write;
1200
1201 nr_to_write = nr_dirty + nr_unstable +
1202 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
1203
1204 bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
1205} 1247}
1206EXPORT_SYMBOL(writeback_inodes_sb); 1248EXPORT_SYMBOL(writeback_inodes_sb);
1207 1249
1208/** 1250/**
1251 * writeback_inodes_sb_locked - writeback dirty inodes from given super_block
1252 * @sb: the superblock
1253 *
1254 * Like writeback_inodes_sb(), except the caller already holds the
1255 * sb umount sem.
1256 */
1257void writeback_inodes_sb_locked(struct super_block *sb)
1258{
1259 __writeback_inodes_sb(sb, 1);
1260}
1261
1262/**
1209 * writeback_inodes_sb_if_idle - start writeback if none underway 1263 * writeback_inodes_sb_if_idle - start writeback if none underway
1210 * @sb: the superblock 1264 * @sb: the superblock
1211 * 1265 *
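A hedged sketch of the calling convention the new export is meant for: a filesystem path that already sits under sb->s_umount calls the _locked variant so pin_sb_for_writeback() does not try to take the semaphore again. The surrounding function is illustrative; only writeback_inodes_sb_locked() comes from this patch.

#include <linux/fs.h>
#include <linux/writeback.h>

/* illustrative caller; assumed to run with sb->s_umount already held */
static void example_flush_dirty_data(struct super_block *sb)
{
        /*
         * Kick opportunistic WB_SYNC_NONE writeback for this sb without
         * re-acquiring the umount semaphore we already hold.
         */
        writeback_inodes_sb_locked(sb);
}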
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 8bce73ed4d8e..117fa4171f62 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -854,7 +854,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
854 if ((start + nr_sects) != blk) { 854 if ((start + nr_sects) != blk) {
855 rv = blkdev_issue_discard(bdev, start, 855 rv = blkdev_issue_discard(bdev, start,
856 nr_sects, GFP_NOFS, 856 nr_sects, GFP_NOFS,
857 DISCARD_FL_BARRIER); 857 BLKDEV_IFL_WAIT |
858 BLKDEV_IFL_BARRIER);
858 if (rv) 859 if (rv)
859 goto fail; 860 goto fail;
860 nr_sects = 0; 861 nr_sects = 0;
@@ -869,7 +870,7 @@ start_new_extent:
869 } 870 }
870 if (nr_sects) { 871 if (nr_sects) {
871 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 872 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
872 DISCARD_FL_BARRIER); 873 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
873 if (rv) 874 if (rv)
874 goto fail; 875 goto fail;
875 } 876 }
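The gfs2 conversion above shows the new blkdev_issue_discard() convention: callers now pass a gfp mask plus BLKDEV_IFL_* flags instead of DISCARD_FL_BARRIER. A minimal sketch of a caller; everything except the blkdev_issue_discard() call itself is illustrative:

#include <linux/blkdev.h>

/* discard a sector range, wait for completion, and issue a barrier */
static int example_trim_range(struct block_device *bdev,
                              sector_t start, sector_t nr_sects)
{
        return blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
                                    BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
}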
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 30beb11ef928..076d1cc44f95 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -530,7 +530,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
530 */ 530 */
531 if ((journal->j_fs_dev != journal->j_dev) && 531 if ((journal->j_fs_dev != journal->j_dev) &&
532 (journal->j_flags & JBD2_BARRIER)) 532 (journal->j_flags & JBD2_BARRIER))
533 blkdev_issue_flush(journal->j_fs_dev, NULL); 533 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
534 BLKDEV_IFL_WAIT);
534 if (!(journal->j_flags & JBD2_ABORT)) 535 if (!(journal->j_flags & JBD2_ABORT))
535 jbd2_journal_update_superblock(journal, 1); 536 jbd2_journal_update_superblock(journal, 1);
536 return 0; 537 return 0;
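blkdev_issue_flush() grows a gfp mask and a flags argument in the same series; callers that want the old synchronous behaviour pass BLKDEV_IFL_WAIT, as the jbd2 sites now do. A hedged sketch of the updated call, with the wrapper name being illustrative:

#include <linux/blkdev.h>

/* flush the device's write cache and wait for the flush to complete */
static int example_flush_device(struct block_device *bdev)
{
        return blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
}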
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 671da7fb7ffd..75716d3d2be0 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -717,7 +717,8 @@ start_journal_io:
717 if (commit_transaction->t_flushed_data_blocks && 717 if (commit_transaction->t_flushed_data_blocks &&
718 (journal->j_fs_dev != journal->j_dev) && 718 (journal->j_fs_dev != journal->j_dev) &&
719 (journal->j_flags & JBD2_BARRIER)) 719 (journal->j_flags & JBD2_BARRIER))
720 blkdev_issue_flush(journal->j_fs_dev, NULL); 720 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
721 BLKDEV_IFL_WAIT);
721 722
722 /* Done it all: now write the commit record asynchronously. */ 723 /* Done it all: now write the commit record asynchronously. */
723 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 724 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -727,7 +728,8 @@ start_journal_io:
727 if (err) 728 if (err)
728 __jbd2_journal_abort_hard(journal); 729 __jbd2_journal_abort_hard(journal);
729 if (journal->j_flags & JBD2_BARRIER) 730 if (journal->j_flags & JBD2_BARRIER)
730 blkdev_issue_flush(journal->j_dev, NULL); 731 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
732 BLKDEV_IFL_WAIT);
731 } 733 }
732 734
733 err = journal_finish_inode_data_buffers(journal, commit_transaction); 735 err = journal_finish_inode_data_buffers(journal, commit_transaction);
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index a756168a21c2..8c1097327abc 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -674,7 +674,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
674 start * sects_per_block, 674 start * sects_per_block,
675 nblocks * sects_per_block, 675 nblocks * sects_per_block,
676 GFP_NOFS, 676 GFP_NOFS,
677 DISCARD_FL_BARRIER); 677 BLKDEV_IFL_BARRIER);
678 if (ret < 0) 678 if (ret < 0)
679 return ret; 679 return ret;
680 nblocks = 0; 680 nblocks = 0;
@@ -684,7 +684,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
684 ret = blkdev_issue_discard(nilfs->ns_bdev, 684 ret = blkdev_issue_discard(nilfs->ns_bdev,
685 start * sects_per_block, 685 start * sects_per_block,
686 nblocks * sects_per_block, 686 nblocks * sects_per_block,
687 GFP_NOFS, DISCARD_FL_BARRIER); 687 GFP_NOFS, BLKDEV_IFL_BARRIER);
688 return ret; 688 return ret;
689} 689}
690 690
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index a97b477ac0fc..6921e7890be6 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -70,14 +70,14 @@ struct riscix_record {
70 70
71#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ 71#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
72 defined(CONFIG_ACORN_PARTITION_ADFS) 72 defined(CONFIG_ACORN_PARTITION_ADFS)
73static int 73static int riscix_partition(struct parsed_partitions *state,
74riscix_partition(struct parsed_partitions *state, struct block_device *bdev, 74 unsigned long first_sect, int slot,
75 unsigned long first_sect, int slot, unsigned long nr_sects) 75 unsigned long nr_sects)
76{ 76{
77 Sector sect; 77 Sector sect;
78 struct riscix_record *rr; 78 struct riscix_record *rr;
79 79
80 rr = (struct riscix_record *)read_dev_sector(bdev, first_sect, &sect); 80 rr = read_part_sector(state, first_sect, &sect);
81 if (!rr) 81 if (!rr)
82 return -1; 82 return -1;
83 83
@@ -123,9 +123,9 @@ struct linux_part {
123 123
124#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ 124#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
125 defined(CONFIG_ACORN_PARTITION_ADFS) 125 defined(CONFIG_ACORN_PARTITION_ADFS)
126static int 126static int linux_partition(struct parsed_partitions *state,
127linux_partition(struct parsed_partitions *state, struct block_device *bdev, 127 unsigned long first_sect, int slot,
128 unsigned long first_sect, int slot, unsigned long nr_sects) 128 unsigned long nr_sects)
129{ 129{
130 Sector sect; 130 Sector sect;
131 struct linux_part *linuxp; 131 struct linux_part *linuxp;
@@ -135,7 +135,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
135 135
136 put_partition(state, slot++, first_sect, size); 136 put_partition(state, slot++, first_sect, size);
137 137
138 linuxp = (struct linux_part *)read_dev_sector(bdev, first_sect, &sect); 138 linuxp = read_part_sector(state, first_sect, &sect);
139 if (!linuxp) 139 if (!linuxp)
140 return -1; 140 return -1;
141 141
@@ -157,8 +157,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
157#endif 157#endif
158 158
159#ifdef CONFIG_ACORN_PARTITION_CUMANA 159#ifdef CONFIG_ACORN_PARTITION_CUMANA
160int 160int adfspart_check_CUMANA(struct parsed_partitions *state)
161adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev)
162{ 161{
163 unsigned long first_sector = 0; 162 unsigned long first_sector = 0;
164 unsigned int start_blk = 0; 163 unsigned int start_blk = 0;
@@ -185,7 +184,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
185 struct adfs_discrecord *dr; 184 struct adfs_discrecord *dr;
186 unsigned int nr_sects; 185 unsigned int nr_sects;
187 186
188 data = read_dev_sector(bdev, start_blk * 2 + 6, &sect); 187 data = read_part_sector(state, start_blk * 2 + 6, &sect);
189 if (!data) 188 if (!data)
190 return -1; 189 return -1;
191 190
@@ -217,14 +216,14 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
217#ifdef CONFIG_ACORN_PARTITION_RISCIX 216#ifdef CONFIG_ACORN_PARTITION_RISCIX
218 case PARTITION_RISCIX_SCSI: 217 case PARTITION_RISCIX_SCSI:
219 /* RISCiX - we don't know how to find the next one. */ 218 /* RISCiX - we don't know how to find the next one. */
220 slot = riscix_partition(state, bdev, first_sector, 219 slot = riscix_partition(state, first_sector, slot,
221 slot, nr_sects); 220 nr_sects);
222 break; 221 break;
223#endif 222#endif
224 223
225 case PARTITION_LINUX: 224 case PARTITION_LINUX:
226 slot = linux_partition(state, bdev, first_sector, 225 slot = linux_partition(state, first_sector, slot,
227 slot, nr_sects); 226 nr_sects);
228 break; 227 break;
229 } 228 }
230 put_dev_sector(sect); 229 put_dev_sector(sect);
@@ -249,8 +248,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
249 * hda1 = ADFS partition on first drive. 248 * hda1 = ADFS partition on first drive.
250 * hda2 = non-ADFS partition. 249 * hda2 = non-ADFS partition.
251 */ 250 */
252int 251int adfspart_check_ADFS(struct parsed_partitions *state)
253adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
254{ 252{
255 unsigned long start_sect, nr_sects, sectscyl, heads; 253 unsigned long start_sect, nr_sects, sectscyl, heads;
256 Sector sect; 254 Sector sect;
@@ -259,7 +257,7 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
259 unsigned char id; 257 unsigned char id;
260 int slot = 1; 258 int slot = 1;
261 259
262 data = read_dev_sector(bdev, 6, &sect); 260 data = read_part_sector(state, 6, &sect);
263 if (!data) 261 if (!data)
264 return -1; 262 return -1;
265 263
@@ -278,21 +276,21 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
278 /* 276 /*
279 * Work out start of non-adfs partition. 277 * Work out start of non-adfs partition.
280 */ 278 */
281 nr_sects = (bdev->bd_inode->i_size >> 9) - start_sect; 279 nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect;
282 280
283 if (start_sect) { 281 if (start_sect) {
284 switch (id) { 282 switch (id) {
285#ifdef CONFIG_ACORN_PARTITION_RISCIX 283#ifdef CONFIG_ACORN_PARTITION_RISCIX
286 case PARTITION_RISCIX_SCSI: 284 case PARTITION_RISCIX_SCSI:
287 case PARTITION_RISCIX_MFM: 285 case PARTITION_RISCIX_MFM:
288 slot = riscix_partition(state, bdev, start_sect, 286 slot = riscix_partition(state, start_sect, slot,
289 slot, nr_sects); 287 nr_sects);
290 break; 288 break;
291#endif 289#endif
292 290
293 case PARTITION_LINUX: 291 case PARTITION_LINUX:
294 slot = linux_partition(state, bdev, start_sect, 292 slot = linux_partition(state, start_sect, slot,
295 slot, nr_sects); 293 nr_sects);
296 break; 294 break;
297 } 295 }
298 } 296 }
@@ -308,10 +306,11 @@ struct ics_part {
308 __le32 size; 306 __le32 size;
309}; 307};
310 308
311static int adfspart_check_ICSLinux(struct block_device *bdev, unsigned long block) 309static int adfspart_check_ICSLinux(struct parsed_partitions *state,
310 unsigned long block)
312{ 311{
313 Sector sect; 312 Sector sect;
314 unsigned char *data = read_dev_sector(bdev, block, &sect); 313 unsigned char *data = read_part_sector(state, block, &sect);
315 int result = 0; 314 int result = 0;
316 315
317 if (data) { 316 if (data) {
@@ -349,8 +348,7 @@ static inline int valid_ics_sector(const unsigned char *data)
349 * hda2 = ADFS partition 1 on first drive. 348 * hda2 = ADFS partition 1 on first drive.
350 * ..etc.. 349 * ..etc..
351 */ 350 */
352int 351int adfspart_check_ICS(struct parsed_partitions *state)
353adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
354{ 352{
355 const unsigned char *data; 353 const unsigned char *data;
356 const struct ics_part *p; 354 const struct ics_part *p;
@@ -360,7 +358,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
360 /* 358 /*
361 * Try ICS style partitions - sector 0 contains partition info. 359 * Try ICS style partitions - sector 0 contains partition info.
362 */ 360 */
363 data = read_dev_sector(bdev, 0, &sect); 361 data = read_part_sector(state, 0, &sect);
364 if (!data) 362 if (!data)
365 return -1; 363 return -1;
366 364
@@ -392,7 +390,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
392 * partition is. We must not make this visible 390 * partition is. We must not make this visible
393 * to the filesystem. 391 * to the filesystem.
394 */ 392 */
395 if (size > 1 && adfspart_check_ICSLinux(bdev, start)) { 393 if (size > 1 && adfspart_check_ICSLinux(state, start)) {
396 start += 1; 394 start += 1;
397 size -= 1; 395 size -= 1;
398 } 396 }
@@ -446,8 +444,7 @@ static inline int valid_ptec_sector(const unsigned char *data)
446 * hda2 = ADFS partition 1 on first drive. 444 * hda2 = ADFS partition 1 on first drive.
447 * ..etc.. 445 * ..etc..
448 */ 446 */
449int 447int adfspart_check_POWERTEC(struct parsed_partitions *state)
450adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev)
451{ 448{
452 Sector sect; 449 Sector sect;
453 const unsigned char *data; 450 const unsigned char *data;
@@ -455,7 +452,7 @@ adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bd
455 int slot = 1; 452 int slot = 1;
456 int i; 453 int i;
457 454
458 data = read_dev_sector(bdev, 0, &sect); 455 data = read_part_sector(state, 0, &sect);
459 if (!data) 456 if (!data)
460 return -1; 457 return -1;
461 458
@@ -508,8 +505,7 @@ static const char eesox_name[] = {
508 * 1. The individual ADFS boot block entries that are placed on the disk. 505 * 1. The individual ADFS boot block entries that are placed on the disk.
509 * 2. The start address of the next entry. 506 * 2. The start address of the next entry.
510 */ 507 */
511int 508int adfspart_check_EESOX(struct parsed_partitions *state)
512adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
513{ 509{
514 Sector sect; 510 Sector sect;
515 const unsigned char *data; 511 const unsigned char *data;
@@ -518,7 +514,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
518 sector_t start = 0; 514 sector_t start = 0;
519 int i, slot = 1; 515 int i, slot = 1;
520 516
521 data = read_dev_sector(bdev, 7, &sect); 517 data = read_part_sector(state, 7, &sect);
522 if (!data) 518 if (!data)
523 return -1; 519 return -1;
524 520
@@ -545,7 +541,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
545 if (i != 0) { 541 if (i != 0) {
546 sector_t size; 542 sector_t size;
547 543
548 size = get_capacity(bdev->bd_disk); 544 size = get_capacity(state->bdev->bd_disk);
549 put_partition(state, slot++, start, size - start); 545 put_partition(state, slot++, start, size - start);
550 printk("\n"); 546 printk("\n");
551 } 547 }
diff --git a/fs/partitions/acorn.h b/fs/partitions/acorn.h
index 81fd50ecc080..ede828529692 100644
--- a/fs/partitions/acorn.h
+++ b/fs/partitions/acorn.h
@@ -7,8 +7,8 @@
7 * format, and everyone stick to it? 7 * format, and everyone stick to it?
8 */ 8 */
9 9
10int adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev); 10int adfspart_check_CUMANA(struct parsed_partitions *state);
11int adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev); 11int adfspart_check_ADFS(struct parsed_partitions *state);
12int adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev); 12int adfspart_check_ICS(struct parsed_partitions *state);
13int adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev); 13int adfspart_check_POWERTEC(struct parsed_partitions *state);
14int adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev); 14int adfspart_check_EESOX(struct parsed_partitions *state);
diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c
index 9917a8c360f2..ba443d4229f8 100644
--- a/fs/partitions/amiga.c
+++ b/fs/partitions/amiga.c
@@ -23,8 +23,7 @@ checksum_block(__be32 *m, int size)
23 return sum; 23 return sum;
24} 24}
25 25
26int 26int amiga_partition(struct parsed_partitions *state)
27amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
28{ 27{
29 Sector sect; 28 Sector sect;
30 unsigned char *data; 29 unsigned char *data;
@@ -38,11 +37,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
38 for (blk = 0; ; blk++, put_dev_sector(sect)) { 37 for (blk = 0; ; blk++, put_dev_sector(sect)) {
39 if (blk == RDB_ALLOCATION_LIMIT) 38 if (blk == RDB_ALLOCATION_LIMIT)
40 goto rdb_done; 39 goto rdb_done;
41 data = read_dev_sector(bdev, blk, &sect); 40 data = read_part_sector(state, blk, &sect);
42 if (!data) { 41 if (!data) {
43 if (warn_no_part) 42 if (warn_no_part)
44 printk("Dev %s: unable to read RDB block %d\n", 43 printk("Dev %s: unable to read RDB block %d\n",
45 bdevname(bdev, b), blk); 44 bdevname(state->bdev, b), blk);
46 res = -1; 45 res = -1;
47 goto rdb_done; 46 goto rdb_done;
48 } 47 }
@@ -64,7 +63,7 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
64 } 63 }
65 64
66 printk("Dev %s: RDB in block %d has bad checksum\n", 65 printk("Dev %s: RDB in block %d has bad checksum\n",
67 bdevname(bdev, b), blk); 66 bdevname(state->bdev, b), blk);
68 } 67 }
69 68
70 /* blksize is blocks per 512 byte standard block */ 69 /* blksize is blocks per 512 byte standard block */
@@ -75,11 +74,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
75 put_dev_sector(sect); 74 put_dev_sector(sect);
76 for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) { 75 for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
77 blk *= blksize; /* Read in terms partition table understands */ 76 blk *= blksize; /* Read in terms partition table understands */
78 data = read_dev_sector(bdev, blk, &sect); 77 data = read_part_sector(state, blk, &sect);
79 if (!data) { 78 if (!data) {
80 if (warn_no_part) 79 if (warn_no_part)
81 printk("Dev %s: unable to read partition block %d\n", 80 printk("Dev %s: unable to read partition block %d\n",
82 bdevname(bdev, b), blk); 81 bdevname(state->bdev, b), blk);
83 res = -1; 82 res = -1;
84 goto rdb_done; 83 goto rdb_done;
85 } 84 }
diff --git a/fs/partitions/amiga.h b/fs/partitions/amiga.h
index 2f3e9ce22d53..d094585cadaa 100644
--- a/fs/partitions/amiga.h
+++ b/fs/partitions/amiga.h
@@ -2,5 +2,5 @@
2 * fs/partitions/amiga.h 2 * fs/partitions/amiga.h
3 */ 3 */
4 4
5int amiga_partition(struct parsed_partitions *state, struct block_device *bdev); 5int amiga_partition(struct parsed_partitions *state);
6 6
diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c
index 1f3572d5b755..4439ff1b6cec 100644
--- a/fs/partitions/atari.c
+++ b/fs/partitions/atari.c
@@ -30,7 +30,7 @@ static inline int OK_id(char *s)
30 memcmp (s, "RAW", 3) == 0 ; 30 memcmp (s, "RAW", 3) == 0 ;
31} 31}
32 32
33int atari_partition(struct parsed_partitions *state, struct block_device *bdev) 33int atari_partition(struct parsed_partitions *state)
34{ 34{
35 Sector sect; 35 Sector sect;
36 struct rootsector *rs; 36 struct rootsector *rs;
@@ -42,12 +42,12 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
42 int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */ 42 int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */
43#endif 43#endif
44 44
45 rs = (struct rootsector *) read_dev_sector(bdev, 0, &sect); 45 rs = read_part_sector(state, 0, &sect);
46 if (!rs) 46 if (!rs)
47 return -1; 47 return -1;
48 48
49 /* Verify this is an Atari rootsector: */ 49 /* Verify this is an Atari rootsector: */
50 hd_size = bdev->bd_inode->i_size >> 9; 50 hd_size = state->bdev->bd_inode->i_size >> 9;
51 if (!VALID_PARTITION(&rs->part[0], hd_size) && 51 if (!VALID_PARTITION(&rs->part[0], hd_size) &&
52 !VALID_PARTITION(&rs->part[1], hd_size) && 52 !VALID_PARTITION(&rs->part[1], hd_size) &&
53 !VALID_PARTITION(&rs->part[2], hd_size) && 53 !VALID_PARTITION(&rs->part[2], hd_size) &&
@@ -84,7 +84,7 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
84 printk(" XGM<"); 84 printk(" XGM<");
85 partsect = extensect = be32_to_cpu(pi->st); 85 partsect = extensect = be32_to_cpu(pi->st);
86 while (1) { 86 while (1) {
87 xrs = (struct rootsector *)read_dev_sector(bdev, partsect, &sect2); 87 xrs = read_part_sector(state, partsect, &sect2);
88 if (!xrs) { 88 if (!xrs) {
89 printk (" block %ld read failed\n", partsect); 89 printk (" block %ld read failed\n", partsect);
90 put_dev_sector(sect); 90 put_dev_sector(sect);
diff --git a/fs/partitions/atari.h b/fs/partitions/atari.h
index 63186b00e135..fe2d32a89f36 100644
--- a/fs/partitions/atari.h
+++ b/fs/partitions/atari.h
@@ -31,4 +31,4 @@ struct rootsector
31 u16 checksum; /* checksum for bootable disks */ 31 u16 checksum; /* checksum for bootable disks */
32} __attribute__((__packed__)); 32} __attribute__((__packed__));
33 33
34int atari_partition(struct parsed_partitions *state, struct block_device *bdev); 34int atari_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index e238ab23a9e7..5dcd4b0c5533 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -45,7 +45,7 @@ extern void md_autodetect_dev(dev_t dev);
45 45
46int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/ 46int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/
47 47
48static int (*check_part[])(struct parsed_partitions *, struct block_device *) = { 48static int (*check_part[])(struct parsed_partitions *) = {
49 /* 49 /*
50 * Probe partition formats with tables at disk address 0 50 * Probe partition formats with tables at disk address 0
51 * that also have an ADFS boot block at 0xdc0. 51 * that also have an ADFS boot block at 0xdc0.
@@ -161,10 +161,11 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
161 struct parsed_partitions *state; 161 struct parsed_partitions *state;
162 int i, res, err; 162 int i, res, err;
163 163
164 state = kmalloc(sizeof(struct parsed_partitions), GFP_KERNEL); 164 state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
165 if (!state) 165 if (!state)
166 return NULL; 166 return NULL;
167 167
168 state->bdev = bdev;
168 disk_name(hd, 0, state->name); 169 disk_name(hd, 0, state->name);
169 printk(KERN_INFO " %s:", state->name); 170 printk(KERN_INFO " %s:", state->name);
170 if (isdigit(state->name[strlen(state->name)-1])) 171 if (isdigit(state->name[strlen(state->name)-1]))
@@ -174,7 +175,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
174 i = res = err = 0; 175 i = res = err = 0;
175 while (!res && check_part[i]) { 176 while (!res && check_part[i]) {
176 memset(&state->parts, 0, sizeof(state->parts)); 177 memset(&state->parts, 0, sizeof(state->parts));
177 res = check_part[i++](state, bdev); 178 res = check_part[i++](state);
178 if (res < 0) { 179 if (res < 0) {
179 /* We have hit an I/O error which we don't report now. 180 /* We have hit an I/O error which we don't report now.
180 * But record it, and let the others do their job. 181 * But record it, and let the others do their job.
@@ -186,6 +187,8 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
186 } 187 }
187 if (res > 0) 188 if (res > 0)
188 return state; 189 return state;
190 if (state->access_beyond_eod)
191 err = -ENOSPC;
189 if (err) 192 if (err)
190 /* The partition is unrecognized. So report I/O errors if there were any */ 193 /* The partition is unrecognized. So report I/O errors if there were any */
191 res = err; 194 res = err;
@@ -538,12 +541,33 @@ exit:
538 disk_part_iter_exit(&piter); 541 disk_part_iter_exit(&piter);
539} 542}
540 543
544static bool disk_unlock_native_capacity(struct gendisk *disk)
545{
546 const struct block_device_operations *bdops = disk->fops;
547
548 if (bdops->unlock_native_capacity &&
549 !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
550 printk(KERN_CONT "enabling native capacity\n");
551 bdops->unlock_native_capacity(disk);
552 disk->flags |= GENHD_FL_NATIVE_CAPACITY;
553 return true;
554 } else {
555 printk(KERN_CONT "truncated\n");
556 return false;
557 }
558}
559
541int rescan_partitions(struct gendisk *disk, struct block_device *bdev) 560int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
542{ 561{
562 struct parsed_partitions *state = NULL;
543 struct disk_part_iter piter; 563 struct disk_part_iter piter;
544 struct hd_struct *part; 564 struct hd_struct *part;
545 struct parsed_partitions *state;
546 int p, highest, res; 565 int p, highest, res;
566rescan:
567 if (state && !IS_ERR(state)) {
568 kfree(state);
569 state = NULL;
570 }
547 571
548 if (bdev->bd_part_count) 572 if (bdev->bd_part_count)
549 return -EBUSY; 573 return -EBUSY;
@@ -562,8 +586,32 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
562 bdev->bd_invalidated = 0; 586 bdev->bd_invalidated = 0;
563 if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) 587 if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
564 return 0; 588 return 0;
565 if (IS_ERR(state)) /* I/O error reading the partition table */ 589 if (IS_ERR(state)) {
590 /*
591 * I/O error reading the partition table. If any
592 * partition code tried to read beyond EOD, retry
593 * after unlocking native capacity.
594 */
595 if (PTR_ERR(state) == -ENOSPC) {
596 printk(KERN_WARNING "%s: partition table beyond EOD, ",
597 disk->disk_name);
598 if (disk_unlock_native_capacity(disk))
599 goto rescan;
600 }
566 return -EIO; 601 return -EIO;
602 }
603 /*
604 * If any partition code tried to read beyond EOD, try
 605 * unlocking native capacity even if the partition table was
 606 * successfully read, as we could be missing some partitions.
607 */
608 if (state->access_beyond_eod) {
609 printk(KERN_WARNING
610 "%s: partition table partially beyond EOD, ",
611 disk->disk_name);
612 if (disk_unlock_native_capacity(disk))
613 goto rescan;
614 }
567 615
568 /* tell userspace that the media / partition table may have changed */ 616 /* tell userspace that the media / partition table may have changed */
569 kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); 617 kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
@@ -581,7 +629,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
581 /* add partitions */ 629 /* add partitions */
582 for (p = 1; p < state->limit; p++) { 630 for (p = 1; p < state->limit; p++) {
583 sector_t size, from; 631 sector_t size, from;
584try_scan: 632
585 size = state->parts[p].size; 633 size = state->parts[p].size;
586 if (!size) 634 if (!size)
587 continue; 635 continue;
@@ -589,30 +637,21 @@ try_scan:
589 from = state->parts[p].from; 637 from = state->parts[p].from;
590 if (from >= get_capacity(disk)) { 638 if (from >= get_capacity(disk)) {
591 printk(KERN_WARNING 639 printk(KERN_WARNING
592 "%s: p%d ignored, start %llu is behind the end of the disk\n", 640 "%s: p%d start %llu is beyond EOD, ",
593 disk->disk_name, p, (unsigned long long) from); 641 disk->disk_name, p, (unsigned long long) from);
642 if (disk_unlock_native_capacity(disk))
643 goto rescan;
594 continue; 644 continue;
595 } 645 }
596 646
597 if (from + size > get_capacity(disk)) { 647 if (from + size > get_capacity(disk)) {
598 const struct block_device_operations *bdops = disk->fops;
599 unsigned long long capacity;
600
601 printk(KERN_WARNING 648 printk(KERN_WARNING
602 "%s: p%d size %llu exceeds device capacity, ", 649 "%s: p%d size %llu extends beyond EOD, ",
603 disk->disk_name, p, (unsigned long long) size); 650 disk->disk_name, p, (unsigned long long) size);
604 651
605 if (bdops->set_capacity && 652 if (disk_unlock_native_capacity(disk)) {
606 (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) { 653 /* free state and restart */
607 printk(KERN_CONT "enabling native capacity\n"); 654 goto rescan;
608 capacity = bdops->set_capacity(disk, ~0ULL);
609 disk->flags |= GENHD_FL_NATIVE_CAPACITY;
610 if (capacity > get_capacity(disk)) {
611 set_capacity(disk, capacity);
612 check_disk_size_change(disk, bdev);
613 bdev->bd_invalidated = 0;
614 }
615 goto try_scan;
616 } else { 655 } else {
617 /* 656 /*
618 * we can not ignore partitions of broken tables 657 * we can not ignore partitions of broken tables
@@ -620,7 +659,6 @@ try_scan:
620 * we limit them to the end of the disk to avoid 659 * we limit them to the end of the disk to avoid
621 * creating invalid block devices 660 * creating invalid block devices
622 */ 661 */
623 printk(KERN_CONT "limited to end of disk\n");
624 size = get_capacity(disk) - from; 662 size = get_capacity(disk) - from;
625 } 663 }
626 } 664 }
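rescan_partitions() now restarts the whole scan after asking the driver to unlock native capacity, so drivers that clip the visible size (host protected area and similar) can expose the full device on demand. A hedged sketch of the driver side; the HPA details are illustrative, only the ->unlock_native_capacity hook itself is assumed from this series:

#include <linux/module.h>
#include <linux/genhd.h>
#include <linux/blkdev.h>

static void example_unlock_native_capacity(struct gendisk *disk)
{
        /*
         * Driver-specific: disable the host protected area (or similar
         * clipping) and publish the real size, e.g.
         * set_capacity(disk, real_nr_sectors);
         */
}

static const struct block_device_operations example_fops = {
        .owner                  = THIS_MODULE,
        .unlock_native_capacity = example_unlock_native_capacity,
};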
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 98dbe1a84528..52f8bd399396 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -6,6 +6,7 @@
6 * description. 6 * description.
7 */ 7 */
8struct parsed_partitions { 8struct parsed_partitions {
9 struct block_device *bdev;
9 char name[BDEVNAME_SIZE]; 10 char name[BDEVNAME_SIZE];
10 struct { 11 struct {
11 sector_t from; 12 sector_t from;
@@ -14,8 +15,19 @@ struct parsed_partitions {
14 } parts[DISK_MAX_PARTS]; 15 } parts[DISK_MAX_PARTS];
15 int next; 16 int next;
16 int limit; 17 int limit;
18 bool access_beyond_eod;
17}; 19};
18 20
21static inline void *read_part_sector(struct parsed_partitions *state,
22 sector_t n, Sector *p)
23{
24 if (n >= get_capacity(state->bdev->bd_disk)) {
25 state->access_beyond_eod = true;
26 return NULL;
27 }
28 return read_dev_sector(state->bdev, n, p);
29}
30
19static inline void 31static inline void
20put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) 32put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
21{ 33{
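The read_part_sector() helper above is what every parser in this series now goes through: an out-of-range read returns NULL and is recorded in state->access_beyond_eod instead of being handed to the block layer. A hedged sketch of the resulting calling convention in a parser — the function name is illustrative, the pattern mirrors the converted parsers in this patch:

#include "check.h"

static int example_partition(struct parsed_partitions *state)
{
        Sector sect;
        unsigned char *data = read_part_sector(state, 0, &sect);

        if (!data)
                return -1;      /* I/O error, or read beyond end of device */
        /* ... parse the on-disk label, then register what was found ... */
        put_partition(state, 1, /* from */ 2048, /* size */ 4096);
        put_dev_sector(sect);
        return 1;
}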
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 91babdae7587..9e346c19bbba 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -140,8 +140,7 @@ efi_crc32(const void *buf, unsigned long len)
140 * the part[0] entry for this disk, and is the number of 140 * the part[0] entry for this disk, and is the number of
141 * physical sectors available on the disk. 141 * physical sectors available on the disk.
142 */ 142 */
143static u64 143static u64 last_lba(struct block_device *bdev)
144last_lba(struct block_device *bdev)
145{ 144{
146 if (!bdev || !bdev->bd_inode) 145 if (!bdev || !bdev->bd_inode)
147 return 0; 146 return 0;
@@ -181,27 +180,28 @@ is_pmbr_valid(legacy_mbr *mbr)
181 180
182/** 181/**
183 * read_lba(): Read bytes from disk, starting at given LBA 182 * read_lba(): Read bytes from disk, starting at given LBA
184 * @bdev 183 * @state
185 * @lba 184 * @lba
186 * @buffer 185 * @buffer
187 * @size_t 186 * @size_t
188 * 187 *
189 * Description: Reads @count bytes from @bdev into @buffer. 188 * Description: Reads @count bytes from @state->bdev into @buffer.
190 * Returns number of bytes read on success, 0 on error. 189 * Returns number of bytes read on success, 0 on error.
191 */ 190 */
192static size_t 191static size_t read_lba(struct parsed_partitions *state,
193read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count) 192 u64 lba, u8 *buffer, size_t count)
194{ 193{
195 size_t totalreadcount = 0; 194 size_t totalreadcount = 0;
195 struct block_device *bdev = state->bdev;
196 sector_t n = lba * (bdev_logical_block_size(bdev) / 512); 196 sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
197 197
198 if (!bdev || !buffer || lba > last_lba(bdev)) 198 if (!buffer || lba > last_lba(bdev))
199 return 0; 199 return 0;
200 200
201 while (count) { 201 while (count) {
202 int copied = 512; 202 int copied = 512;
203 Sector sect; 203 Sector sect;
204 unsigned char *data = read_dev_sector(bdev, n++, &sect); 204 unsigned char *data = read_part_sector(state, n++, &sect);
205 if (!data) 205 if (!data)
206 break; 206 break;
207 if (copied > count) 207 if (copied > count)
@@ -217,19 +217,20 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
217 217
218/** 218/**
219 * alloc_read_gpt_entries(): reads partition entries from disk 219 * alloc_read_gpt_entries(): reads partition entries from disk
220 * @bdev 220 * @state
221 * @gpt - GPT header 221 * @gpt - GPT header
222 * 222 *
223 * Description: Returns ptes on success, NULL on error. 223 * Description: Returns ptes on success, NULL on error.
224 * Allocates space for PTEs based on information found in @gpt. 224 * Allocates space for PTEs based on information found in @gpt.
225 * Notes: remember to free pte when you're done! 225 * Notes: remember to free pte when you're done!
226 */ 226 */
227static gpt_entry * 227static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
228alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt) 228 gpt_header *gpt)
229{ 229{
230 size_t count; 230 size_t count;
231 gpt_entry *pte; 231 gpt_entry *pte;
232 if (!bdev || !gpt) 232
233 if (!gpt)
233 return NULL; 234 return NULL;
234 235
235 count = le32_to_cpu(gpt->num_partition_entries) * 236 count = le32_to_cpu(gpt->num_partition_entries) *
@@ -240,7 +241,7 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
240 if (!pte) 241 if (!pte)
241 return NULL; 242 return NULL;
242 243
243 if (read_lba(bdev, le64_to_cpu(gpt->partition_entry_lba), 244 if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba),
244 (u8 *) pte, 245 (u8 *) pte,
245 count) < count) { 246 count) < count) {
246 kfree(pte); 247 kfree(pte);
@@ -252,27 +253,24 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
252 253
253/** 254/**
254 * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk 255 * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk
255 * @bdev 256 * @state
256 * @lba is the Logical Block Address of the partition table 257 * @lba is the Logical Block Address of the partition table
257 * 258 *
258 * Description: returns GPT header on success, NULL on error. Allocates 259 * Description: returns GPT header on success, NULL on error. Allocates
259 * and fills a GPT header starting at @lba from @bdev. 260 * and fills a GPT header starting at @lba from @state->bdev.
260 * Note: remember to free gpt when finished with it. 261 * Note: remember to free gpt when finished with it.
261 */ 262 */
262static gpt_header * 263static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
263alloc_read_gpt_header(struct block_device *bdev, u64 lba) 264 u64 lba)
264{ 265{
265 gpt_header *gpt; 266 gpt_header *gpt;
266 unsigned ssz = bdev_logical_block_size(bdev); 267 unsigned ssz = bdev_logical_block_size(state->bdev);
267
268 if (!bdev)
269 return NULL;
270 268
271 gpt = kzalloc(ssz, GFP_KERNEL); 269 gpt = kzalloc(ssz, GFP_KERNEL);
272 if (!gpt) 270 if (!gpt)
273 return NULL; 271 return NULL;
274 272
275 if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) { 273 if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) {
276 kfree(gpt); 274 kfree(gpt);
277 gpt=NULL; 275 gpt=NULL;
278 return NULL; 276 return NULL;
@@ -283,7 +281,7 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
283 281
284/** 282/**
285 * is_gpt_valid() - tests one GPT header and PTEs for validity 283 * is_gpt_valid() - tests one GPT header and PTEs for validity
286 * @bdev 284 * @state
287 * @lba is the logical block address of the GPT header to test 285 * @lba is the logical block address of the GPT header to test
288 * @gpt is a GPT header ptr, filled on return. 286 * @gpt is a GPT header ptr, filled on return.
289 * @ptes is a PTEs ptr, filled on return. 287 * @ptes is a PTEs ptr, filled on return.
@@ -291,16 +289,15 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
291 * Description: returns 1 if valid, 0 on error. 289 * Description: returns 1 if valid, 0 on error.
292 * If valid, returns pointers to newly allocated GPT header and PTEs. 290 * If valid, returns pointers to newly allocated GPT header and PTEs.
293 */ 291 */
294static int 292static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
295is_gpt_valid(struct block_device *bdev, u64 lba, 293 gpt_header **gpt, gpt_entry **ptes)
296 gpt_header **gpt, gpt_entry **ptes)
297{ 294{
298 u32 crc, origcrc; 295 u32 crc, origcrc;
299 u64 lastlba; 296 u64 lastlba;
300 297
301 if (!bdev || !gpt || !ptes) 298 if (!ptes)
302 return 0; 299 return 0;
303 if (!(*gpt = alloc_read_gpt_header(bdev, lba))) 300 if (!(*gpt = alloc_read_gpt_header(state, lba)))
304 return 0; 301 return 0;
305 302
306 /* Check the GUID Partition Table signature */ 303 /* Check the GUID Partition Table signature */
@@ -336,7 +333,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
336 /* Check the first_usable_lba and last_usable_lba are 333 /* Check the first_usable_lba and last_usable_lba are
337 * within the disk. 334 * within the disk.
338 */ 335 */
339 lastlba = last_lba(bdev); 336 lastlba = last_lba(state->bdev);
340 if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { 337 if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) {
341 pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", 338 pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n",
342 (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), 339 (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba),
@@ -350,7 +347,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
350 goto fail; 347 goto fail;
351 } 348 }
352 349
353 if (!(*ptes = alloc_read_gpt_entries(bdev, *gpt))) 350 if (!(*ptes = alloc_read_gpt_entries(state, *gpt)))
354 goto fail; 351 goto fail;
355 352
356 /* Check the GUID Partition Entry Array CRC */ 353 /* Check the GUID Partition Entry Array CRC */
@@ -495,7 +492,7 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
495 492
496/** 493/**
497 * find_valid_gpt() - Search disk for valid GPT headers and PTEs 494 * find_valid_gpt() - Search disk for valid GPT headers and PTEs
498 * @bdev 495 * @state
499 * @gpt is a GPT header ptr, filled on return. 496 * @gpt is a GPT header ptr, filled on return.
500 * @ptes is a PTEs ptr, filled on return. 497 * @ptes is a PTEs ptr, filled on return.
501 * Description: Returns 1 if valid, 0 on error. 498 * Description: Returns 1 if valid, 0 on error.
@@ -508,24 +505,25 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
508 * This protects against devices which misreport their size, and forces 505 * This protects against devices which misreport their size, and forces
509 * the user to decide to use the Alternate GPT. 506 * the user to decide to use the Alternate GPT.
510 */ 507 */
511static int 508static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
512find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes) 509 gpt_entry **ptes)
513{ 510{
514 int good_pgpt = 0, good_agpt = 0, good_pmbr = 0; 511 int good_pgpt = 0, good_agpt = 0, good_pmbr = 0;
515 gpt_header *pgpt = NULL, *agpt = NULL; 512 gpt_header *pgpt = NULL, *agpt = NULL;
516 gpt_entry *pptes = NULL, *aptes = NULL; 513 gpt_entry *pptes = NULL, *aptes = NULL;
517 legacy_mbr *legacymbr; 514 legacy_mbr *legacymbr;
518 u64 lastlba; 515 u64 lastlba;
519 if (!bdev || !gpt || !ptes) 516
517 if (!ptes)
520 return 0; 518 return 0;
521 519
522 lastlba = last_lba(bdev); 520 lastlba = last_lba(state->bdev);
523 if (!force_gpt) { 521 if (!force_gpt) {
524 /* This will be added to the EFI Spec. per Intel after v1.02. */ 522 /* This will be added to the EFI Spec. per Intel after v1.02. */
525 legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL); 523 legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL);
526 if (legacymbr) { 524 if (legacymbr) {
527 read_lba(bdev, 0, (u8 *) legacymbr, 525 read_lba(state, 0, (u8 *) legacymbr,
528 sizeof (*legacymbr)); 526 sizeof (*legacymbr));
529 good_pmbr = is_pmbr_valid(legacymbr); 527 good_pmbr = is_pmbr_valid(legacymbr);
530 kfree(legacymbr); 528 kfree(legacymbr);
531 } 529 }
@@ -533,15 +531,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
533 goto fail; 531 goto fail;
534 } 532 }
535 533
536 good_pgpt = is_gpt_valid(bdev, GPT_PRIMARY_PARTITION_TABLE_LBA, 534 good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA,
537 &pgpt, &pptes); 535 &pgpt, &pptes);
538 if (good_pgpt) 536 if (good_pgpt)
539 good_agpt = is_gpt_valid(bdev, 537 good_agpt = is_gpt_valid(state,
540 le64_to_cpu(pgpt->alternate_lba), 538 le64_to_cpu(pgpt->alternate_lba),
541 &agpt, &aptes); 539 &agpt, &aptes);
542 if (!good_agpt && force_gpt) 540 if (!good_agpt && force_gpt)
543 good_agpt = is_gpt_valid(bdev, lastlba, 541 good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes);
544 &agpt, &aptes);
545 542
546 /* The obviously unsuccessful case */ 543 /* The obviously unsuccessful case */
547 if (!good_pgpt && !good_agpt) 544 if (!good_pgpt && !good_agpt)
@@ -583,9 +580,8 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
583} 580}
584 581
585/** 582/**
586 * efi_partition(struct parsed_partitions *state, struct block_device *bdev) 583 * efi_partition(struct parsed_partitions *state)
587 * @state 584 * @state
588 * @bdev
589 * 585 *
590 * Description: called from check.c, if the disk contains GPT 586 * Description: called from check.c, if the disk contains GPT
591 * partitions, sets up partition entries in the kernel. 587 * partitions, sets up partition entries in the kernel.
@@ -602,15 +598,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
602 * 1 if successful 598 * 1 if successful
603 * 599 *
604 */ 600 */
605int 601int efi_partition(struct parsed_partitions *state)
606efi_partition(struct parsed_partitions *state, struct block_device *bdev)
607{ 602{
608 gpt_header *gpt = NULL; 603 gpt_header *gpt = NULL;
609 gpt_entry *ptes = NULL; 604 gpt_entry *ptes = NULL;
610 u32 i; 605 u32 i;
611 unsigned ssz = bdev_logical_block_size(bdev) / 512; 606 unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
612 607
613 if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) { 608 if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
614 kfree(gpt); 609 kfree(gpt);
615 kfree(ptes); 610 kfree(ptes);
616 return 0; 611 return 0;
@@ -623,7 +618,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
623 u64 size = le64_to_cpu(ptes[i].ending_lba) - 618 u64 size = le64_to_cpu(ptes[i].ending_lba) -
624 le64_to_cpu(ptes[i].starting_lba) + 1ULL; 619 le64_to_cpu(ptes[i].starting_lba) + 1ULL;
625 620
626 if (!is_pte_valid(&ptes[i], last_lba(bdev))) 621 if (!is_pte_valid(&ptes[i], last_lba(state->bdev)))
627 continue; 622 continue;
628 623
629 put_partition(state, i+1, start * ssz, size * ssz); 624 put_partition(state, i+1, start * ssz, size * ssz);
diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h
index 6998b589abf9..b69ab729558f 100644
--- a/fs/partitions/efi.h
+++ b/fs/partitions/efi.h
@@ -110,7 +110,7 @@ typedef struct _legacy_mbr {
110} __attribute__ ((packed)) legacy_mbr; 110} __attribute__ ((packed)) legacy_mbr;
111 111
112/* Functions */ 112/* Functions */
113extern int efi_partition(struct parsed_partitions *state, struct block_device *bdev); 113extern int efi_partition(struct parsed_partitions *state);
114 114
115#endif 115#endif
116 116
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index fc71aab08460..3e73de5967ff 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -58,9 +58,9 @@ cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) {
58 58
59/* 59/*
60 */ 60 */
61int 61int ibm_partition(struct parsed_partitions *state)
62ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
63{ 62{
63 struct block_device *bdev = state->bdev;
64 int blocksize, res; 64 int blocksize, res;
65 loff_t i_size, offset, size, fmt_size; 65 loff_t i_size, offset, size, fmt_size;
66 dasd_information2_t *info; 66 dasd_information2_t *info;
@@ -100,7 +100,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
100 /* 100 /*
101 * Get volume label, extract name and type. 101 * Get volume label, extract name and type.
102 */ 102 */
103 data = read_dev_sector(bdev, info->label_block*(blocksize/512), &sect); 103 data = read_part_sector(state, info->label_block*(blocksize/512),
104 &sect);
104 if (data == NULL) 105 if (data == NULL)
105 goto out_readerr; 106 goto out_readerr;
106 107
@@ -193,8 +194,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
193 */ 194 */
194 blk = cchhb2blk(&label->vol.vtoc, geo) + 1; 195 blk = cchhb2blk(&label->vol.vtoc, geo) + 1;
195 counter = 0; 196 counter = 0;
196 data = read_dev_sector(bdev, blk * (blocksize/512), 197 data = read_part_sector(state, blk * (blocksize/512),
197 &sect); 198 &sect);
198 while (data != NULL) { 199 while (data != NULL) {
199 struct vtoc_format1_label f1; 200 struct vtoc_format1_label f1;
200 201
@@ -208,9 +209,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
208 || f1.DS1FMTID == _ascebc['7'] 209 || f1.DS1FMTID == _ascebc['7']
209 || f1.DS1FMTID == _ascebc['9']) { 210 || f1.DS1FMTID == _ascebc['9']) {
210 blk++; 211 blk++;
211 data = read_dev_sector(bdev, blk * 212 data = read_part_sector(state,
212 (blocksize/512), 213 blk * (blocksize/512), &sect);
213 &sect);
214 continue; 214 continue;
215 } 215 }
216 216
@@ -230,9 +230,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
230 size * (blocksize >> 9)); 230 size * (blocksize >> 9));
231 counter++; 231 counter++;
232 blk++; 232 blk++;
233 data = read_dev_sector(bdev, 233 data = read_part_sector(state,
234 blk * (blocksize/512), 234 blk * (blocksize/512), &sect);
235 &sect);
236 } 235 }
237 236
238 if (!data) 237 if (!data)
diff --git a/fs/partitions/ibm.h b/fs/partitions/ibm.h
index 31f85a6ac459..08fb0804a812 100644
--- a/fs/partitions/ibm.h
+++ b/fs/partitions/ibm.h
@@ -1 +1 @@
int ibm_partition(struct parsed_partitions *, struct block_device *); int ibm_partition(struct parsed_partitions *);
diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c
index 176d89bcf123..1cc928bb762f 100644
--- a/fs/partitions/karma.c
+++ b/fs/partitions/karma.c
@@ -9,7 +9,7 @@
9#include "check.h" 9#include "check.h"
10#include "karma.h" 10#include "karma.h"
11 11
12int karma_partition(struct parsed_partitions *state, struct block_device *bdev) 12int karma_partition(struct parsed_partitions *state)
13{ 13{
14 int i; 14 int i;
15 int slot = 1; 15 int slot = 1;
@@ -29,7 +29,7 @@ int karma_partition(struct parsed_partitions *state, struct block_device *bdev)
29 } __attribute__((packed)) *label; 29 } __attribute__((packed)) *label;
30 struct d_partition *p; 30 struct d_partition *p;
31 31
32 data = read_dev_sector(bdev, 0, &sect); 32 data = read_part_sector(state, 0, &sect);
33 if (!data) 33 if (!data)
34 return -1; 34 return -1;
35 35
diff --git a/fs/partitions/karma.h b/fs/partitions/karma.h
index ecf7d3f2a3d8..c764b2e9df21 100644
--- a/fs/partitions/karma.h
+++ b/fs/partitions/karma.h
@@ -4,5 +4,5 @@
4 4
5#define KARMA_LABEL_MAGIC 0xAB56 5#define KARMA_LABEL_MAGIC 0xAB56
6 6
7int karma_partition(struct parsed_partitions *state, struct block_device *bdev); 7int karma_partition(struct parsed_partitions *state);
8 8
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 8652fb99e962..3ceca05b668c 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -309,7 +309,7 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1,
309 309
310/** 310/**
311 * ldm_validate_privheads - Compare the primary privhead with its backups 311 * ldm_validate_privheads - Compare the primary privhead with its backups
312 * @bdev: Device holding the LDM Database 312 * @state: Partition check state including device holding the LDM Database
313 * @ph1: Memory struct to fill with ph contents 313 * @ph1: Memory struct to fill with ph contents
314 * 314 *
315 * Read and compare all three privheads from disk. 315 * Read and compare all three privheads from disk.
@@ -321,8 +321,8 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1,
321 * Return: 'true' Success 321 * Return: 'true' Success
322 * 'false' Error 322 * 'false' Error
323 */ 323 */
324static bool ldm_validate_privheads (struct block_device *bdev, 324static bool ldm_validate_privheads(struct parsed_partitions *state,
325 struct privhead *ph1) 325 struct privhead *ph1)
326{ 326{
327 static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 }; 327 static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 };
328 struct privhead *ph[3] = { ph1 }; 328 struct privhead *ph[3] = { ph1 };
@@ -332,7 +332,7 @@ static bool ldm_validate_privheads (struct block_device *bdev,
332 long num_sects; 332 long num_sects;
333 int i; 333 int i;
334 334
335 BUG_ON (!bdev || !ph1); 335 BUG_ON (!state || !ph1);
336 336
337 ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL); 337 ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL);
338 ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL); 338 ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL);
@@ -346,8 +346,8 @@ static bool ldm_validate_privheads (struct block_device *bdev,
346 346
347 /* Read and parse privheads */ 347 /* Read and parse privheads */
348 for (i = 0; i < 3; i++) { 348 for (i = 0; i < 3; i++) {
349 data = read_dev_sector (bdev, 349 data = read_part_sector(state, ph[0]->config_start + off[i],
350 ph[0]->config_start + off[i], &sect); 350 &sect);
351 if (!data) { 351 if (!data) {
352 ldm_crit ("Disk read failed."); 352 ldm_crit ("Disk read failed.");
353 goto out; 353 goto out;
@@ -363,7 +363,7 @@ static bool ldm_validate_privheads (struct block_device *bdev,
363 } 363 }
364 } 364 }
365 365
366 num_sects = bdev->bd_inode->i_size >> 9; 366 num_sects = state->bdev->bd_inode->i_size >> 9;
367 367
368 if ((ph[0]->config_start > num_sects) || 368 if ((ph[0]->config_start > num_sects) ||
369 ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { 369 ((ph[0]->config_start + ph[0]->config_size) > num_sects)) {
@@ -397,20 +397,20 @@ out:
397 397
398/** 398/**
399 * ldm_validate_tocblocks - Validate the table of contents and its backups 399 * ldm_validate_tocblocks - Validate the table of contents and its backups
400 * @bdev: Device holding the LDM Database 400 * @state: Partition check state including device holding the LDM Database
401 * @base: Offset, into @bdev, of the database 401 * @base: Offset, into @state->bdev, of the database
402 * @ldb: Cache of the database structures 402 * @ldb: Cache of the database structures
403 * 403 *
404 * Find and compare the four tables of contents of the LDM Database stored on 404 * Find and compare the four tables of contents of the LDM Database stored on
405 * @bdev and return the parsed information into @toc1. 405 * @state->bdev and return the parsed information into @toc1.
406 * 406 *
407 * The offsets and sizes of the configs are range-checked against a privhead. 407 * The offsets and sizes of the configs are range-checked against a privhead.
408 * 408 *
409 * Return: 'true' @toc1 contains validated TOCBLOCK info 409 * Return: 'true' @toc1 contains validated TOCBLOCK info
410 * 'false' @toc1 contents are undefined 410 * 'false' @toc1 contents are undefined
411 */ 411 */
412static bool ldm_validate_tocblocks(struct block_device *bdev, 412static bool ldm_validate_tocblocks(struct parsed_partitions *state,
413 unsigned long base, struct ldmdb *ldb) 413 unsigned long base, struct ldmdb *ldb)
414{ 414{
415 static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4}; 415 static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4};
416 struct tocblock *tb[4]; 416 struct tocblock *tb[4];
@@ -420,7 +420,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev,
420 int i, nr_tbs; 420 int i, nr_tbs;
421 bool result = false; 421 bool result = false;
422 422
423 BUG_ON(!bdev || !ldb); 423 BUG_ON(!state || !ldb);
424 ph = &ldb->ph; 424 ph = &ldb->ph;
425 tb[0] = &ldb->toc; 425 tb[0] = &ldb->toc;
426 tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL); 426 tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL);
@@ -437,7 +437,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev,
437 * skip any that fail as long as we get at least one valid TOCBLOCK. 437 * skip any that fail as long as we get at least one valid TOCBLOCK.
438 */ 438 */
439 for (nr_tbs = i = 0; i < 4; i++) { 439 for (nr_tbs = i = 0; i < 4; i++) {
440 data = read_dev_sector(bdev, base + off[i], &sect); 440 data = read_part_sector(state, base + off[i], &sect);
441 if (!data) { 441 if (!data) {
442 ldm_error("Disk read failed for TOCBLOCK %d.", i); 442 ldm_error("Disk read failed for TOCBLOCK %d.", i);
443 continue; 443 continue;
@@ -473,7 +473,7 @@ err:
473 473
474/** 474/**
475 * ldm_validate_vmdb - Read the VMDB and validate it 475 * ldm_validate_vmdb - Read the VMDB and validate it
476 * @bdev: Device holding the LDM Database 476 * @state: Partition check state including device holding the LDM Database
477 * @base: Offset, into @bdev, of the database 477 * @base: Offset, into @bdev, of the database
478 * @ldb: Cache of the database structures 478 * @ldb: Cache of the database structures
479 * 479 *
@@ -483,8 +483,8 @@ err:
483 * Return: 'true' @ldb contains validated VBDB info 483 * Return: 'true' @ldb contains validated VBDB info
484 * 'false' @ldb contents are undefined 484 * 'false' @ldb contents are undefined
485 */ 485 */
486static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base, 486static bool ldm_validate_vmdb(struct parsed_partitions *state,
487 struct ldmdb *ldb) 487 unsigned long base, struct ldmdb *ldb)
488{ 488{
489 Sector sect; 489 Sector sect;
490 u8 *data; 490 u8 *data;
@@ -492,12 +492,12 @@ static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base,
492 struct vmdb *vm; 492 struct vmdb *vm;
493 struct tocblock *toc; 493 struct tocblock *toc;
494 494
495 BUG_ON (!bdev || !ldb); 495 BUG_ON (!state || !ldb);
496 496
497 vm = &ldb->vm; 497 vm = &ldb->vm;
498 toc = &ldb->toc; 498 toc = &ldb->toc;
499 499
500 data = read_dev_sector (bdev, base + OFF_VMDB, &sect); 500 data = read_part_sector(state, base + OFF_VMDB, &sect);
501 if (!data) { 501 if (!data) {
502 ldm_crit ("Disk read failed."); 502 ldm_crit ("Disk read failed.");
503 return false; 503 return false;
@@ -534,21 +534,21 @@ out:
534 534
535/** 535/**
536 * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk 536 * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk
537 * @bdev: Device holding the LDM Database 537 * @state: Partition check state including device holding the LDM Database
538 * 538 *
539 * This function provides a weak test to decide whether the device is a dynamic 539 * This function provides a weak test to decide whether the device is a dynamic
540 * disk or not. It looks for an MS-DOS-style partition table containing at 540 * disk or not. It looks for an MS-DOS-style partition table containing at
541 * least one partition of type 0x42 (formerly SFS, now used by Windows for 541 * least one partition of type 0x42 (formerly SFS, now used by Windows for
542 * dynamic disks). 542 * dynamic disks).
543 * 543 *
544 * N.B. The only possible error can come from the read_dev_sector and that is 544 * N.B. The only possible error can come from the read_part_sector and that is
545 * only likely to happen if the underlying device is strange. If that IS 545 * only likely to happen if the underlying device is strange. If that IS
546 * the case we should return zero to let someone else try. 546 * the case we should return zero to let someone else try.
547 * 547 *
548 * Return: 'true' @bdev is a dynamic disk 548 * Return: 'true' @state->bdev is a dynamic disk
549 * 'false' @bdev is not a dynamic disk, or an error occurred 549 * 'false' @state->bdev is not a dynamic disk, or an error occurred
550 */ 550 */
551static bool ldm_validate_partition_table (struct block_device *bdev) 551static bool ldm_validate_partition_table(struct parsed_partitions *state)
552{ 552{
553 Sector sect; 553 Sector sect;
554 u8 *data; 554 u8 *data;
@@ -556,9 +556,9 @@ static bool ldm_validate_partition_table (struct block_device *bdev)
556 int i; 556 int i;
557 bool result = false; 557 bool result = false;
558 558
559 BUG_ON (!bdev); 559 BUG_ON(!state);
560 560
561 data = read_dev_sector (bdev, 0, &sect); 561 data = read_part_sector(state, 0, &sect);
562 if (!data) { 562 if (!data) {
563 ldm_crit ("Disk read failed."); 563 ldm_crit ("Disk read failed.");
564 return false; 564 return false;
@@ -1391,8 +1391,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
1391 1391
1392/** 1392/**
1393 * ldm_get_vblks - Read the on-disk database of VBLKs into memory 1393 * ldm_get_vblks - Read the on-disk database of VBLKs into memory
1394 * @bdev: Device holding the LDM Database 1394 * @state: Partition check state including device holding the LDM Database
1395 * @base: Offset, into @bdev, of the database 1395 * @base: Offset, into @state->bdev, of the database
1396 * @ldb: Cache of the database structures 1396 * @ldb: Cache of the database structures
1397 * 1397 *
1398 * To use the information from the VBLKs, they need to be read from the disk, 1398 * To use the information from the VBLKs, they need to be read from the disk,
@@ -1401,8 +1401,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
1401 * Return: 'true' All the VBLKs were read successfully 1401 * Return: 'true' All the VBLKs were read successfully
1402 * 'false' An error occurred 1402 * 'false' An error occurred
1403 */ 1403 */
1404static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, 1404static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base,
1405 struct ldmdb *ldb) 1405 struct ldmdb *ldb)
1406{ 1406{
1407 int size, perbuf, skip, finish, s, v, recs; 1407 int size, perbuf, skip, finish, s, v, recs;
1408 u8 *data = NULL; 1408 u8 *data = NULL;
@@ -1410,7 +1410,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
1410 bool result = false; 1410 bool result = false;
1411 LIST_HEAD (frags); 1411 LIST_HEAD (frags);
1412 1412
1413 BUG_ON (!bdev || !ldb); 1413 BUG_ON(!state || !ldb);
1414 1414
1415 size = ldb->vm.vblk_size; 1415 size = ldb->vm.vblk_size;
1416 perbuf = 512 / size; 1416 perbuf = 512 / size;
@@ -1418,7 +1418,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
1418 finish = (size * ldb->vm.last_vblk_seq) >> 9; 1418 finish = (size * ldb->vm.last_vblk_seq) >> 9;
1419 1419
1420 for (s = skip; s < finish; s++) { /* For each sector */ 1420 for (s = skip; s < finish; s++) { /* For each sector */
1421 data = read_dev_sector (bdev, base + OFF_VMDB + s, &sect); 1421 data = read_part_sector(state, base + OFF_VMDB + s, &sect);
1422 if (!data) { 1422 if (!data) {
1423 ldm_crit ("Disk read failed."); 1423 ldm_crit ("Disk read failed.");
1424 goto out; 1424 goto out;
@@ -1474,8 +1474,7 @@ static void ldm_free_vblks (struct list_head *lh)
1474 1474
1475/** 1475/**
1476 * ldm_partition - Find out whether a device is a dynamic disk and handle it 1476 * ldm_partition - Find out whether a device is a dynamic disk and handle it
1477 * @pp: List of the partitions parsed so far 1477 * @state: Partition check state including device holding the LDM Database
1478 * @bdev: Device holding the LDM Database
1479 * 1478 *
1480 * This determines whether the device @bdev is a dynamic disk and if so creates 1479 * This determines whether the device @bdev is a dynamic disk and if so creates
1481 * the partitions necessary in the gendisk structure pointed to by @hd. 1480 * the partitions necessary in the gendisk structure pointed to by @hd.
@@ -1485,21 +1484,21 @@ static void ldm_free_vblks (struct list_head *lh)
1485 * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, 1484 * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3,
1486 * and so on: the actual data containing partitions. 1485 * and so on: the actual data containing partitions.
1487 * 1486 *
1488 * Return: 1 Success, @bdev is a dynamic disk and we handled it 1487 * Return: 1 Success, @state->bdev is a dynamic disk and we handled it
1489 * 0 Success, @bdev is not a dynamic disk 1488 * 0 Success, @state->bdev is not a dynamic disk
1490 * -1 An error occurred before enough information had been read 1489 * -1 An error occurred before enough information had been read
1491 * Or @bdev is a dynamic disk, but it may be corrupted 1490 * Or @state->bdev is a dynamic disk, but it may be corrupted
1492 */ 1491 */
1493int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev) 1492int ldm_partition(struct parsed_partitions *state)
1494{ 1493{
1495 struct ldmdb *ldb; 1494 struct ldmdb *ldb;
1496 unsigned long base; 1495 unsigned long base;
1497 int result = -1; 1496 int result = -1;
1498 1497
1499 BUG_ON (!pp || !bdev); 1498 BUG_ON(!state);
1500 1499
1501 /* Look for signs of a Dynamic Disk */ 1500 /* Look for signs of a Dynamic Disk */
1502 if (!ldm_validate_partition_table (bdev)) 1501 if (!ldm_validate_partition_table(state))
1503 return 0; 1502 return 0;
1504 1503
1505 ldb = kmalloc (sizeof (*ldb), GFP_KERNEL); 1504 ldb = kmalloc (sizeof (*ldb), GFP_KERNEL);
@@ -1509,15 +1508,15 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
1509 } 1508 }
1510 1509
1511 /* Parse and check privheads. */ 1510 /* Parse and check privheads. */
1512 if (!ldm_validate_privheads (bdev, &ldb->ph)) 1511 if (!ldm_validate_privheads(state, &ldb->ph))
1513 goto out; /* Already logged */ 1512 goto out; /* Already logged */
1514 1513
1515 /* All further references are relative to base (database start). */ 1514 /* All further references are relative to base (database start). */
1516 base = ldb->ph.config_start; 1515 base = ldb->ph.config_start;
1517 1516
1518 /* Parse and check tocs and vmdb. */ 1517 /* Parse and check tocs and vmdb. */
1519 if (!ldm_validate_tocblocks (bdev, base, ldb) || 1518 if (!ldm_validate_tocblocks(state, base, ldb) ||
1520 !ldm_validate_vmdb (bdev, base, ldb)) 1519 !ldm_validate_vmdb(state, base, ldb))
1521 goto out; /* Already logged */ 1520 goto out; /* Already logged */
1522 1521
1523 /* Initialize vblk lists in ldmdb struct */ 1522 /* Initialize vblk lists in ldmdb struct */
@@ -1527,13 +1526,13 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
1527 INIT_LIST_HEAD (&ldb->v_comp); 1526 INIT_LIST_HEAD (&ldb->v_comp);
1528 INIT_LIST_HEAD (&ldb->v_part); 1527 INIT_LIST_HEAD (&ldb->v_part);
1529 1528
1530 if (!ldm_get_vblks (bdev, base, ldb)) { 1529 if (!ldm_get_vblks(state, base, ldb)) {
1531 ldm_crit ("Failed to read the VBLKs from the database."); 1530 ldm_crit ("Failed to read the VBLKs from the database.");
1532 goto cleanup; 1531 goto cleanup;
1533 } 1532 }
1534 1533
1535 /* Finally, create the data partition devices. */ 1534 /* Finally, create the data partition devices. */
1536 if (ldm_create_data_partitions (pp, ldb)) { 1535 if (ldm_create_data_partitions(state, ldb)) {
1537 ldm_debug ("Parsed LDM database successfully."); 1536 ldm_debug ("Parsed LDM database successfully.");
1538 result = 1; 1537 result = 1;
1539 } 1538 }
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h
index 30e08e809c1d..d1fb50b28d86 100644
--- a/fs/partitions/ldm.h
+++ b/fs/partitions/ldm.h
@@ -209,7 +209,7 @@ struct ldmdb { /* Cache of the database */
209 struct list_head v_part; 209 struct list_head v_part;
210}; 210};
211 211
212int ldm_partition (struct parsed_partitions *state, struct block_device *bdev); 212int ldm_partition(struct parsed_partitions *state);
213 213
214#endif /* _FS_PT_LDM_H_ */ 214#endif /* _FS_PT_LDM_H_ */
215 215
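The kernel-doc above pins down ldm_partition()'s tri-state result: 1 handled, 0 not a dynamic disk, -1 read error or corrupt database. A condensed sketch of how a caller such as check_partition() is expected to consume that convention -- the surrounding variables here are hypothetical, only the return semantics come from the comment above:

	/* Sketch: probing one parser out of a table of candidates. */
	res = ldm_partition(state);
	if (res > 0)
		return state;		/* recognized and parsed, stop probing */
	if (res < 0 && !err)
		err = res;		/* remember the failure, but let other parsers try */
	/* res == 0: not this format, fall through to the next entry */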
diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c
index d4a0fad3563b..13e27b0082f2 100644
--- a/fs/partitions/mac.c
+++ b/fs/partitions/mac.c
@@ -27,7 +27,7 @@ static inline void mac_fix_string(char *stg, int len)
27 stg[i] = 0; 27 stg[i] = 0;
28} 28}
29 29
30int mac_partition(struct parsed_partitions *state, struct block_device *bdev) 30int mac_partition(struct parsed_partitions *state)
31{ 31{
32 int slot = 1; 32 int slot = 1;
33 Sector sect; 33 Sector sect;
@@ -42,7 +42,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
42 struct mac_driver_desc *md; 42 struct mac_driver_desc *md;
43 43
44 /* Get 0th block and look at the first partition map entry. */ 44 /* Get 0th block and look at the first partition map entry. */
45 md = (struct mac_driver_desc *) read_dev_sector(bdev, 0, &sect); 45 md = read_part_sector(state, 0, &sect);
46 if (!md) 46 if (!md)
47 return -1; 47 return -1;
48 if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) { 48 if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) {
@@ -51,7 +51,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
51 } 51 }
52 secsize = be16_to_cpu(md->block_size); 52 secsize = be16_to_cpu(md->block_size);
53 put_dev_sector(sect); 53 put_dev_sector(sect);
54 data = read_dev_sector(bdev, secsize/512, &sect); 54 data = read_part_sector(state, secsize/512, &sect);
55 if (!data) 55 if (!data)
56 return -1; 56 return -1;
57 part = (struct mac_partition *) (data + secsize%512); 57 part = (struct mac_partition *) (data + secsize%512);
@@ -64,7 +64,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
64 for (blk = 1; blk <= blocks_in_map; ++blk) { 64 for (blk = 1; blk <= blocks_in_map; ++blk) {
65 int pos = blk * secsize; 65 int pos = blk * secsize;
66 put_dev_sector(sect); 66 put_dev_sector(sect);
67 data = read_dev_sector(bdev, pos/512, &sect); 67 data = read_part_sector(state, pos/512, &sect);
68 if (!data) 68 if (!data)
69 return -1; 69 return -1;
70 part = (struct mac_partition *) (data + pos%512); 70 part = (struct mac_partition *) (data + pos%512);
@@ -123,7 +123,8 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
123 } 123 }
124#ifdef CONFIG_PPC_PMAC 124#ifdef CONFIG_PPC_PMAC
125 if (found_root_goodness) 125 if (found_root_goodness)
126 note_bootable_part(bdev->bd_dev, found_root, found_root_goodness); 126 note_bootable_part(state->bdev->bd_dev, found_root,
127 found_root_goodness);
127#endif 128#endif
128 129
129 put_dev_sector(sect); 130 put_dev_sector(sect);
diff --git a/fs/partitions/mac.h b/fs/partitions/mac.h
index bbf26e1386fa..3c7d98436380 100644
--- a/fs/partitions/mac.h
+++ b/fs/partitions/mac.h
@@ -41,4 +41,4 @@ struct mac_driver_desc {
41 /* ... more stuff */ 41 /* ... more stuff */
42}; 42};
43 43
44int mac_partition(struct parsed_partitions *state, struct block_device *bdev); 44int mac_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 90be97f1f5a8..645a68d8c055 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -64,7 +64,7 @@ msdos_magic_present(unsigned char *p)
64#define AIX_LABEL_MAGIC2 0xC2 64#define AIX_LABEL_MAGIC2 0xC2
65#define AIX_LABEL_MAGIC3 0xD4 65#define AIX_LABEL_MAGIC3 0xD4
66#define AIX_LABEL_MAGIC4 0xC1 66#define AIX_LABEL_MAGIC4 0xC1
67static int aix_magic_present(unsigned char *p, struct block_device *bdev) 67static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
68{ 68{
69 struct partition *pt = (struct partition *) (p + 0x1be); 69 struct partition *pt = (struct partition *) (p + 0x1be);
70 Sector sect; 70 Sector sect;
@@ -85,7 +85,7 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
85 is_extended_partition(pt)) 85 is_extended_partition(pt))
86 return 0; 86 return 0;
87 } 87 }
88 d = read_dev_sector(bdev, 7, &sect); 88 d = read_part_sector(state, 7, &sect);
89 if (d) { 89 if (d) {
90 if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') 90 if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M')
91 ret = 1; 91 ret = 1;
@@ -105,15 +105,14 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
105 * only for the actual data partitions. 105 * only for the actual data partitions.
106 */ 106 */
107 107
108static void 108static void parse_extended(struct parsed_partitions *state,
109parse_extended(struct parsed_partitions *state, struct block_device *bdev, 109 sector_t first_sector, sector_t first_size)
110 sector_t first_sector, sector_t first_size)
111{ 110{
112 struct partition *p; 111 struct partition *p;
113 Sector sect; 112 Sector sect;
114 unsigned char *data; 113 unsigned char *data;
115 sector_t this_sector, this_size; 114 sector_t this_sector, this_size;
116 sector_t sector_size = bdev_logical_block_size(bdev) / 512; 115 sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
117 int loopct = 0; /* number of links followed 116 int loopct = 0; /* number of links followed
118 without finding a data partition */ 117 without finding a data partition */
119 int i; 118 int i;
@@ -126,7 +125,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
126 return; 125 return;
127 if (state->next == state->limit) 126 if (state->next == state->limit)
128 return; 127 return;
129 data = read_dev_sector(bdev, this_sector, &sect); 128 data = read_part_sector(state, this_sector, &sect);
130 if (!data) 129 if (!data)
131 return; 130 return;
132 131
@@ -198,9 +197,8 @@ done:
198/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also 197/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also
199 indicates linux swap. Be careful before believing this is Solaris. */ 198 indicates linux swap. Be careful before believing this is Solaris. */
200 199
201static void 200static void parse_solaris_x86(struct parsed_partitions *state,
202parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, 201 sector_t offset, sector_t size, int origin)
203 sector_t offset, sector_t size, int origin)
204{ 202{
205#ifdef CONFIG_SOLARIS_X86_PARTITION 203#ifdef CONFIG_SOLARIS_X86_PARTITION
206 Sector sect; 204 Sector sect;
@@ -208,7 +206,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
208 int i; 206 int i;
209 short max_nparts; 207 short max_nparts;
210 208
211 v = (struct solaris_x86_vtoc *)read_dev_sector(bdev, offset+1, &sect); 209 v = read_part_sector(state, offset + 1, &sect);
212 if (!v) 210 if (!v)
213 return; 211 return;
214 if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) { 212 if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) {
@@ -245,16 +243,15 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
245 * Create devices for BSD partitions listed in a disklabel, under a 243 * Create devices for BSD partitions listed in a disklabel, under a
246 * dos-like partition. See parse_extended() for more information. 244 * dos-like partition. See parse_extended() for more information.
247 */ 245 */
248static void 246static void parse_bsd(struct parsed_partitions *state,
249parse_bsd(struct parsed_partitions *state, struct block_device *bdev, 247 sector_t offset, sector_t size, int origin, char *flavour,
250 sector_t offset, sector_t size, int origin, char *flavour, 248 int max_partitions)
251 int max_partitions)
252{ 249{
253 Sector sect; 250 Sector sect;
254 struct bsd_disklabel *l; 251 struct bsd_disklabel *l;
255 struct bsd_partition *p; 252 struct bsd_partition *p;
256 253
257 l = (struct bsd_disklabel *)read_dev_sector(bdev, offset+1, &sect); 254 l = read_part_sector(state, offset + 1, &sect);
258 if (!l) 255 if (!l)
259 return; 256 return;
260 if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) { 257 if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) {
@@ -291,33 +288,28 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
291} 288}
292#endif 289#endif
293 290
294static void 291static void parse_freebsd(struct parsed_partitions *state,
295parse_freebsd(struct parsed_partitions *state, struct block_device *bdev, 292 sector_t offset, sector_t size, int origin)
296 sector_t offset, sector_t size, int origin)
297{ 293{
298#ifdef CONFIG_BSD_DISKLABEL 294#ifdef CONFIG_BSD_DISKLABEL
299 parse_bsd(state, bdev, offset, size, origin, 295 parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS);
300 "bsd", BSD_MAXPARTITIONS);
301#endif 296#endif
302} 297}
303 298
304static void 299static void parse_netbsd(struct parsed_partitions *state,
305parse_netbsd(struct parsed_partitions *state, struct block_device *bdev, 300 sector_t offset, sector_t size, int origin)
306 sector_t offset, sector_t size, int origin)
307{ 301{
308#ifdef CONFIG_BSD_DISKLABEL 302#ifdef CONFIG_BSD_DISKLABEL
309 parse_bsd(state, bdev, offset, size, origin, 303 parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS);
310 "netbsd", BSD_MAXPARTITIONS);
311#endif 304#endif
312} 305}
313 306
314static void 307static void parse_openbsd(struct parsed_partitions *state,
315parse_openbsd(struct parsed_partitions *state, struct block_device *bdev, 308 sector_t offset, sector_t size, int origin)
316 sector_t offset, sector_t size, int origin)
317{ 309{
318#ifdef CONFIG_BSD_DISKLABEL 310#ifdef CONFIG_BSD_DISKLABEL
319 parse_bsd(state, bdev, offset, size, origin, 311 parse_bsd(state, offset, size, origin, "openbsd",
320 "openbsd", OPENBSD_MAXPARTITIONS); 312 OPENBSD_MAXPARTITIONS);
321#endif 313#endif
322} 314}
323 315
@@ -325,16 +317,15 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
325 * Create devices for Unixware partitions listed in a disklabel, under a 317 * Create devices for Unixware partitions listed in a disklabel, under a
326 * dos-like partition. See parse_extended() for more information. 318 * dos-like partition. See parse_extended() for more information.
327 */ 319 */
328static void 320static void parse_unixware(struct parsed_partitions *state,
329parse_unixware(struct parsed_partitions *state, struct block_device *bdev, 321 sector_t offset, sector_t size, int origin)
330 sector_t offset, sector_t size, int origin)
331{ 322{
332#ifdef CONFIG_UNIXWARE_DISKLABEL 323#ifdef CONFIG_UNIXWARE_DISKLABEL
333 Sector sect; 324 Sector sect;
334 struct unixware_disklabel *l; 325 struct unixware_disklabel *l;
335 struct unixware_slice *p; 326 struct unixware_slice *p;
336 327
337 l = (struct unixware_disklabel *)read_dev_sector(bdev, offset+29, &sect); 328 l = read_part_sector(state, offset + 29, &sect);
338 if (!l) 329 if (!l)
339 return; 330 return;
340 if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC || 331 if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC ||
@@ -365,9 +356,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
365 * Anand Krishnamurthy <anandk@wiproge.med.ge.com> 356 * Anand Krishnamurthy <anandk@wiproge.med.ge.com>
366 * Rajeev V. Pillai <rajeevvp@yahoo.com> 357 * Rajeev V. Pillai <rajeevvp@yahoo.com>
367 */ 358 */
368static void 359static void parse_minix(struct parsed_partitions *state,
369parse_minix(struct parsed_partitions *state, struct block_device *bdev, 360 sector_t offset, sector_t size, int origin)
370 sector_t offset, sector_t size, int origin)
371{ 361{
372#ifdef CONFIG_MINIX_SUBPARTITION 362#ifdef CONFIG_MINIX_SUBPARTITION
373 Sector sect; 363 Sector sect;
@@ -375,7 +365,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
375 struct partition *p; 365 struct partition *p;
376 int i; 366 int i;
377 367
378 data = read_dev_sector(bdev, offset, &sect); 368 data = read_part_sector(state, offset, &sect);
379 if (!data) 369 if (!data)
380 return; 370 return;
381 371
@@ -404,8 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
404 394
405static struct { 395static struct {
406 unsigned char id; 396 unsigned char id;
407 void (*parse)(struct parsed_partitions *, struct block_device *, 397 void (*parse)(struct parsed_partitions *, sector_t, sector_t, int);
408 sector_t, sector_t, int);
409} subtypes[] = { 398} subtypes[] = {
410 {FREEBSD_PARTITION, parse_freebsd}, 399 {FREEBSD_PARTITION, parse_freebsd},
411 {NETBSD_PARTITION, parse_netbsd}, 400 {NETBSD_PARTITION, parse_netbsd},
@@ -417,16 +406,16 @@ static struct {
417 {0, NULL}, 406 {0, NULL},
418}; 407};
419 408
420int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) 409int msdos_partition(struct parsed_partitions *state)
421{ 410{
422 sector_t sector_size = bdev_logical_block_size(bdev) / 512; 411 sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
423 Sector sect; 412 Sector sect;
424 unsigned char *data; 413 unsigned char *data;
425 struct partition *p; 414 struct partition *p;
426 struct fat_boot_sector *fb; 415 struct fat_boot_sector *fb;
427 int slot; 416 int slot;
428 417
429 data = read_dev_sector(bdev, 0, &sect); 418 data = read_part_sector(state, 0, &sect);
430 if (!data) 419 if (!data)
431 return -1; 420 return -1;
432 if (!msdos_magic_present(data + 510)) { 421 if (!msdos_magic_present(data + 510)) {
@@ -434,7 +423,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
434 return 0; 423 return 0;
435 } 424 }
436 425
437 if (aix_magic_present(data, bdev)) { 426 if (aix_magic_present(state, data)) {
438 put_dev_sector(sect); 427 put_dev_sector(sect);
439 printk( " [AIX]"); 428 printk( " [AIX]");
440 return 0; 429 return 0;
@@ -503,7 +492,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
503 put_partition(state, slot, start, n); 492 put_partition(state, slot, start, n);
504 493
505 printk(" <"); 494 printk(" <");
506 parse_extended(state, bdev, start, size); 495 parse_extended(state, start, size);
507 printk(" >"); 496 printk(" >");
508 continue; 497 continue;
509 } 498 }
@@ -532,8 +521,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
532 521
533 if (!subtypes[n].parse) 522 if (!subtypes[n].parse)
534 continue; 523 continue;
535 subtypes[n].parse(state, bdev, start_sect(p)*sector_size, 524 subtypes[n].parse(state, start_sect(p) * sector_size,
536 nr_sects(p)*sector_size, slot); 525 nr_sects(p) * sector_size, slot);
537 } 526 }
538 put_dev_sector(sect); 527 put_dev_sector(sect);
539 return 1; 528 return 1;
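One detail the conversion leaves untouched is the unit scaling at the top of parse_extended() and msdos_partition(): partition-table entries count device logical blocks, while the partition core works in 512-byte sectors, so every start and size is multiplied by bdev_logical_block_size(state->bdev) / 512. Worked through for an assumed 4096-byte logical block size:

	sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;	/* 4096 / 512 = 8 */
	sector_t start = start_sect(p) * sector_size;	/* table entry 256  -> sector 2048 */
	sector_t size  = nr_sects(p) * sector_size;	/* table entry 1024 -> 8192 sectors */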
diff --git a/fs/partitions/msdos.h b/fs/partitions/msdos.h
index 01e5e0b6902d..38c781c490b3 100644
--- a/fs/partitions/msdos.h
+++ b/fs/partitions/msdos.h
@@ -4,5 +4,5 @@
4 4
5#define MSDOS_LABEL_MAGIC 0xAA55 5#define MSDOS_LABEL_MAGIC 0xAA55
6 6
7int msdos_partition(struct parsed_partitions *state, struct block_device *bdev); 7int msdos_partition(struct parsed_partitions *state);
8 8
diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index c05c17bc5df3..fc22b85d436a 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -10,7 +10,7 @@
10#include "check.h" 10#include "check.h"
11#include "osf.h" 11#include "osf.h"
12 12
13int osf_partition(struct parsed_partitions *state, struct block_device *bdev) 13int osf_partition(struct parsed_partitions *state)
14{ 14{
15 int i; 15 int i;
16 int slot = 1; 16 int slot = 1;
@@ -49,7 +49,7 @@ int osf_partition(struct parsed_partitions *state, struct block_device *bdev)
49 } * label; 49 } * label;
50 struct d_partition * partition; 50 struct d_partition * partition;
51 51
52 data = read_dev_sector(bdev, 0, &sect); 52 data = read_part_sector(state, 0, &sect);
53 if (!data) 53 if (!data)
54 return -1; 54 return -1;
55 55
diff --git a/fs/partitions/osf.h b/fs/partitions/osf.h
index 427b8eab314b..20ed2315ec16 100644
--- a/fs/partitions/osf.h
+++ b/fs/partitions/osf.h
@@ -4,4 +4,4 @@
4 4
5#define DISKLABELMAGIC (0x82564557UL) 5#define DISKLABELMAGIC (0x82564557UL)
6 6
7int osf_partition(struct parsed_partitions *state, struct block_device *bdev); 7int osf_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c
index ed5ac83fe83a..43b1df9aa16c 100644
--- a/fs/partitions/sgi.c
+++ b/fs/partitions/sgi.c
@@ -27,7 +27,7 @@ struct sgi_disklabel {
27 __be32 _unused1; /* Padding */ 27 __be32 _unused1; /* Padding */
28}; 28};
29 29
30int sgi_partition(struct parsed_partitions *state, struct block_device *bdev) 30int sgi_partition(struct parsed_partitions *state)
31{ 31{
32 int i, csum; 32 int i, csum;
33 __be32 magic; 33 __be32 magic;
@@ -39,7 +39,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
39 struct sgi_partition *p; 39 struct sgi_partition *p;
40 char b[BDEVNAME_SIZE]; 40 char b[BDEVNAME_SIZE];
41 41
42 label = (struct sgi_disklabel *) read_dev_sector(bdev, 0, &sect); 42 label = read_part_sector(state, 0, &sect);
43 if (!label) 43 if (!label)
44 return -1; 44 return -1;
45 p = &label->partitions[0]; 45 p = &label->partitions[0];
@@ -57,7 +57,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
57 } 57 }
58 if(csum) { 58 if(csum) {
59 printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", 59 printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n",
60 bdevname(bdev, b)); 60 bdevname(state->bdev, b));
61 put_dev_sector(sect); 61 put_dev_sector(sect);
62 return 0; 62 return 0;
63 } 63 }
diff --git a/fs/partitions/sgi.h b/fs/partitions/sgi.h
index 5d5595c09928..b9553ebdd5a9 100644
--- a/fs/partitions/sgi.h
+++ b/fs/partitions/sgi.h
@@ -2,7 +2,7 @@
2 * fs/partitions/sgi.h 2 * fs/partitions/sgi.h
3 */ 3 */
4 4
5extern int sgi_partition(struct parsed_partitions *state, struct block_device *bdev); 5extern int sgi_partition(struct parsed_partitions *state);
6 6
7#define SGI_LABEL_MAGIC 0x0be5a941 7#define SGI_LABEL_MAGIC 0x0be5a941
8 8
diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c
index c95e6a62c01d..a32660e25f7f 100644
--- a/fs/partitions/sun.c
+++ b/fs/partitions/sun.c
@@ -10,7 +10,7 @@
10#include "check.h" 10#include "check.h"
11#include "sun.h" 11#include "sun.h"
12 12
13int sun_partition(struct parsed_partitions *state, struct block_device *bdev) 13int sun_partition(struct parsed_partitions *state)
14{ 14{
15 int i; 15 int i;
16 __be16 csum; 16 __be16 csum;
@@ -61,7 +61,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
61 int use_vtoc; 61 int use_vtoc;
62 int nparts; 62 int nparts;
63 63
64 label = (struct sun_disklabel *)read_dev_sector(bdev, 0, &sect); 64 label = read_part_sector(state, 0, &sect);
65 if (!label) 65 if (!label)
66 return -1; 66 return -1;
67 67
@@ -78,7 +78,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
78 csum ^= *ush--; 78 csum ^= *ush--;
79 if (csum) { 79 if (csum) {
80 printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", 80 printk("Dev %s Sun disklabel: Csum bad, label corrupted\n",
81 bdevname(bdev, b)); 81 bdevname(state->bdev, b));
82 put_dev_sector(sect); 82 put_dev_sector(sect);
83 return 0; 83 return 0;
84 } 84 }
diff --git a/fs/partitions/sun.h b/fs/partitions/sun.h
index 7f864d1f86d4..2424baa8319f 100644
--- a/fs/partitions/sun.h
+++ b/fs/partitions/sun.h
@@ -5,4 +5,4 @@
5#define SUN_LABEL_MAGIC 0xDABE 5#define SUN_LABEL_MAGIC 0xDABE
6#define SUN_VTOC_SANITY 0x600DDEEE 6#define SUN_VTOC_SANITY 0x600DDEEE
7 7
8int sun_partition(struct parsed_partitions *state, struct block_device *bdev); 8int sun_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c
index 4eba27b78643..9030c864428e 100644
--- a/fs/partitions/sysv68.c
+++ b/fs/partitions/sysv68.c
@@ -46,7 +46,7 @@ struct slice {
46}; 46};
47 47
48 48
49int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev) 49int sysv68_partition(struct parsed_partitions *state)
50{ 50{
51 int i, slices; 51 int i, slices;
52 int slot = 1; 52 int slot = 1;
@@ -55,7 +55,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
55 struct dkblk0 *b; 55 struct dkblk0 *b;
56 struct slice *slice; 56 struct slice *slice;
57 57
58 data = read_dev_sector(bdev, 0, &sect); 58 data = read_part_sector(state, 0, &sect);
59 if (!data) 59 if (!data)
60 return -1; 60 return -1;
61 61
@@ -68,7 +68,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
68 i = be32_to_cpu(b->dk_ios.ios_slcblk); 68 i = be32_to_cpu(b->dk_ios.ios_slcblk);
69 put_dev_sector(sect); 69 put_dev_sector(sect);
70 70
71 data = read_dev_sector(bdev, i, &sect); 71 data = read_part_sector(state, i, &sect);
72 if (!data) 72 if (!data)
73 return -1; 73 return -1;
74 74
diff --git a/fs/partitions/sysv68.h b/fs/partitions/sysv68.h
index fa733f68431b..bf2f5ffa97ac 100644
--- a/fs/partitions/sysv68.h
+++ b/fs/partitions/sysv68.h
@@ -1 +1 @@
extern int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev); extern int sysv68_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c
index ec852c11dce4..db9eef260364 100644
--- a/fs/partitions/ultrix.c
+++ b/fs/partitions/ultrix.c
@@ -9,7 +9,7 @@
9#include "check.h" 9#include "check.h"
10#include "ultrix.h" 10#include "ultrix.h"
11 11
12int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev) 12int ultrix_partition(struct parsed_partitions *state)
13{ 13{
14 int i; 14 int i;
15 Sector sect; 15 Sector sect;
@@ -26,7 +26,7 @@ int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev)
26#define PT_MAGIC 0x032957 /* Partition magic number */ 26#define PT_MAGIC 0x032957 /* Partition magic number */
27#define PT_VALID 1 /* Indicates if struct is valid */ 27#define PT_VALID 1 /* Indicates if struct is valid */
28 28
29 data = read_dev_sector(bdev, (16384 - sizeof(*label))/512, &sect); 29 data = read_part_sector(state, (16384 - sizeof(*label))/512, &sect);
30 if (!data) 30 if (!data)
31 return -1; 31 return -1;
32 32
diff --git a/fs/partitions/ultrix.h b/fs/partitions/ultrix.h
index a74bf8e2d370..a3cc00b2bded 100644
--- a/fs/partitions/ultrix.h
+++ b/fs/partitions/ultrix.h
@@ -2,4 +2,4 @@
2 * fs/partitions/ultrix.h 2 * fs/partitions/ultrix.h
3 */ 3 */
4 4
5int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev); 5int ultrix_partition(struct parsed_partitions *state);
diff --git a/fs/pipe.c b/fs/pipe.c
index 37ba29ff3158..d79872eba09a 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -11,6 +11,7 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/log2.h>
14#include <linux/mount.h> 15#include <linux/mount.h>
15#include <linux/pipe_fs_i.h> 16#include <linux/pipe_fs_i.h>
16#include <linux/uio.h> 17#include <linux/uio.h>
@@ -18,11 +19,18 @@
18#include <linux/pagemap.h> 19#include <linux/pagemap.h>
19#include <linux/audit.h> 20#include <linux/audit.h>
20#include <linux/syscalls.h> 21#include <linux/syscalls.h>
22#include <linux/fcntl.h>
21 23
22#include <asm/uaccess.h> 24#include <asm/uaccess.h>
23#include <asm/ioctls.h> 25#include <asm/ioctls.h>
24 26
25/* 27/*
28 * The max size that a non-root user is allowed to grow the pipe. Can
29 * be set by root in /proc/sys/fs/pipe-max-pages
30 */
31unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16;
32
33/*
26 * We use a start+len construction, which provides full use of the 34 * We use a start+len construction, which provides full use of the
27 * allocated memory. 35 * allocated memory.
28 * -- Florian Coosmann (FGC) 36 * -- Florian Coosmann (FGC)
@@ -390,7 +398,7 @@ redo:
390 if (!buf->len) { 398 if (!buf->len) {
391 buf->ops = NULL; 399 buf->ops = NULL;
392 ops->release(pipe, buf); 400 ops->release(pipe, buf);
393 curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); 401 curbuf = (curbuf + 1) & (pipe->buffers - 1);
394 pipe->curbuf = curbuf; 402 pipe->curbuf = curbuf;
395 pipe->nrbufs = --bufs; 403 pipe->nrbufs = --bufs;
396 do_wakeup = 1; 404 do_wakeup = 1;
@@ -472,7 +480,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
472 chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ 480 chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
473 if (pipe->nrbufs && chars != 0) { 481 if (pipe->nrbufs && chars != 0) {
474 int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & 482 int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
475 (PIPE_BUFFERS-1); 483 (pipe->buffers - 1);
476 struct pipe_buffer *buf = pipe->bufs + lastbuf; 484 struct pipe_buffer *buf = pipe->bufs + lastbuf;
477 const struct pipe_buf_operations *ops = buf->ops; 485 const struct pipe_buf_operations *ops = buf->ops;
478 int offset = buf->offset + buf->len; 486 int offset = buf->offset + buf->len;
@@ -518,8 +526,8 @@ redo1:
518 break; 526 break;
519 } 527 }
520 bufs = pipe->nrbufs; 528 bufs = pipe->nrbufs;
521 if (bufs < PIPE_BUFFERS) { 529 if (bufs < pipe->buffers) {
522 int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); 530 int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
523 struct pipe_buffer *buf = pipe->bufs + newbuf; 531 struct pipe_buffer *buf = pipe->bufs + newbuf;
524 struct page *page = pipe->tmp_page; 532 struct page *page = pipe->tmp_page;
525 char *src; 533 char *src;
@@ -580,7 +588,7 @@ redo2:
580 if (!total_len) 588 if (!total_len)
581 break; 589 break;
582 } 590 }
583 if (bufs < PIPE_BUFFERS) 591 if (bufs < pipe->buffers)
584 continue; 592 continue;
585 if (filp->f_flags & O_NONBLOCK) { 593 if (filp->f_flags & O_NONBLOCK) {
586 if (!ret) 594 if (!ret)
@@ -640,7 +648,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
640 nrbufs = pipe->nrbufs; 648 nrbufs = pipe->nrbufs;
641 while (--nrbufs >= 0) { 649 while (--nrbufs >= 0) {
642 count += pipe->bufs[buf].len; 650 count += pipe->bufs[buf].len;
643 buf = (buf+1) & (PIPE_BUFFERS-1); 651 buf = (buf+1) & (pipe->buffers - 1);
644 } 652 }
645 mutex_unlock(&inode->i_mutex); 653 mutex_unlock(&inode->i_mutex);
646 654
@@ -671,7 +679,7 @@ pipe_poll(struct file *filp, poll_table *wait)
671 } 679 }
672 680
673 if (filp->f_mode & FMODE_WRITE) { 681 if (filp->f_mode & FMODE_WRITE) {
674 mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0; 682 mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
675 /* 683 /*
676 * Most Unices do not set POLLERR for FIFOs but on Linux they 684 * Most Unices do not set POLLERR for FIFOs but on Linux they
677 * behave exactly like pipes for poll(). 685 * behave exactly like pipes for poll().
@@ -877,25 +885,32 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
877 885
878 pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); 886 pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
879 if (pipe) { 887 if (pipe) {
880 init_waitqueue_head(&pipe->wait); 888 pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
881 pipe->r_counter = pipe->w_counter = 1; 889 if (pipe->bufs) {
882 pipe->inode = inode; 890 init_waitqueue_head(&pipe->wait);
891 pipe->r_counter = pipe->w_counter = 1;
892 pipe->inode = inode;
893 pipe->buffers = PIPE_DEF_BUFFERS;
894 return pipe;
895 }
896 kfree(pipe);
883 } 897 }
884 898
885 return pipe; 899 return NULL;
886} 900}
887 901
888void __free_pipe_info(struct pipe_inode_info *pipe) 902void __free_pipe_info(struct pipe_inode_info *pipe)
889{ 903{
890 int i; 904 int i;
891 905
892 for (i = 0; i < PIPE_BUFFERS; i++) { 906 for (i = 0; i < pipe->buffers; i++) {
893 struct pipe_buffer *buf = pipe->bufs + i; 907 struct pipe_buffer *buf = pipe->bufs + i;
894 if (buf->ops) 908 if (buf->ops)
895 buf->ops->release(pipe, buf); 909 buf->ops->release(pipe, buf);
896 } 910 }
897 if (pipe->tmp_page) 911 if (pipe->tmp_page)
898 __free_page(pipe->tmp_page); 912 __free_page(pipe->tmp_page);
913 kfree(pipe->bufs);
899 kfree(pipe); 914 kfree(pipe);
900} 915}
901 916
@@ -1094,6 +1109,89 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
1094} 1109}
1095 1110
1096/* 1111/*
1112 * Allocate a new array of pipe buffers and copy the info over. Returns the
1113 * pipe size if successful, or return -ERROR on error.
1114 */
1115static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
1116{
1117 struct pipe_buffer *bufs;
1118
1119 /*
1120 * Must be a power-of-2 currently
1121 */
1122 if (!is_power_of_2(arg))
1123 return -EINVAL;
1124
1125 /*
1126 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
1127 * expect a lot of shrink+grow operations, just free and allocate
1128 * again like we would do for growing. If the pipe currently
1129 * contains more buffers than arg, then return busy.
1130 */
1131 if (arg < pipe->nrbufs)
1132 return -EBUSY;
1133
1134 bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL);
1135 if (unlikely(!bufs))
1136 return -ENOMEM;
1137
1138 /*
1139 * The pipe array wraps around, so just start the new one at zero
1140 * and adjust the indexes.
1141 */
1142 if (pipe->nrbufs) {
1143 const unsigned int tail = pipe->nrbufs & (pipe->buffers - 1);
1144 const unsigned int head = pipe->nrbufs - tail;
1145
1146 if (head)
1147 memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
1148 if (tail)
1149 memcpy(bufs + head, pipe->bufs + pipe->curbuf, tail * sizeof(struct pipe_buffer));
1150 }
1151
1152 pipe->curbuf = 0;
1153 kfree(pipe->bufs);
1154 pipe->bufs = bufs;
1155 pipe->buffers = arg;
1156 return arg;
1157}
1158
1159long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
1160{
1161 struct pipe_inode_info *pipe;
1162 long ret;
1163
1164 pipe = file->f_path.dentry->d_inode->i_pipe;
1165 if (!pipe)
1166 return -EBADF;
1167
1168 mutex_lock(&pipe->inode->i_mutex);
1169
1170 switch (cmd) {
1171 case F_SETPIPE_SZ:
1172 if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages)
1173 return -EINVAL;
1174 /*
1175 * The pipe needs to be at least 2 pages large to
1176 * guarantee POSIX behaviour.
1177 */
1178 if (arg < 2)
1179 return -EINVAL;
1180 ret = pipe_set_size(pipe, arg);
1181 break;
1182 case F_GETPIPE_SZ:
1183 ret = pipe->buffers;
1184 break;
1185 default:
1186 ret = -EINVAL;
1187 break;
1188 }
1189
1190 mutex_unlock(&pipe->inode->i_mutex);
1191 return ret;
1192}
1193
1194/*
1097 * pipefs should _never_ be mounted by userland - too much of security hassle, 1195 * pipefs should _never_ be mounted by userland - too much of security hassle,
1098 * no real gain from having the whole whorehouse mounted. So we don't need 1196 * no real gain from having the whole whorehouse mounted. So we don't need
1099 * any operations on the root directory. However, we need a non-trivial 1197 * any operations on the root directory. However, we need a non-trivial
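pipe_fcntl() above is what the generic fcntl path (changed elsewhere in this patch) calls for the two new commands. Note that at this stage F_SETPIPE_SZ takes a buffer count in pages, not bytes: the argument must be a power of two, at least 2, and at most pipe_max_pages (/proc/sys/fs/pipe-max-pages) for unprivileged callers, and shrinking below the number of currently queued buffers fails with EBUSY. A minimal user-space sketch, assuming the F_*PIPE_SZ constants are visible via _GNU_SOURCE:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fds[2];

		if (pipe(fds))
			return 1;
		/* grow the ring to 64 one-page buffers */
		if (fcntl(fds[1], F_SETPIPE_SZ, 64) < 0)
			perror("F_SETPIPE_SZ");
		printf("pipe now has %d buffers\n", fcntl(fds[1], F_GETPIPE_SZ));
		close(fds[0]);
		close(fds[1]);
		return 0;
	}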
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 1d9c12714c5c..9977df9f3a54 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -147,7 +147,8 @@ static int reiserfs_sync_file(struct file *filp,
147 barrier_done = reiserfs_commit_for_inode(inode); 147 barrier_done = reiserfs_commit_for_inode(inode);
148 reiserfs_write_unlock(inode->i_sb); 148 reiserfs_write_unlock(inode->i_sb);
149 if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) 149 if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
150 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 150 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
151 BLKDEV_IFL_WAIT);
151 if (barrier_done < 0) 152 if (barrier_done < 0)
152 return barrier_done; 153 return barrier_done;
153 return (err < 0) ? -EIO : 0; 154 return (err < 0) ? -EIO : 0;
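This is one of many call sites converted to the extended flush interface. Judging from this caller alone, the new declaration in include/linux/blkdev.h (outside this excerpt) should look roughly as below -- the parameter names are guesses, only the argument order and the BLKDEV_IFL_WAIT flag are taken from the call above:

	int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
			       sector_t *error_sector, unsigned long flags);

Passing BLKDEV_IFL_WAIT presumably keeps the call synchronous, matching the behaviour of the old two-argument form.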
diff --git a/fs/splice.c b/fs/splice.c
index 9313b6124a2e..ac22b00d86c3 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -193,8 +193,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
193 break; 193 break;
194 } 194 }
195 195
196 if (pipe->nrbufs < PIPE_BUFFERS) { 196 if (pipe->nrbufs < pipe->buffers) {
197 int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); 197 int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
198 struct pipe_buffer *buf = pipe->bufs + newbuf; 198 struct pipe_buffer *buf = pipe->bufs + newbuf;
199 199
200 buf->page = spd->pages[page_nr]; 200 buf->page = spd->pages[page_nr];
@@ -214,7 +214,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
214 214
215 if (!--spd->nr_pages) 215 if (!--spd->nr_pages)
216 break; 216 break;
217 if (pipe->nrbufs < PIPE_BUFFERS) 217 if (pipe->nrbufs < pipe->buffers)
218 continue; 218 continue;
219 219
220 break; 220 break;
@@ -265,6 +265,36 @@ static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
265 page_cache_release(spd->pages[i]); 265 page_cache_release(spd->pages[i]);
266} 266}
267 267
268/*
269 * Check if we need to grow the arrays holding pages and partial page
270 * descriptions.
271 */
272int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
273{
274 if (pipe->buffers <= PIPE_DEF_BUFFERS)
275 return 0;
276
277 spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL);
278 spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL);
279
280 if (spd->pages && spd->partial)
281 return 0;
282
283 kfree(spd->pages);
284 kfree(spd->partial);
285 return -ENOMEM;
286}
287
288void splice_shrink_spd(struct pipe_inode_info *pipe,
289 struct splice_pipe_desc *spd)
290{
291 if (pipe->buffers <= PIPE_DEF_BUFFERS)
292 return;
293
294 kfree(spd->pages);
295 kfree(spd->partial);
296}
297
268static int 298static int
269__generic_file_splice_read(struct file *in, loff_t *ppos, 299__generic_file_splice_read(struct file *in, loff_t *ppos,
270 struct pipe_inode_info *pipe, size_t len, 300 struct pipe_inode_info *pipe, size_t len,
@@ -272,8 +302,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
272{ 302{
273 struct address_space *mapping = in->f_mapping; 303 struct address_space *mapping = in->f_mapping;
274 unsigned int loff, nr_pages, req_pages; 304 unsigned int loff, nr_pages, req_pages;
275 struct page *pages[PIPE_BUFFERS]; 305 struct page *pages[PIPE_DEF_BUFFERS];
276 struct partial_page partial[PIPE_BUFFERS]; 306 struct partial_page partial[PIPE_DEF_BUFFERS];
277 struct page *page; 307 struct page *page;
278 pgoff_t index, end_index; 308 pgoff_t index, end_index;
279 loff_t isize; 309 loff_t isize;
@@ -286,15 +316,18 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
286 .spd_release = spd_release_page, 316 .spd_release = spd_release_page,
287 }; 317 };
288 318
319 if (splice_grow_spd(pipe, &spd))
320 return -ENOMEM;
321
289 index = *ppos >> PAGE_CACHE_SHIFT; 322 index = *ppos >> PAGE_CACHE_SHIFT;
290 loff = *ppos & ~PAGE_CACHE_MASK; 323 loff = *ppos & ~PAGE_CACHE_MASK;
291 req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 324 req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
292 nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS); 325 nr_pages = min(req_pages, pipe->buffers);
293 326
294 /* 327 /*
295 * Lookup the (hopefully) full range of pages we need. 328 * Lookup the (hopefully) full range of pages we need.
296 */ 329 */
297 spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); 330 spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
298 index += spd.nr_pages; 331 index += spd.nr_pages;
299 332
300 /* 333 /*
@@ -335,7 +368,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
335 unlock_page(page); 368 unlock_page(page);
336 } 369 }
337 370
338 pages[spd.nr_pages++] = page; 371 spd.pages[spd.nr_pages++] = page;
339 index++; 372 index++;
340 } 373 }
341 374
@@ -356,7 +389,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
356 * this_len is the max we'll use from this page 389 * this_len is the max we'll use from this page
357 */ 390 */
358 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); 391 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
359 page = pages[page_nr]; 392 page = spd.pages[page_nr];
360 393
361 if (PageReadahead(page)) 394 if (PageReadahead(page))
362 page_cache_async_readahead(mapping, &in->f_ra, in, 395 page_cache_async_readahead(mapping, &in->f_ra, in,
@@ -393,8 +426,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
393 error = -ENOMEM; 426 error = -ENOMEM;
394 break; 427 break;
395 } 428 }
396 page_cache_release(pages[page_nr]); 429 page_cache_release(spd.pages[page_nr]);
397 pages[page_nr] = page; 430 spd.pages[page_nr] = page;
398 } 431 }
399 /* 432 /*
400 * page was already under io and is now done, great 433 * page was already under io and is now done, great
@@ -451,8 +484,8 @@ fill_it:
451 len = this_len; 484 len = this_len;
452 } 485 }
453 486
454 partial[page_nr].offset = loff; 487 spd.partial[page_nr].offset = loff;
455 partial[page_nr].len = this_len; 488 spd.partial[page_nr].len = this_len;
456 len -= this_len; 489 len -= this_len;
457 loff = 0; 490 loff = 0;
458 spd.nr_pages++; 491 spd.nr_pages++;
@@ -464,12 +497,13 @@ fill_it:
464 * we got, 'nr_pages' is how many pages are in the map. 497 * we got, 'nr_pages' is how many pages are in the map.
465 */ 498 */
466 while (page_nr < nr_pages) 499 while (page_nr < nr_pages)
467 page_cache_release(pages[page_nr++]); 500 page_cache_release(spd.pages[page_nr++]);
468 in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; 501 in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
469 502
470 if (spd.nr_pages) 503 if (spd.nr_pages)
471 return splice_to_pipe(pipe, &spd); 504 error = splice_to_pipe(pipe, &spd);
472 505
506 splice_shrink_spd(pipe, &spd);
473 return error; 507 return error;
474} 508}
475 509
@@ -560,9 +594,9 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
560 unsigned int nr_pages; 594 unsigned int nr_pages;
561 unsigned int nr_freed; 595 unsigned int nr_freed;
562 size_t offset; 596 size_t offset;
563 struct page *pages[PIPE_BUFFERS]; 597 struct page *pages[PIPE_DEF_BUFFERS];
564 struct partial_page partial[PIPE_BUFFERS]; 598 struct partial_page partial[PIPE_DEF_BUFFERS];
565 struct iovec vec[PIPE_BUFFERS]; 599 struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
566 pgoff_t index; 600 pgoff_t index;
567 ssize_t res; 601 ssize_t res;
568 size_t this_len; 602 size_t this_len;
@@ -576,11 +610,22 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
576 .spd_release = spd_release_page, 610 .spd_release = spd_release_page,
577 }; 611 };
578 612
613 if (splice_grow_spd(pipe, &spd))
614 return -ENOMEM;
615
616 res = -ENOMEM;
617 vec = __vec;
618 if (pipe->buffers > PIPE_DEF_BUFFERS) {
619 vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL);
620 if (!vec)
621 goto shrink_ret;
622 }
623
579 index = *ppos >> PAGE_CACHE_SHIFT; 624 index = *ppos >> PAGE_CACHE_SHIFT;
580 offset = *ppos & ~PAGE_CACHE_MASK; 625 offset = *ppos & ~PAGE_CACHE_MASK;
581 nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 626 nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
582 627
583 for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) { 628 for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) {
584 struct page *page; 629 struct page *page;
585 630
586 page = alloc_page(GFP_USER); 631 page = alloc_page(GFP_USER);
@@ -591,7 +636,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
591 this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); 636 this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
592 vec[i].iov_base = (void __user *) page_address(page); 637 vec[i].iov_base = (void __user *) page_address(page);
593 vec[i].iov_len = this_len; 638 vec[i].iov_len = this_len;
594 pages[i] = page; 639 spd.pages[i] = page;
595 spd.nr_pages++; 640 spd.nr_pages++;
596 len -= this_len; 641 len -= this_len;
597 offset = 0; 642 offset = 0;
@@ -610,11 +655,11 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
610 nr_freed = 0; 655 nr_freed = 0;
611 for (i = 0; i < spd.nr_pages; i++) { 656 for (i = 0; i < spd.nr_pages; i++) {
612 this_len = min_t(size_t, vec[i].iov_len, res); 657 this_len = min_t(size_t, vec[i].iov_len, res);
613 partial[i].offset = 0; 658 spd.partial[i].offset = 0;
614 partial[i].len = this_len; 659 spd.partial[i].len = this_len;
615 if (!this_len) { 660 if (!this_len) {
616 __free_page(pages[i]); 661 __free_page(spd.pages[i]);
617 pages[i] = NULL; 662 spd.pages[i] = NULL;
618 nr_freed++; 663 nr_freed++;
619 } 664 }
620 res -= this_len; 665 res -= this_len;
@@ -625,13 +670,18 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
625 if (res > 0) 670 if (res > 0)
626 *ppos += res; 671 *ppos += res;
627 672
673shrink_ret:
674 if (vec != __vec)
675 kfree(vec);
676 splice_shrink_spd(pipe, &spd);
628 return res; 677 return res;
629 678
630err: 679err:
631 for (i = 0; i < spd.nr_pages; i++) 680 for (i = 0; i < spd.nr_pages; i++)
632 __free_page(pages[i]); 681 __free_page(spd.pages[i]);
633 682
634 return error; 683 res = error;
684 goto shrink_ret;
635} 685}
636EXPORT_SYMBOL(default_file_splice_read); 686EXPORT_SYMBOL(default_file_splice_read);
637 687
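Both converted readers follow the same discipline, and the remaining splice_pipe_desc users in this file adopt it too: start with small on-stack arrays, let splice_grow_spd() swap in kmalloc'ed ones once the pipe has been grown past PIPE_DEF_BUFFERS, and release them through splice_shrink_spd() on every exit path. Boiled down to a sketch (names from the hunks above, error handling trimmed):

	struct page *pages[PIPE_DEF_BUFFERS];
	struct partial_page partial[PIPE_DEF_BUFFERS];
	struct splice_pipe_desc spd = { .pages = pages, .partial = partial, /* ... */ };
	ssize_t ret;

	if (splice_grow_spd(pipe, &spd))	/* may replace spd.pages/spd.partial */
		return -ENOMEM;
	/* fill spd.pages[] / spd.partial[], at most pipe->buffers entries */
	ret = splice_to_pipe(pipe, &spd);
	splice_shrink_spd(pipe, &spd);		/* no-op while the on-stack arrays are in use */
	return ret;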
@@ -784,7 +834,7 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
784 if (!buf->len) { 834 if (!buf->len) {
785 buf->ops = NULL; 835 buf->ops = NULL;
786 ops->release(pipe, buf); 836 ops->release(pipe, buf);
787 pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); 837 pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
788 pipe->nrbufs--; 838 pipe->nrbufs--;
789 if (pipe->inode) 839 if (pipe->inode)
790 sd->need_wakeup = true; 840 sd->need_wakeup = true;
@@ -1211,7 +1261,7 @@ out_release:
1211 * If we did an incomplete transfer we must release 1261 * If we did an incomplete transfer we must release
1212 * the pipe buffers in question: 1262 * the pipe buffers in question:
1213 */ 1263 */
1214 for (i = 0; i < PIPE_BUFFERS; i++) { 1264 for (i = 0; i < pipe->buffers; i++) {
1215 struct pipe_buffer *buf = pipe->bufs + i; 1265 struct pipe_buffer *buf = pipe->bufs + i;
1216 1266
1217 if (buf->ops) { 1267 if (buf->ops) {
@@ -1371,7 +1421,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1371 */ 1421 */
1372static int get_iovec_page_array(const struct iovec __user *iov, 1422static int get_iovec_page_array(const struct iovec __user *iov,
1373 unsigned int nr_vecs, struct page **pages, 1423 unsigned int nr_vecs, struct page **pages,
1374 struct partial_page *partial, int aligned) 1424 struct partial_page *partial, int aligned,
1425 unsigned int pipe_buffers)
1375{ 1426{
1376 int buffers = 0, error = 0; 1427 int buffers = 0, error = 0;
1377 1428
@@ -1414,8 +1465,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
1414 break; 1465 break;
1415 1466
1416 npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 1467 npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1417 if (npages > PIPE_BUFFERS - buffers) 1468 if (npages > pipe_buffers - buffers)
1418 npages = PIPE_BUFFERS - buffers; 1469 npages = pipe_buffers - buffers;
1419 1470
1420 error = get_user_pages_fast((unsigned long)base, npages, 1471 error = get_user_pages_fast((unsigned long)base, npages,
1421 0, &pages[buffers]); 1472 0, &pages[buffers]);
@@ -1450,7 +1501,7 @@ static int get_iovec_page_array(const struct iovec __user *iov,
1450 * or if we mapped the max number of pages that we have 1501 * or if we mapped the max number of pages that we have
1451 * room for. 1502 * room for.
1452 */ 1503 */
1453 if (error < npages || buffers == PIPE_BUFFERS) 1504 if (error < npages || buffers == pipe_buffers)
1454 break; 1505 break;
1455 1506
1456 nr_vecs--; 1507 nr_vecs--;
@@ -1593,8 +1644,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1593 unsigned long nr_segs, unsigned int flags) 1644 unsigned long nr_segs, unsigned int flags)
1594{ 1645{
1595 struct pipe_inode_info *pipe; 1646 struct pipe_inode_info *pipe;
1596 struct page *pages[PIPE_BUFFERS]; 1647 struct page *pages[PIPE_DEF_BUFFERS];
1597 struct partial_page partial[PIPE_BUFFERS]; 1648 struct partial_page partial[PIPE_DEF_BUFFERS];
1598 struct splice_pipe_desc spd = { 1649 struct splice_pipe_desc spd = {
1599 .pages = pages, 1650 .pages = pages,
1600 .partial = partial, 1651 .partial = partial,
@@ -1602,17 +1653,25 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1602 .ops = &user_page_pipe_buf_ops, 1653 .ops = &user_page_pipe_buf_ops,
1603 .spd_release = spd_release_page, 1654 .spd_release = spd_release_page,
1604 }; 1655 };
1656 long ret;
1605 1657
1606 pipe = pipe_info(file->f_path.dentry->d_inode); 1658 pipe = pipe_info(file->f_path.dentry->d_inode);
1607 if (!pipe) 1659 if (!pipe)
1608 return -EBADF; 1660 return -EBADF;
1609 1661
1610 spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, 1662 if (splice_grow_spd(pipe, &spd))
1611 flags & SPLICE_F_GIFT); 1663 return -ENOMEM;
1664
1665 spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
1666 spd.partial, flags & SPLICE_F_GIFT,
1667 pipe->buffers);
1612 if (spd.nr_pages <= 0) 1668 if (spd.nr_pages <= 0)
1613 return spd.nr_pages; 1669 ret = spd.nr_pages;
1670 else
1671 ret = splice_to_pipe(pipe, &spd);
1614 1672
1615 return splice_to_pipe(pipe, &spd); 1673 splice_shrink_spd(pipe, &spd);
1674 return ret;
1616} 1675}
1617 1676
1618/* 1677/*
@@ -1738,13 +1797,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1738 * Check ->nrbufs without the inode lock first. This function 1797 * Check ->nrbufs without the inode lock first. This function
1739 * is speculative anyways, so missing one is ok. 1798 * is speculative anyways, so missing one is ok.
1740 */ 1799 */
1741 if (pipe->nrbufs < PIPE_BUFFERS) 1800 if (pipe->nrbufs < pipe->buffers)
1742 return 0; 1801 return 0;
1743 1802
1744 ret = 0; 1803 ret = 0;
1745 pipe_lock(pipe); 1804 pipe_lock(pipe);
1746 1805
1747 while (pipe->nrbufs >= PIPE_BUFFERS) { 1806 while (pipe->nrbufs >= pipe->buffers) {
1748 if (!pipe->readers) { 1807 if (!pipe->readers) {
1749 send_sig(SIGPIPE, current, 0); 1808 send_sig(SIGPIPE, current, 0);
1750 ret = -EPIPE; 1809 ret = -EPIPE;
@@ -1810,7 +1869,7 @@ retry:
1810 * Cannot make any progress, because either the input 1869 * Cannot make any progress, because either the input
1811 * pipe is empty or the output pipe is full. 1870 * pipe is empty or the output pipe is full.
1812 */ 1871 */
1813 if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) { 1872 if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
1814 /* Already processed some buffers, break */ 1873 /* Already processed some buffers, break */
1815 if (ret) 1874 if (ret)
1816 break; 1875 break;
@@ -1831,7 +1890,7 @@ retry:
1831 } 1890 }
1832 1891
1833 ibuf = ipipe->bufs + ipipe->curbuf; 1892 ibuf = ipipe->bufs + ipipe->curbuf;
1834 nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS; 1893 nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1835 obuf = opipe->bufs + nbuf; 1894 obuf = opipe->bufs + nbuf;
1836 1895
1837 if (len >= ibuf->len) { 1896 if (len >= ibuf->len) {
@@ -1841,7 +1900,7 @@ retry:
1841 *obuf = *ibuf; 1900 *obuf = *ibuf;
1842 ibuf->ops = NULL; 1901 ibuf->ops = NULL;
1843 opipe->nrbufs++; 1902 opipe->nrbufs++;
1844 ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS; 1903 ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
1845 ipipe->nrbufs--; 1904 ipipe->nrbufs--;
1846 input_wakeup = true; 1905 input_wakeup = true;
1847 } else { 1906 } else {
@@ -1914,11 +1973,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
1914 * If we have iterated all input buffers or ran out of 1973 * If we have iterated all input buffers or ran out of
1915 * output room, break. 1974 * output room, break.
1916 */ 1975 */
1917 if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) 1976 if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
1918 break; 1977 break;
1919 1978
1920 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); 1979 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
1921 nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); 1980 nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1922 1981
1923 /* 1982 /*
1924 * Get a reference to this pipe buffer, 1983 * Get a reference to this pipe buffer,
diff --git a/fs/sync.c b/fs/sync.c
index 92b228176f7c..de6a44192832 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -42,7 +42,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
42 if (wait) 42 if (wait)
43 sync_inodes_sb(sb); 43 sync_inodes_sb(sb);
44 else 44 else
45 writeback_inodes_sb(sb); 45 writeback_inodes_sb_locked(sb);
46 46
47 if (sb->s_op->sync_fs) 47 if (sb->s_op->sync_fs)
48 sb->s_op->sync_fs(sb, wait); 48 sb->s_op->sync_fs(sb, wait);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index e9002513e08f..f24dbe5efde3 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -725,7 +725,8 @@ void
725xfs_blkdev_issue_flush( 725xfs_blkdev_issue_flush(
726 xfs_buftarg_t *buftarg) 726 xfs_buftarg_t *buftarg)
727{ 727{
728 blkdev_issue_flush(buftarg->bt_bdev, NULL); 728 blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL,
729 BLKDEV_IFL_WAIT);
729} 730}
730 731
731STATIC void 732STATIC void
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index bd0e3c6f323f..e6e0cb5437e6 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -14,6 +14,7 @@
14#include <linux/kernel.h> 14#include <linux/kernel.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/timer.h>
17#include <linux/writeback.h> 18#include <linux/writeback.h>
18#include <asm/atomic.h> 19#include <asm/atomic.h>
19 20
@@ -88,6 +89,8 @@ struct backing_dev_info {
88 89
89 struct device *dev; 90 struct device *dev;
90 91
92 struct timer_list laptop_mode_wb_timer;
93
91#ifdef CONFIG_DEBUG_FS 94#ifdef CONFIG_DEBUG_FS
92 struct dentry *debug_dir; 95 struct dentry *debug_dir;
93 struct dentry *debug_stats; 96 struct dentry *debug_stats;
@@ -103,9 +106,10 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
103void bdi_unregister(struct backing_dev_info *bdi); 106void bdi_unregister(struct backing_dev_info *bdi);
104int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); 107int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
105void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, 108void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
106 long nr_pages); 109 long nr_pages, int sb_locked);
107int bdi_writeback_task(struct bdi_writeback *wb); 110int bdi_writeback_task(struct bdi_writeback *wb);
108int bdi_has_dirty_io(struct backing_dev_info *bdi); 111int bdi_has_dirty_io(struct backing_dev_info *bdi);
112void bdi_arm_supers_timer(void);
109 113
110extern spinlock_t bdi_lock; 114extern spinlock_t bdi_lock;
111extern struct list_head bdi_list; 115extern struct list_head bdi_list;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6690e8bae7bb..be411c12ebbe 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -186,15 +186,19 @@ struct request {
186 }; 186 };
187 187
188 /* 188 /*
189 * two pointers are available for the IO schedulers, if they need 189 * Three pointers are available for the IO schedulers, if they need
190 * more they have to dynamically allocate it. 190 * more they have to dynamically allocate it.
191 */ 191 */
192 void *elevator_private; 192 void *elevator_private;
193 void *elevator_private2; 193 void *elevator_private2;
194 void *elevator_private3;
194 195
195 struct gendisk *rq_disk; 196 struct gendisk *rq_disk;
196 unsigned long start_time; 197 unsigned long start_time;
197 198#ifdef CONFIG_BLK_CGROUP
199 unsigned long long start_time_ns;
200 unsigned long long io_start_time_ns; /* when passed to hardware */
201#endif
198 /* Number of scatter-gather DMA addr+len pairs after 202 /* Number of scatter-gather DMA addr+len pairs after
199 * physical address coalescing is performed. 203 * physical address coalescing is performed.
200 */ 204 */
@@ -917,7 +921,12 @@ extern void blk_abort_queue(struct request_queue *);
917 */ 921 */
918extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn, 922extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn,
919 spinlock_t *lock, int node_id); 923 spinlock_t *lock, int node_id);
924extern struct request_queue *blk_init_allocated_queue_node(struct request_queue *,
925 request_fn_proc *,
926 spinlock_t *, int node_id);
920extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *); 927extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *);
928extern struct request_queue *blk_init_allocated_queue(struct request_queue *,
929 request_fn_proc *, spinlock_t *);
921extern void blk_cleanup_queue(struct request_queue *); 930extern void blk_cleanup_queue(struct request_queue *);
922extern void blk_queue_make_request(struct request_queue *, make_request_fn *); 931extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
923extern void blk_queue_bounce_limit(struct request_queue *, u64); 932extern void blk_queue_bounce_limit(struct request_queue *, u64);
@@ -994,20 +1003,25 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
994 return NULL; 1003 return NULL;
995 return bqt->tag_index[tag]; 1004 return bqt->tag_index[tag];
996} 1005}
997 1006enum{
998extern int blkdev_issue_flush(struct block_device *, sector_t *); 1007 BLKDEV_WAIT, /* wait for completion */
999#define DISCARD_FL_WAIT 0x01 /* wait for completion */ 1008 BLKDEV_BARRIER, /*issue request with barrier */
1000#define DISCARD_FL_BARRIER 0x02 /* issue DISCARD_BARRIER request */ 1009};
1001extern int blkdev_issue_discard(struct block_device *, sector_t sector, 1010#define BLKDEV_IFL_WAIT (1 << BLKDEV_WAIT)
1002 sector_t nr_sects, gfp_t, int flags); 1011#define BLKDEV_IFL_BARRIER (1 << BLKDEV_BARRIER)
1003 1012extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *,
1013 unsigned long);
1014extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
1015 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
1016extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
1017 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
1004static inline int sb_issue_discard(struct super_block *sb, 1018static inline int sb_issue_discard(struct super_block *sb,
1005 sector_t block, sector_t nr_blocks) 1019 sector_t block, sector_t nr_blocks)
1006{ 1020{
1007 block <<= (sb->s_blocksize_bits - 9); 1021 block <<= (sb->s_blocksize_bits - 9);
1008 nr_blocks <<= (sb->s_blocksize_bits - 9); 1022 nr_blocks <<= (sb->s_blocksize_bits - 9);
1009 return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_KERNEL, 1023 return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_KERNEL,
1010 DISCARD_FL_BARRIER); 1024 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
1011} 1025}
1012 1026
1013extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); 1027extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
@@ -1196,6 +1210,39 @@ static inline void put_dev_sector(Sector p)
1196struct work_struct; 1210struct work_struct;
1197int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); 1211int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
1198 1212
1213#ifdef CONFIG_BLK_CGROUP
1214static inline void set_start_time_ns(struct request *req)
1215{
1216 req->start_time_ns = sched_clock();
1217}
1218
1219static inline void set_io_start_time_ns(struct request *req)
1220{
1221 req->io_start_time_ns = sched_clock();
1222}
1223
1224static inline uint64_t rq_start_time_ns(struct request *req)
1225{
1226 return req->start_time_ns;
1227}
1228
1229static inline uint64_t rq_io_start_time_ns(struct request *req)
1230{
1231 return req->io_start_time_ns;
1232}
1233#else
1234static inline void set_start_time_ns(struct request *req) {}
1235static inline void set_io_start_time_ns(struct request *req) {}
1236static inline uint64_t rq_start_time_ns(struct request *req)
1237{
1238 return 0;
1239}
1240static inline uint64_t rq_io_start_time_ns(struct request *req)
1241{
1242 return 0;
1243}
1244#endif
1245
1199#define MODULE_ALIAS_BLOCKDEV(major,minor) \ 1246#define MODULE_ALIAS_BLOCKDEV(major,minor) \
1200 MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) 1247 MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
1201#define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ 1248#define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
@@ -1283,8 +1330,7 @@ struct block_device_operations {
1283 int (*direct_access) (struct block_device *, sector_t, 1330 int (*direct_access) (struct block_device *, sector_t,
1284 void **, unsigned long *); 1331 void **, unsigned long *);
1285 int (*media_changed) (struct gendisk *); 1332 int (*media_changed) (struct gendisk *);
1286 unsigned long long (*set_capacity) (struct gendisk *, 1333 void (*unlock_native_capacity) (struct gendisk *);
1287 unsigned long long);
1288 int (*revalidate_disk) (struct gendisk *); 1334 int (*revalidate_disk) (struct gendisk *);
1289 int (*getgeo)(struct block_device *, struct hd_geometry *); 1335 int (*getgeo)(struct block_device *, struct hd_geometry *);
1290 struct module *owner; 1336 struct module *owner;
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 4341b1a97a34..68530521ad00 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -53,10 +53,10 @@
53 53
54 54
55extern const char *drbd_buildtag(void); 55extern const char *drbd_buildtag(void);
56#define REL_VERSION "8.3.7" 56#define REL_VERSION "8.3.8rc1"
57#define API_VERSION 88 57#define API_VERSION 88
58#define PRO_VERSION_MIN 86 58#define PRO_VERSION_MIN 86
59#define PRO_VERSION_MAX 92 59#define PRO_VERSION_MAX 94
60 60
61 61
62enum drbd_io_error_p { 62enum drbd_io_error_p {
@@ -139,6 +139,7 @@ enum drbd_ret_codes {
139 ERR_DATA_NOT_CURRENT = 150, 139 ERR_DATA_NOT_CURRENT = 150,
140 ERR_CONNECTED = 151, /* DRBD 8.3 only */ 140 ERR_CONNECTED = 151, /* DRBD 8.3 only */
141 ERR_PERM = 152, 141 ERR_PERM = 152,
142 ERR_NEED_APV_93 = 153,
142 143
143 /* insert new ones above this line */ 144 /* insert new ones above this line */
144 AFTER_LAST_ERR_CODE 145 AFTER_LAST_ERR_CODE
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
index 51f47a586ad8..440b42e38e89 100644
--- a/include/linux/drbd_limits.h
+++ b/include/linux/drbd_limits.h
@@ -133,5 +133,21 @@
133#define DRBD_MAX_BIO_BVECS_MAX 128 133#define DRBD_MAX_BIO_BVECS_MAX 128
134#define DRBD_MAX_BIO_BVECS_DEF 0 134#define DRBD_MAX_BIO_BVECS_DEF 0
135 135
136#define DRBD_DP_VOLUME_MIN 4
137#define DRBD_DP_VOLUME_MAX 1048576
138#define DRBD_DP_VOLUME_DEF 16384
139
140#define DRBD_DP_INTERVAL_MIN 1
141#define DRBD_DP_INTERVAL_MAX 600
142#define DRBD_DP_INTERVAL_DEF 5
143
144#define DRBD_RS_THROTTLE_TH_MIN 1
145#define DRBD_RS_THROTTLE_TH_MAX 600
146#define DRBD_RS_THROTTLE_TH_DEF 20
147
148#define DRBD_RS_HOLD_OFF_TH_MIN 1
149#define DRBD_RS_HOLD_OFF_TH_MAX 6000
150#define DRBD_RS_HOLD_OFF_TH_DEF 100
151
136#undef RANGE 152#undef RANGE
137#endif 153#endif
diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h
index f7431a4ca608..ce77a746fc9d 100644
--- a/include/linux/drbd_nl.h
+++ b/include/linux/drbd_nl.h
@@ -71,12 +71,17 @@ NL_PACKET(disconnect, 6, )
71NL_PACKET(resize, 7, 71NL_PACKET(resize, 7,
72 NL_INT64( 29, T_MAY_IGNORE, resize_size) 72 NL_INT64( 29, T_MAY_IGNORE, resize_size)
73 NL_BIT( 68, T_MAY_IGNORE, resize_force) 73 NL_BIT( 68, T_MAY_IGNORE, resize_force)
74 NL_BIT( 69, T_MANDATORY, no_resync)
74) 75)
75 76
76NL_PACKET(syncer_conf, 8, 77NL_PACKET(syncer_conf, 8,
77 NL_INTEGER( 30, T_MAY_IGNORE, rate) 78 NL_INTEGER( 30, T_MAY_IGNORE, rate)
78 NL_INTEGER( 31, T_MAY_IGNORE, after) 79 NL_INTEGER( 31, T_MAY_IGNORE, after)
79 NL_INTEGER( 32, T_MAY_IGNORE, al_extents) 80 NL_INTEGER( 32, T_MAY_IGNORE, al_extents)
81 NL_INTEGER( 71, T_MAY_IGNORE, dp_volume)
82 NL_INTEGER( 72, T_MAY_IGNORE, dp_interval)
83 NL_INTEGER( 73, T_MAY_IGNORE, throttle_th)
84 NL_INTEGER( 74, T_MAY_IGNORE, hold_off_th)
80 NL_STRING( 52, T_MAY_IGNORE, verify_alg, SHARED_SECRET_MAX) 85 NL_STRING( 52, T_MAY_IGNORE, verify_alg, SHARED_SECRET_MAX)
81 NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32) 86 NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32)
82 NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX) 87 NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX)
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 1cb3372e65d8..2c958f4fce1e 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -14,6 +14,9 @@ typedef void (elevator_merged_fn) (struct request_queue *, struct request *, int
14 14
15typedef int (elevator_allow_merge_fn) (struct request_queue *, struct request *, struct bio *); 15typedef int (elevator_allow_merge_fn) (struct request_queue *, struct request *, struct bio *);
16 16
17typedef void (elevator_bio_merged_fn) (struct request_queue *,
18 struct request *, struct bio *);
19
17typedef int (elevator_dispatch_fn) (struct request_queue *, int); 20typedef int (elevator_dispatch_fn) (struct request_queue *, int);
18 21
19typedef void (elevator_add_req_fn) (struct request_queue *, struct request *); 22typedef void (elevator_add_req_fn) (struct request_queue *, struct request *);
@@ -36,6 +39,7 @@ struct elevator_ops
36 elevator_merged_fn *elevator_merged_fn; 39 elevator_merged_fn *elevator_merged_fn;
37 elevator_merge_req_fn *elevator_merge_req_fn; 40 elevator_merge_req_fn *elevator_merge_req_fn;
38 elevator_allow_merge_fn *elevator_allow_merge_fn; 41 elevator_allow_merge_fn *elevator_allow_merge_fn;
42 elevator_bio_merged_fn *elevator_bio_merged_fn;
39 43
40 elevator_dispatch_fn *elevator_dispatch_fn; 44 elevator_dispatch_fn *elevator_dispatch_fn;
41 elevator_add_req_fn *elevator_add_req_fn; 45 elevator_add_req_fn *elevator_add_req_fn;
@@ -103,6 +107,8 @@ extern int elv_merge(struct request_queue *, struct request **, struct bio *);
103extern void elv_merge_requests(struct request_queue *, struct request *, 107extern void elv_merge_requests(struct request_queue *, struct request *,
104 struct request *); 108 struct request *);
105extern void elv_merged_request(struct request_queue *, struct request *, int); 109extern void elv_merged_request(struct request_queue *, struct request *, int);
110extern void elv_bio_merged(struct request_queue *q, struct request *,
111 struct bio *);
106extern void elv_requeue_request(struct request_queue *, struct request *); 112extern void elv_requeue_request(struct request_queue *, struct request *);
107extern int elv_queue_empty(struct request_queue *); 113extern int elv_queue_empty(struct request_queue *);
108extern struct request *elv_former_request(struct request_queue *, struct request *); 114extern struct request *elv_former_request(struct request_queue *, struct request *);
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index 86037400a6e3..afc00af3229b 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -22,6 +22,12 @@
22#define F_NOTIFY (F_LINUX_SPECIFIC_BASE+2) 22#define F_NOTIFY (F_LINUX_SPECIFIC_BASE+2)
23 23
24/* 24/*
25 * Set and get of pipe page size array
26 */
27#define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7)
28#define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8)
29
30/*
25 * Types of directory notifications that may be requested. 31 * Types of directory notifications that may be requested.
26 */ 32 */
27#define DN_ACCESS 0x00000001 /* File accessed */ 33#define DN_ACCESS 0x00000001 /* File accessed */
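The two new fcntls above are driven from user space. A minimal userspace sketch, assuming (per the "pipe page size array" comment) that F_SETPIPE_SZ takes the requested number of pipe slots and F_GETPIPE_SZ reports the current value; the fallback constant values are derived from F_LINUX_SPECIFIC_BASE + 7/8 shown above:

	/* Userspace sketch for the new pipe-resize fcntls; the unit of the
	 * argument (pipe slots here) is an assumption based on the comment
	 * in the hunk above. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	#ifndef F_SETPIPE_SZ
	#define F_SETPIPE_SZ	1031	/* F_LINUX_SPECIFIC_BASE + 7 */
	#define F_GETPIPE_SZ	1032	/* F_LINUX_SPECIFIC_BASE + 8 */
	#endif

	int main(void)
	{
		int fds[2];

		if (pipe(fds))
			return 1;

		/* Ask for a bigger pipe, then read back what was granted. */
		if (fcntl(fds[1], F_SETPIPE_SZ, 64) < 0)
			perror("F_SETPIPE_SZ");
		printf("pipe size now: %d\n", fcntl(fds[1], F_GETPIPE_SZ));

		close(fds[0]);
		close(fds[1]);
		return 0;
	}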
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4079ef99900f..1775d362732d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -651,6 +651,7 @@ struct block_device {
651 int bd_openers; 651 int bd_openers;
652 struct mutex bd_mutex; /* open/close mutex */ 652 struct mutex bd_mutex; /* open/close mutex */
653 struct list_head bd_inodes; 653 struct list_head bd_inodes;
654 void * bd_claiming;
654 void * bd_holder; 655 void * bd_holder;
655 int bd_holders; 656 int bd_holders;
656#ifdef CONFIG_SYSFS 657#ifdef CONFIG_SYSFS
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 3239d1c10acb..b6d448048ae2 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -362,7 +362,7 @@ struct ide_drive_s;
362struct ide_disk_ops { 362struct ide_disk_ops {
363 int (*check)(struct ide_drive_s *, const char *); 363 int (*check)(struct ide_drive_s *, const char *);
364 int (*get_capacity)(struct ide_drive_s *); 364 int (*get_capacity)(struct ide_drive_s *);
365 u64 (*set_capacity)(struct ide_drive_s *, u64); 365 void (*unlock_native_capacity)(struct ide_drive_s *);
366 void (*setup)(struct ide_drive_s *); 366 void (*setup)(struct ide_drive_s *);
367 void (*flush)(struct ide_drive_s *); 367 void (*flush)(struct ide_drive_s *);
368 int (*init_media)(struct ide_drive_s *, struct gendisk *); 368 int (*init_media)(struct ide_drive_s *, struct gendisk *);
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index b43a9e039059..16de3933c45e 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -3,7 +3,7 @@
3 3
4#define PIPEFS_MAGIC 0x50495045 4#define PIPEFS_MAGIC 0x50495045
5 5
6#define PIPE_BUFFERS (16) 6#define PIPE_DEF_BUFFERS 16
7 7
8#define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */ 8#define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */
9#define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ 9#define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */
@@ -44,17 +44,17 @@ struct pipe_buffer {
44 **/ 44 **/
45struct pipe_inode_info { 45struct pipe_inode_info {
46 wait_queue_head_t wait; 46 wait_queue_head_t wait;
47 unsigned int nrbufs, curbuf; 47 unsigned int nrbufs, curbuf, buffers;
48 struct page *tmp_page;
49 unsigned int readers; 48 unsigned int readers;
50 unsigned int writers; 49 unsigned int writers;
51 unsigned int waiting_writers; 50 unsigned int waiting_writers;
52 unsigned int r_counter; 51 unsigned int r_counter;
53 unsigned int w_counter; 52 unsigned int w_counter;
53 struct page *tmp_page;
54 struct fasync_struct *fasync_readers; 54 struct fasync_struct *fasync_readers;
55 struct fasync_struct *fasync_writers; 55 struct fasync_struct *fasync_writers;
56 struct inode *inode; 56 struct inode *inode;
57 struct pipe_buffer bufs[PIPE_BUFFERS]; 57 struct pipe_buffer *bufs;
58}; 58};
59 59
60/* 60/*
@@ -139,6 +139,8 @@ void pipe_lock(struct pipe_inode_info *);
139void pipe_unlock(struct pipe_inode_info *); 139void pipe_unlock(struct pipe_inode_info *);
140void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); 140void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *);
141 141
142extern unsigned int pipe_max_pages;
143
142/* Drop the inode semaphore and wait for a pipe event, atomically */ 144/* Drop the inode semaphore and wait for a pipe event, atomically */
143void pipe_wait(struct pipe_inode_info *pipe); 145void pipe_wait(struct pipe_inode_info *pipe);
144 146
@@ -154,4 +156,7 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *);
154int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); 156int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *);
155void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); 157void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *);
156 158
159/* for F_SETPIPE_SZ and F_GETPIPE_SZ */
160long pipe_fcntl(struct file *, unsigned int, unsigned long arg);
161
157#endif 162#endif
diff --git a/include/linux/splice.h b/include/linux/splice.h
index 18e7c7c0cae6..997c3b4c212b 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -82,4 +82,11 @@ extern ssize_t splice_to_pipe(struct pipe_inode_info *,
82extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, 82extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *,
83 splice_direct_actor *); 83 splice_direct_actor *);
84 84
85/*
86 * for dynamic pipe sizing
87 */
88extern int splice_grow_spd(struct pipe_inode_info *, struct splice_pipe_desc *);
89extern void splice_shrink_spd(struct pipe_inode_info *,
90 struct splice_pipe_desc *);
91
85#endif 92#endif
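splice_grow_spd()/splice_shrink_spd() are what let the splice_pipe_desc track a pipe that has been resized past PIPE_DEF_BUFFERS. A condensed sketch of the calling pattern used by every producer converted in this patch (fill_pages(), example_buf_ops and example_spd_release are placeholders for the caller-specific pieces, not real symbols):

	/* Sketch of the dynamic-pipe-size pattern; error handling trimmed. */
	static ssize_t example_splice_read(struct pipe_inode_info *pipe,
					   unsigned int flags)
	{
		struct page *pages[PIPE_DEF_BUFFERS];
		struct partial_page partial[PIPE_DEF_BUFFERS];
		struct splice_pipe_desc spd = {
			.pages = pages,
			.partial = partial,
			.flags = flags,
			.ops = &example_buf_ops,		/* placeholder ops */
			.spd_release = example_spd_release,	/* placeholder hook */
		};
		ssize_t ret;

		/* Switch to kmalloc'ed arrays sized to pipe->buffers if the
		 * pipe was grown; the on-stack defaults are kept otherwise. */
		if (splice_grow_spd(pipe, &spd))
			return -ENOMEM;

		spd.nr_pages = fill_pages(&spd, pipe->buffers);	/* placeholder */
		if (spd.nr_pages <= 0)
			ret = spd.nr_pages;
		else
			ret = splice_to_pipe(pipe, &spd);

		/* Release whatever splice_grow_spd() allocated. */
		splice_shrink_spd(pipe, &spd);
		return ret;
	}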
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 36520ded3e06..cc97d6caf2b3 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -65,6 +65,15 @@ struct writeback_control {
65 * so we use a single control to update them 65 * so we use a single control to update them
66 */ 66 */
67 unsigned no_nrwrite_index_update:1; 67 unsigned no_nrwrite_index_update:1;
68
69 /*
70 * For WB_SYNC_ALL, the sb must always be pinned. For WB_SYNC_NONE,
71 * the writeback code will pin the sb for the caller. However,
72 * for eg umount, the caller does WB_SYNC_NONE but already has
73 * the sb pinned. If the below is set, caller already has the
74 * sb pinned.
75 */
76 unsigned sb_pinned:1;
68}; 77};
69 78
70/* 79/*
@@ -73,6 +82,7 @@ struct writeback_control {
73struct bdi_writeback; 82struct bdi_writeback;
74int inode_wait(void *); 83int inode_wait(void *);
75void writeback_inodes_sb(struct super_block *); 84void writeback_inodes_sb(struct super_block *);
85void writeback_inodes_sb_locked(struct super_block *);
76int writeback_inodes_sb_if_idle(struct super_block *); 86int writeback_inodes_sb_if_idle(struct super_block *);
77void sync_inodes_sb(struct super_block *); 87void sync_inodes_sb(struct super_block *);
78void writeback_inodes_wbc(struct writeback_control *wbc); 88void writeback_inodes_wbc(struct writeback_control *wbc);
@@ -96,8 +106,14 @@ static inline void inode_sync_wait(struct inode *inode)
96/* 106/*
97 * mm/page-writeback.c 107 * mm/page-writeback.c
98 */ 108 */
99void laptop_io_completion(void); 109#ifdef CONFIG_BLOCK
110void laptop_io_completion(struct backing_dev_info *info);
100void laptop_sync_completion(void); 111void laptop_sync_completion(void);
112void laptop_mode_sync(struct work_struct *work);
113void laptop_mode_timer_fn(unsigned long data);
114#else
115static inline void laptop_sync_completion(void) { }
116#endif
101void throttle_vm_writeout(gfp_t gfp_mask); 117void throttle_vm_writeout(gfp_t gfp_mask);
102 118
103/* These are exported to sysctl. */ 119/* These are exported to sysctl. */
diff --git a/init/Kconfig b/init/Kconfig
index 5fe94b82e4c0..2cce9f343ad0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -611,6 +611,33 @@ config RT_GROUP_SCHED
611 611
612endif #CGROUP_SCHED 612endif #CGROUP_SCHED
613 613
614config BLK_CGROUP
615 tristate "Block IO controller"
616 depends on CGROUPS && BLOCK
617 default n
618 ---help---
619 Generic block IO controller cgroup interface. This is the common
620 cgroup interface which should be used by various IO controlling
621 policies.
622
623 Currently, CFQ IO scheduler uses it to recognize task groups and
624 control disk bandwidth allocation (proportional time slice allocation)
625 to such task groups.
626
627 This option only enables generic Block IO controller infrastructure.
628 One needs to also enable actual IO controlling logic in CFQ for it
629 to take effect. (CONFIG_CFQ_GROUP_IOSCHED=y).
630
631 See Documentation/cgroups/blkio-controller.txt for more information.
632
633config DEBUG_BLK_CGROUP
634 bool "Enable Block IO controller debugging"
635 depends on BLK_CGROUP
636 default n
637 ---help---
638 Enable some debugging help. Currently it exports additional stat
639 files in a cgroup which can be useful for debugging.
640
614endif # CGROUPS 641endif # CGROUPS
615 642
616config MM_OWNER 643config MM_OWNER
diff --git a/kernel/relay.c b/kernel/relay.c
index 3d97f2821611..4268287148c1 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1231,8 +1231,8 @@ static ssize_t subbuf_splice_actor(struct file *in,
1231 size_t read_subbuf = read_start / subbuf_size; 1231 size_t read_subbuf = read_start / subbuf_size;
1232 size_t padding = rbuf->padding[read_subbuf]; 1232 size_t padding = rbuf->padding[read_subbuf];
1233 size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; 1233 size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
1234 struct page *pages[PIPE_BUFFERS]; 1234 struct page *pages[PIPE_DEF_BUFFERS];
1235 struct partial_page partial[PIPE_BUFFERS]; 1235 struct partial_page partial[PIPE_DEF_BUFFERS];
1236 struct splice_pipe_desc spd = { 1236 struct splice_pipe_desc spd = {
1237 .pages = pages, 1237 .pages = pages,
1238 .nr_pages = 0, 1238 .nr_pages = 0,
@@ -1245,6 +1245,8 @@ static ssize_t subbuf_splice_actor(struct file *in,
1245 1245
1246 if (rbuf->subbufs_produced == rbuf->subbufs_consumed) 1246 if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
1247 return 0; 1247 return 0;
1248 if (splice_grow_spd(pipe, &spd))
1249 return -ENOMEM;
1248 1250
1249 /* 1251 /*
1250 * Adjust read len, if longer than what is available 1252 * Adjust read len, if longer than what is available
@@ -1255,7 +1257,7 @@ static ssize_t subbuf_splice_actor(struct file *in,
1255 subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; 1257 subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
1256 pidx = (read_start / PAGE_SIZE) % subbuf_pages; 1258 pidx = (read_start / PAGE_SIZE) % subbuf_pages;
1257 poff = read_start & ~PAGE_MASK; 1259 poff = read_start & ~PAGE_MASK;
1258 nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS); 1260 nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers);
1259 1261
1260 for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { 1262 for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
1261 unsigned int this_len, this_end, private; 1263 unsigned int this_len, this_end, private;
@@ -1289,16 +1291,19 @@ static ssize_t subbuf_splice_actor(struct file *in,
1289 } 1291 }
1290 } 1292 }
1291 1293
1294 ret = 0;
1292 if (!spd.nr_pages) 1295 if (!spd.nr_pages)
1293 return 0; 1296 goto out;
1294 1297
1295 ret = *nonpad_ret = splice_to_pipe(pipe, &spd); 1298 ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
1296 if (ret < 0 || ret < total_len) 1299 if (ret < 0 || ret < total_len)
1297 return ret; 1300 goto out;
1298 1301
1299 if (read_start + ret == nonpad_end) 1302 if (read_start + ret == nonpad_end)
1300 ret += padding; 1303 ret += padding;
1301 1304
1305out:
1306 splice_shrink_spd(pipe, &spd);
1302 return ret; 1307 return ret;
1303} 1308}
1304 1309
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 5b496132c28a..906a0f718cb3 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -41,6 +41,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
41 return (unsigned long long)(jiffies - INITIAL_JIFFIES) 41 return (unsigned long long)(jiffies - INITIAL_JIFFIES)
42 * (NSEC_PER_SEC / HZ); 42 * (NSEC_PER_SEC / HZ);
43} 43}
44EXPORT_SYMBOL_GPL(sched_clock);
44 45
45static __read_mostly int sched_clock_running; 46static __read_mostly int sched_clock_running;
46 47
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b12583047757..18821e77b2a0 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -52,6 +52,7 @@
52#include <linux/slow-work.h> 52#include <linux/slow-work.h>
53#include <linux/perf_event.h> 53#include <linux/perf_event.h>
54#include <linux/kprobes.h> 54#include <linux/kprobes.h>
55#include <linux/pipe_fs_i.h>
55 56
56#include <asm/uaccess.h> 57#include <asm/uaccess.h>
57#include <asm/processor.h> 58#include <asm/processor.h>
@@ -1444,6 +1445,14 @@ static struct ctl_table fs_table[] = {
1444 .child = binfmt_misc_table, 1445 .child = binfmt_misc_table,
1445 }, 1446 },
1446#endif 1447#endif
1448 {
1449 .procname = "pipe-max-pages",
1450 .data = &pipe_max_pages,
1451 .maxlen = sizeof(int),
1452 .mode = 0644,
1453 .proc_handler = &proc_dointvec_minmax,
1454 .extra1 = &two,
1455 },
1447/* 1456/*
1448 * NOTE: do not add new entries to this table unless you have read 1457 * NOTE: do not add new entries to this table unless you have read
1449 * Documentation/sysctl/ctl_unnumbered.txt 1458 * Documentation/sysctl/ctl_unnumbered.txt
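Since the entry above sits in fs_table, the knob should surface as /proc/sys/fs/pipe-max-pages and bound what F_SETPIPE_SZ will grant; the exact path is inferred from the procname and is an assumption. A small userspace sketch for reading the limit:

	/* Sketch: read the new pipe-max-pages limit; path inferred from the
	 * fs_table entry above. */
	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/sys/fs/pipe-max-pages", "r");
		int max_pages;

		if (!f) {
			perror("pipe-max-pages");
			return 1;
		}
		if (fscanf(f, "%d", &max_pages) == 1)
			printf("pipe-max-pages = %d\n", max_pages);
		fclose(f);
		return 0;
	}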
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 756d7283318b..8a76339a9e65 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3309,12 +3309,12 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3309 size_t len, 3309 size_t len,
3310 unsigned int flags) 3310 unsigned int flags)
3311{ 3311{
3312 struct page *pages[PIPE_BUFFERS]; 3312 struct page *pages_def[PIPE_DEF_BUFFERS];
3313 struct partial_page partial[PIPE_BUFFERS]; 3313 struct partial_page partial_def[PIPE_DEF_BUFFERS];
3314 struct trace_iterator *iter = filp->private_data; 3314 struct trace_iterator *iter = filp->private_data;
3315 struct splice_pipe_desc spd = { 3315 struct splice_pipe_desc spd = {
3316 .pages = pages, 3316 .pages = pages_def,
3317 .partial = partial, 3317 .partial = partial_def,
3318 .nr_pages = 0, /* This gets updated below. */ 3318 .nr_pages = 0, /* This gets updated below. */
3319 .flags = flags, 3319 .flags = flags,
3320 .ops = &tracing_pipe_buf_ops, 3320 .ops = &tracing_pipe_buf_ops,
@@ -3325,6 +3325,9 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3325 size_t rem; 3325 size_t rem;
3326 unsigned int i; 3326 unsigned int i;
3327 3327
3328 if (splice_grow_spd(pipe, &spd))
3329 return -ENOMEM;
3330
3328 /* copy the tracer to avoid using a global lock all around */ 3331 /* copy the tracer to avoid using a global lock all around */
3329 mutex_lock(&trace_types_lock); 3332 mutex_lock(&trace_types_lock);
3330 if (unlikely(old_tracer != current_trace && current_trace)) { 3333 if (unlikely(old_tracer != current_trace && current_trace)) {
@@ -3355,23 +3358,23 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3355 trace_access_lock(iter->cpu_file); 3358 trace_access_lock(iter->cpu_file);
3356 3359
3357 /* Fill as many pages as possible. */ 3360 /* Fill as many pages as possible. */
3358 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { 3361 for (i = 0, rem = len; i < pipe->buffers && rem; i++) {
3359 pages[i] = alloc_page(GFP_KERNEL); 3362 spd.pages[i] = alloc_page(GFP_KERNEL);
3360 if (!pages[i]) 3363 if (!spd.pages[i])
3361 break; 3364 break;
3362 3365
3363 rem = tracing_fill_pipe_page(rem, iter); 3366 rem = tracing_fill_pipe_page(rem, iter);
3364 3367
3365 /* Copy the data into the page, so we can start over. */ 3368 /* Copy the data into the page, so we can start over. */
3366 ret = trace_seq_to_buffer(&iter->seq, 3369 ret = trace_seq_to_buffer(&iter->seq,
3367 page_address(pages[i]), 3370 page_address(spd.pages[i]),
3368 iter->seq.len); 3371 iter->seq.len);
3369 if (ret < 0) { 3372 if (ret < 0) {
3370 __free_page(pages[i]); 3373 __free_page(spd.pages[i]);
3371 break; 3374 break;
3372 } 3375 }
3373 partial[i].offset = 0; 3376 spd.partial[i].offset = 0;
3374 partial[i].len = iter->seq.len; 3377 spd.partial[i].len = iter->seq.len;
3375 3378
3376 trace_seq_init(&iter->seq); 3379 trace_seq_init(&iter->seq);
3377 } 3380 }
@@ -3382,12 +3385,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3382 3385
3383 spd.nr_pages = i; 3386 spd.nr_pages = i;
3384 3387
3385 return splice_to_pipe(pipe, &spd); 3388 ret = splice_to_pipe(pipe, &spd);
3389out:
3390 splice_shrink_spd(pipe, &spd);
3391 return ret;
3386 3392
3387out_err: 3393out_err:
3388 mutex_unlock(&iter->mutex); 3394 mutex_unlock(&iter->mutex);
3389 3395 goto out;
3390 return ret;
3391} 3396}
3392 3397
3393static ssize_t 3398static ssize_t
@@ -3786,11 +3791,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3786 unsigned int flags) 3791 unsigned int flags)
3787{ 3792{
3788 struct ftrace_buffer_info *info = file->private_data; 3793 struct ftrace_buffer_info *info = file->private_data;
3789 struct partial_page partial[PIPE_BUFFERS]; 3794 struct partial_page partial_def[PIPE_DEF_BUFFERS];
3790 struct page *pages[PIPE_BUFFERS]; 3795 struct page *pages_def[PIPE_DEF_BUFFERS];
3791 struct splice_pipe_desc spd = { 3796 struct splice_pipe_desc spd = {
3792 .pages = pages, 3797 .pages = pages_def,
3793 .partial = partial, 3798 .partial = partial_def,
3794 .flags = flags, 3799 .flags = flags,
3795 .ops = &buffer_pipe_buf_ops, 3800 .ops = &buffer_pipe_buf_ops,
3796 .spd_release = buffer_spd_release, 3801 .spd_release = buffer_spd_release,
@@ -3799,22 +3804,28 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3799 int entries, size, i; 3804 int entries, size, i;
3800 size_t ret; 3805 size_t ret;
3801 3806
3807 if (splice_grow_spd(pipe, &spd))
3808 return -ENOMEM;
3809
3802 if (*ppos & (PAGE_SIZE - 1)) { 3810 if (*ppos & (PAGE_SIZE - 1)) {
3803 WARN_ONCE(1, "Ftrace: previous read must page-align\n"); 3811 WARN_ONCE(1, "Ftrace: previous read must page-align\n");
3804 return -EINVAL; 3812 ret = -EINVAL;
3813 goto out;
3805 } 3814 }
3806 3815
3807 if (len & (PAGE_SIZE - 1)) { 3816 if (len & (PAGE_SIZE - 1)) {
3808 WARN_ONCE(1, "Ftrace: splice_read should page-align\n"); 3817 WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
3809 if (len < PAGE_SIZE) 3818 if (len < PAGE_SIZE) {
3810 return -EINVAL; 3819 ret = -EINVAL;
3820 goto out;
3821 }
3811 len &= PAGE_MASK; 3822 len &= PAGE_MASK;
3812 } 3823 }
3813 3824
3814 trace_access_lock(info->cpu); 3825 trace_access_lock(info->cpu);
3815 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3826 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3816 3827
3817 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { 3828 for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {
3818 struct page *page; 3829 struct page *page;
3819 int r; 3830 int r;
3820 3831
@@ -3869,11 +3880,12 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3869 else 3880 else
3870 ret = 0; 3881 ret = 0;
3871 /* TODO: block */ 3882 /* TODO: block */
3872 return ret; 3883 goto out;
3873 } 3884 }
3874 3885
3875 ret = splice_to_pipe(pipe, &spd); 3886 ret = splice_to_pipe(pipe, &spd);
3876 3887 splice_shrink_spd(pipe, &spd);
3888out:
3877 return ret; 3889 return ret;
3878} 3890}
3879 3891
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 707d0dc6da0f..660a87a22511 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -48,7 +48,6 @@ static struct timer_list sync_supers_timer;
48 48
49static int bdi_sync_supers(void *); 49static int bdi_sync_supers(void *);
50static void sync_supers_timer_fn(unsigned long); 50static void sync_supers_timer_fn(unsigned long);
51static void arm_supers_timer(void);
52 51
53static void bdi_add_default_flusher_task(struct backing_dev_info *bdi); 52static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
54 53
@@ -252,7 +251,7 @@ static int __init default_bdi_init(void)
252 251
253 init_timer(&sync_supers_timer); 252 init_timer(&sync_supers_timer);
254 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); 253 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
255 arm_supers_timer(); 254 bdi_arm_supers_timer();
256 255
257 err = bdi_init(&default_backing_dev_info); 256 err = bdi_init(&default_backing_dev_info);
258 if (!err) 257 if (!err)
@@ -374,10 +373,13 @@ static int bdi_sync_supers(void *unused)
374 return 0; 373 return 0;
375} 374}
376 375
377static void arm_supers_timer(void) 376void bdi_arm_supers_timer(void)
378{ 377{
379 unsigned long next; 378 unsigned long next;
380 379
380 if (!dirty_writeback_interval)
381 return;
382
381 next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies; 383 next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
382 mod_timer(&sync_supers_timer, round_jiffies_up(next)); 384 mod_timer(&sync_supers_timer, round_jiffies_up(next));
383} 385}
@@ -385,7 +387,7 @@ static void arm_supers_timer(void)
385static void sync_supers_timer_fn(unsigned long unused) 387static void sync_supers_timer_fn(unsigned long unused)
386{ 388{
387 wake_up_process(sync_supers_tsk); 389 wake_up_process(sync_supers_tsk);
388 arm_supers_timer(); 390 bdi_arm_supers_timer();
389} 391}
390 392
391static int bdi_forker_task(void *ptr) 393static int bdi_forker_task(void *ptr)
@@ -428,7 +430,10 @@ static int bdi_forker_task(void *ptr)
428 430
429 spin_unlock_bh(&bdi_lock); 431 spin_unlock_bh(&bdi_lock);
430 wait = msecs_to_jiffies(dirty_writeback_interval * 10); 432 wait = msecs_to_jiffies(dirty_writeback_interval * 10);
431 schedule_timeout(wait); 433 if (wait)
434 schedule_timeout(wait);
435 else
436 schedule();
432 try_to_freeze(); 437 try_to_freeze();
433 continue; 438 continue;
434 } 439 }
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0b19943ecf8b..b289310e2c89 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -597,7 +597,7 @@ static void balance_dirty_pages(struct address_space *mapping,
597 (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) 597 (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
598 + global_page_state(NR_UNSTABLE_NFS)) 598 + global_page_state(NR_UNSTABLE_NFS))
599 > background_thresh))) 599 > background_thresh)))
600 bdi_start_writeback(bdi, NULL, 0); 600 bdi_start_writeback(bdi, NULL, 0, 0);
601} 601}
602 602
603void set_page_dirty_balance(struct page *page, int page_mkwrite) 603void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -683,10 +683,6 @@ void throttle_vm_writeout(gfp_t gfp_mask)
683 } 683 }
684} 684}
685 685
686static void laptop_timer_fn(unsigned long unused);
687
688static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
689
690/* 686/*
691 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs 687 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
692 */ 688 */
@@ -694,24 +690,24 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write,
694 void __user *buffer, size_t *length, loff_t *ppos) 690 void __user *buffer, size_t *length, loff_t *ppos)
695{ 691{
696 proc_dointvec(table, write, buffer, length, ppos); 692 proc_dointvec(table, write, buffer, length, ppos);
693 bdi_arm_supers_timer();
697 return 0; 694 return 0;
698} 695}
699 696
700static void do_laptop_sync(struct work_struct *work) 697#ifdef CONFIG_BLOCK
698void laptop_mode_timer_fn(unsigned long data)
701{ 699{
702 wakeup_flusher_threads(0); 700 struct request_queue *q = (struct request_queue *)data;
703 kfree(work); 701 int nr_pages = global_page_state(NR_FILE_DIRTY) +
704} 702 global_page_state(NR_UNSTABLE_NFS);
705 703
706static void laptop_timer_fn(unsigned long unused) 704 /*
707{ 705 * We want to write everything out, not just down to the dirty
708 struct work_struct *work; 706 * threshold
707 */
709 708
710 work = kmalloc(sizeof(*work), GFP_ATOMIC); 709 if (bdi_has_dirty_io(&q->backing_dev_info))
711 if (work) { 710 bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages, 0);
712 INIT_WORK(work, do_laptop_sync);
713 schedule_work(work);
714 }
715} 711}
716 712
717/* 713/*
@@ -719,9 +715,9 @@ static void laptop_timer_fn(unsigned long unused)
719 * of all dirty data a few seconds from now. If the flush is already scheduled 715 * of all dirty data a few seconds from now. If the flush is already scheduled
720 * then push it back - the user is still using the disk. 716 * then push it back - the user is still using the disk.
721 */ 717 */
722void laptop_io_completion(void) 718void laptop_io_completion(struct backing_dev_info *info)
723{ 719{
724 mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode); 720 mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
725} 721}
726 722
727/* 723/*
@@ -731,8 +727,16 @@ void laptop_io_completion(void)
731 */ 727 */
732void laptop_sync_completion(void) 728void laptop_sync_completion(void)
733{ 729{
734 del_timer(&laptop_mode_wb_timer); 730 struct backing_dev_info *bdi;
731
732 rcu_read_lock();
733
734 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
735 del_timer(&bdi->laptop_mode_wb_timer);
736
737 rcu_read_unlock();
735} 738}
739#endif
736 740
737/* 741/*
738 * If ratelimit_pages is too high then we can get into dirty-data overload 742 * If ratelimit_pages is too high then we can get into dirty-data overload
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6cd0a8f90dc7..eb086e0f4dcc 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -139,7 +139,8 @@ static int discard_swap(struct swap_info_struct *si)
139 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); 139 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
140 if (nr_blocks) { 140 if (nr_blocks) {
141 err = blkdev_issue_discard(si->bdev, start_block, 141 err = blkdev_issue_discard(si->bdev, start_block,
142 nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); 142 nr_blocks, GFP_KERNEL,
143 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
143 if (err) 144 if (err)
144 return err; 145 return err;
145 cond_resched(); 146 cond_resched();
@@ -150,7 +151,8 @@ static int discard_swap(struct swap_info_struct *si)
150 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); 151 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
151 152
152 err = blkdev_issue_discard(si->bdev, start_block, 153 err = blkdev_issue_discard(si->bdev, start_block,
153 nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); 154 nr_blocks, GFP_KERNEL,
155 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
154 if (err) 156 if (err)
155 break; 157 break;
156 158
@@ -189,7 +191,8 @@ static void discard_swap_cluster(struct swap_info_struct *si,
189 start_block <<= PAGE_SHIFT - 9; 191 start_block <<= PAGE_SHIFT - 9;
190 nr_blocks <<= PAGE_SHIFT - 9; 192 nr_blocks <<= PAGE_SHIFT - 9;
191 if (blkdev_issue_discard(si->bdev, start_block, 193 if (blkdev_issue_discard(si->bdev, start_block,
192 nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER)) 194 nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT |
195 BLKDEV_IFL_BARRIER))
193 break; 196 break;
194 } 197 }
195 198
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c543dd252433..66d9c416851e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1406,12 +1406,13 @@ new_page:
1406/* 1406/*
1407 * Fill page/offset/length into spd, if it can hold more pages. 1407 * Fill page/offset/length into spd, if it can hold more pages.
1408 */ 1408 */
1409static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page, 1409static inline int spd_fill_page(struct splice_pipe_desc *spd,
1410 struct pipe_inode_info *pipe, struct page *page,
1410 unsigned int *len, unsigned int offset, 1411 unsigned int *len, unsigned int offset,
1411 struct sk_buff *skb, int linear, 1412 struct sk_buff *skb, int linear,
1412 struct sock *sk) 1413 struct sock *sk)
1413{ 1414{
1414 if (unlikely(spd->nr_pages == PIPE_BUFFERS)) 1415 if (unlikely(spd->nr_pages == pipe->buffers))
1415 return 1; 1416 return 1;
1416 1417
1417 if (linear) { 1418 if (linear) {
@@ -1447,7 +1448,8 @@ static inline int __splice_segment(struct page *page, unsigned int poff,
1447 unsigned int plen, unsigned int *off, 1448 unsigned int plen, unsigned int *off,
1448 unsigned int *len, struct sk_buff *skb, 1449 unsigned int *len, struct sk_buff *skb,
1449 struct splice_pipe_desc *spd, int linear, 1450 struct splice_pipe_desc *spd, int linear,
1450 struct sock *sk) 1451 struct sock *sk,
1452 struct pipe_inode_info *pipe)
1451{ 1453{
1452 if (!*len) 1454 if (!*len)
1453 return 1; 1455 return 1;
@@ -1470,7 +1472,7 @@ static inline int __splice_segment(struct page *page, unsigned int poff,
1470 /* the linear region may spread across several pages */ 1472 /* the linear region may spread across several pages */
1471 flen = min_t(unsigned int, flen, PAGE_SIZE - poff); 1473 flen = min_t(unsigned int, flen, PAGE_SIZE - poff);
1472 1474
1473 if (spd_fill_page(spd, page, &flen, poff, skb, linear, sk)) 1475 if (spd_fill_page(spd, pipe, page, &flen, poff, skb, linear, sk))
1474 return 1; 1476 return 1;
1475 1477
1476 __segment_seek(&page, &poff, &plen, flen); 1478 __segment_seek(&page, &poff, &plen, flen);
@@ -1485,9 +1487,9 @@ static inline int __splice_segment(struct page *page, unsigned int poff,
1485 * Map linear and fragment data from the skb to spd. It reports failure if the 1487 * Map linear and fragment data from the skb to spd. It reports failure if the
1486 * pipe is full or if we already spliced the requested length. 1488 * pipe is full or if we already spliced the requested length.
1487 */ 1489 */
1488static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, 1490static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
1489 unsigned int *len, struct splice_pipe_desc *spd, 1491 unsigned int *offset, unsigned int *len,
1490 struct sock *sk) 1492 struct splice_pipe_desc *spd, struct sock *sk)
1491{ 1493{
1492 int seg; 1494 int seg;
1493 1495
@@ -1497,7 +1499,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
1497 if (__splice_segment(virt_to_page(skb->data), 1499 if (__splice_segment(virt_to_page(skb->data),
1498 (unsigned long) skb->data & (PAGE_SIZE - 1), 1500 (unsigned long) skb->data & (PAGE_SIZE - 1),
1499 skb_headlen(skb), 1501 skb_headlen(skb),
1500 offset, len, skb, spd, 1, sk)) 1502 offset, len, skb, spd, 1, sk, pipe))
1501 return 1; 1503 return 1;
1502 1504
1503 /* 1505 /*
@@ -1507,7 +1509,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
1507 const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; 1509 const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
1508 1510
1509 if (__splice_segment(f->page, f->page_offset, f->size, 1511 if (__splice_segment(f->page, f->page_offset, f->size,
1510 offset, len, skb, spd, 0, sk)) 1512 offset, len, skb, spd, 0, sk, pipe))
1511 return 1; 1513 return 1;
1512 } 1514 }
1513 1515
@@ -1524,8 +1526,8 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
1524 struct pipe_inode_info *pipe, unsigned int tlen, 1526 struct pipe_inode_info *pipe, unsigned int tlen,
1525 unsigned int flags) 1527 unsigned int flags)
1526{ 1528{
1527 struct partial_page partial[PIPE_BUFFERS]; 1529 struct partial_page partial[PIPE_DEF_BUFFERS];
1528 struct page *pages[PIPE_BUFFERS]; 1530 struct page *pages[PIPE_DEF_BUFFERS];
1529 struct splice_pipe_desc spd = { 1531 struct splice_pipe_desc spd = {
1530 .pages = pages, 1532 .pages = pages,
1531 .partial = partial, 1533 .partial = partial,
@@ -1535,12 +1537,16 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
1535 }; 1537 };
1536 struct sk_buff *frag_iter; 1538 struct sk_buff *frag_iter;
1537 struct sock *sk = skb->sk; 1539 struct sock *sk = skb->sk;
1540 int ret = 0;
1541
1542 if (splice_grow_spd(pipe, &spd))
1543 return -ENOMEM;
1538 1544
1539 /* 1545 /*
1540 * __skb_splice_bits() only fails if the output has no room left, 1546 * __skb_splice_bits() only fails if the output has no room left,
1541 * so no point in going over the frag_list for the error case. 1547 * so no point in going over the frag_list for the error case.
1542 */ 1548 */
1543 if (__skb_splice_bits(skb, &offset, &tlen, &spd, sk)) 1549 if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk))
1544 goto done; 1550 goto done;
1545 else if (!tlen) 1551 else if (!tlen)
1546 goto done; 1552 goto done;
@@ -1551,14 +1557,12 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
1551 skb_walk_frags(skb, frag_iter) { 1557 skb_walk_frags(skb, frag_iter) {
1552 if (!tlen) 1558 if (!tlen)
1553 break; 1559 break;
1554 if (__skb_splice_bits(frag_iter, &offset, &tlen, &spd, sk)) 1560 if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk))
1555 break; 1561 break;
1556 } 1562 }
1557 1563
1558done: 1564done:
1559 if (spd.nr_pages) { 1565 if (spd.nr_pages) {
1560 int ret;
1561
1562 /* 1566 /*
1563 * Drop the socket lock, otherwise we have reverse 1567 * Drop the socket lock, otherwise we have reverse
1564 * locking dependencies between sk_lock and i_mutex 1568 * locking dependencies between sk_lock and i_mutex
@@ -1571,10 +1575,10 @@ done:
1571 release_sock(sk); 1575 release_sock(sk);
1572 ret = splice_to_pipe(pipe, &spd); 1576 ret = splice_to_pipe(pipe, &spd);
1573 lock_sock(sk); 1577 lock_sock(sk);
1574 return ret;
1575 } 1578 }
1576 1579
1577 return 0; 1580 splice_shrink_spd(pipe, &spd);
1581 return ret;
1578} 1582}
1579 1583
1580/** 1584/**