93 files changed, 3423 insertions, 1241 deletions
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 630879cd9a42..48e0b21b0059 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -17,6 +17,9 @@ HOWTO | |||
17 | You can do a very simple testing of running two dd threads in two different | 17 | You can do a very simple testing of running two dd threads in two different |
18 | cgroups. Here is what you can do. | 18 | cgroups. Here is what you can do. |
19 | 19 | ||
20 | - Enable Block IO controller | ||
21 | CONFIG_BLK_CGROUP=y | ||
22 | |||
20 | - Enable group scheduling in CFQ | 23 | - Enable group scheduling in CFQ |
21 | CONFIG_CFQ_GROUP_IOSCHED=y | 24 | CONFIG_CFQ_GROUP_IOSCHED=y |
22 | 25 | ||
@@ -54,32 +57,52 @@ cgroups. Here is what you can do. | |||
54 | 57 | ||
55 | Various user visible config options | 58 | Various user visible config options |
56 | =================================== | 59 | =================================== |
57 | CONFIG_CFQ_GROUP_IOSCHED | ||
58 | - Enables group scheduling in CFQ. Currently only 1 level of group | ||
59 | creation is allowed. | ||
60 | |||
61 | CONFIG_DEBUG_CFQ_IOSCHED | ||
62 | - Enables some debugging messages in blktrace. Also creates extra | ||
63 | cgroup file blkio.dequeue. | ||
64 | |||
65 | Config options selected automatically | ||
66 | ===================================== | ||
67 | These config options are not user visible and are selected/deselected | ||
68 | automatically based on IO scheduler configuration. | ||
69 | |||
70 | CONFIG_BLK_CGROUP | 60 | CONFIG_BLK_CGROUP |
71 | - Block IO controller. Selected by CONFIG_CFQ_GROUP_IOSCHED. | 61 | - Block IO controller. |
72 | 62 | ||
73 | CONFIG_DEBUG_BLK_CGROUP | 63 | CONFIG_DEBUG_BLK_CGROUP |
74 | - Debug help. Selected by CONFIG_DEBUG_CFQ_IOSCHED. | 64 | - Debug help. Right now some additional stats files show up in the cgroup |
65 | if this option is enabled. | ||
66 | |||
67 | CONFIG_CFQ_GROUP_IOSCHED | ||
68 | - Enables group scheduling in CFQ. Currently only 1 level of group | ||
69 | creation is allowed. | ||
75 | 70 | ||
76 | Details of cgroup files | 71 | Details of cgroup files |
77 | ======================= | 72 | ======================= |
78 | - blkio.weight | 73 | - blkio.weight |
79 | - Specifies per cgroup weight. | 74 | - Specifies per cgroup weight. This is the default weight of the group |
80 | 75 | on all the devices, unless overridden by a per-device rule. |
76 | (See blkio.weight_device). | ||
81 | Currently allowed range of weights is from 100 to 1000. | 77 | Currently allowed range of weights is from 100 to 1000. |
82 | 78 | ||
79 | - blkio.weight_device | ||
80 | - One can specify per cgroup per device rules using this interface. | ||
81 | These rules override the default value of group weight as specified | ||
82 | by blkio.weight. | ||
83 | |||
84 | Following is the format. | ||
85 | |||
86 | # echo dev_maj:dev_minor weight > /path/to/cgroup/blkio.weight_device |
87 | Configure weight=300 on /dev/sdb (8:16) in this cgroup | ||
88 | # echo 8:16 300 > blkio.weight_device | ||
89 | # cat blkio.weight_device | ||
90 | dev weight | ||
91 | 8:16 300 | ||
92 | |||
93 | Configure weight=500 on /dev/sda (8:0) in this cgroup | ||
94 | # echo 8:0 500 > blkio.weight_device | ||
95 | # cat blkio.weight_device | ||
96 | dev weight | ||
97 | 8:0 500 | ||
98 | 8:16 300 | ||
99 | |||
100 | Remove specific weight for /dev/sda in this cgroup | ||
101 | # echo 8:0 0 > blkio.weight_device | ||
102 | # cat blkio.weight_device | ||
103 | dev weight | ||
104 | 8:16 300 | ||
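
Note that a per-device rule, once set, takes precedence for that device:
with this patch, a later write to blkio.weight updates only the devices
that have no per-device rule (see the blkiocg_weight_write() change further
below in this patch). For example:

# echo 8:16 300 > blkio.weight_device
# echo 500 > blkio.weight
Device 8:16 keeps weight 300 while all other devices now use the default 500.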
105 | |||
83 | - blkio.time | 106 | - blkio.time |
84 | - disk time allocated to cgroup per device in milliseconds. First | 107 | - disk time allocated to cgroup per device in milliseconds. First |
85 | two fields specify the major and minor number of the device and | 108 | two fields specify the major and minor number of the device and |
@@ -92,13 +115,105 @@ Details of cgroup files | |||
92 | third field specifies the number of sectors transferred by the | 115 | third field specifies the number of sectors transferred by the |
93 | group to/from the device. | 116 | group to/from the device. |
94 | 117 | ||
118 | - blkio.io_service_bytes | ||
119 | - Number of bytes transferred to/from the disk by the group. These | ||
120 | are further divided by the type of operation - read or write, sync | ||
121 | or async. First two fields specify the major and minor number of the | ||
122 | device, third field specifies the operation type and the fourth field | ||
123 | specifies the number of bytes. | ||
124 | |||
125 | - blkio.io_serviced | ||
126 | - Number of IOs completed to/from the disk by the group. These | ||
127 | are further divided by the type of operation - read or write, sync | ||
128 | or async. First two fields specify the major and minor number of the | ||
129 | device, third field specifies the operation type and the fourth field | ||
130 | specifies the number of IOs. | ||
131 | |||
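As an illustration (the values are made up), the per-device output produced
by blkio_get_stat() further below has this shape for a cgroup that issued a
small synchronous read against /dev/sdb (8:16):

# cat blkio.io_service_bytes
8:16 Read 122880
8:16 Write 0
8:16 Sync 122880
8:16 Async 0
8:16 Total 122880
Total 122880
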
132 | - blkio.io_service_time | ||
133 | - Total amount of time between request dispatch and request completion | ||
134 | for the IOs done by this cgroup. This is in nanoseconds to make it | ||
135 | meaningful for flash devices too. For devices with queue depth of 1, | ||
136 | this time represents the actual service time. When queue_depth > 1, | ||
137 | that is no longer true as requests may be served out of order. This | ||
138 | may cause the service time for a given IO to include the service time | ||
139 | of multiple IOs when served out of order which may result in total | ||
140 | io_service_time > actual time elapsed. This time is further divided by | ||
141 | the type of operation - read or write, sync or async. First two fields | ||
142 | specify the major and minor number of the device, third field | ||
143 | specifies the operation type and the fourth field specifies the | ||
144 | io_service_time in ns. | ||
145 | |||
146 | - blkio.io_wait_time | ||
147 | - Total amount of time the IOs for this cgroup spent waiting in the | ||
148 | scheduler queues for service. This can be greater than the total time | ||
149 | elapsed since it is cumulative io_wait_time for all IOs. It is not a | ||
150 | measure of total time the cgroup spent waiting but rather a measure of | ||
151 | the wait_time for its individual IOs. For devices with queue_depth > 1 | ||
152 | this metric does not include the time an IO spends waiting for service |
153 | after it has been dispatched to the device but before it is serviced |
154 | (there might be a time lag here due to re-ordering of requests by the | ||
155 | device). This is in nanoseconds to make it meaningful for flash | ||
156 | devices too. This time is further divided by the type of operation - | ||
157 | read or write, sync or async. First two fields specify the major and | ||
158 | minor number of the device, third field specifies the operation type | ||
159 | and the fourth field specifies the io_wait_time in ns. | ||
160 | |||
161 | - blkio.io_merged | ||
162 | - Total number of bios/requests merged into requests belonging to this | ||
163 | cgroup. This is further divided by the type of operation - read or | ||
164 | write, sync or async. | ||
165 | |||
166 | - blkio.io_queued | ||
167 | - Total number of requests queued up at any given instant for this | ||
168 | cgroup. This is further divided by the type of operation - read or | ||
169 | write, sync or async. | ||
170 | |||
171 | - blkio.avg_queue_size | ||
172 | - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. | ||
173 | The average queue size for this cgroup over the entire time of this | ||
174 | cgroup's existence. Queue size samples are taken each time one of the | ||
175 | queues of this cgroup gets a timeslice. | ||
176 | |||
177 | - blkio.group_wait_time | ||
178 | - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. | ||
179 | This is the amount of time the cgroup had to wait since it became busy | ||
180 | (i.e., went from 0 to 1 request queued) to get a timeslice for one of | ||
181 | its queues. This is different from the io_wait_time which is the | ||
182 | cumulative total of the amount of time spent by each IO in that cgroup | ||
183 | waiting in the scheduler queue. This is in nanoseconds. If this is | ||
184 | read when the cgroup is in a waiting (for timeslice) state, the stat | ||
185 | will only report the group_wait_time accumulated till the last time it | ||
186 | got a timeslice and will not include the current delta. | ||
187 | |||
188 | - blkio.empty_time | ||
189 | - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. | ||
190 | This is the amount of time a cgroup spends without any pending | ||
191 | requests when not being served, i.e., it does not include any time | ||
192 | spent idling for one of the queues of the cgroup. This is in | ||
193 | nanoseconds. If this is read when the cgroup is in an empty state, | ||
194 | the stat will only report the empty_time accumulated till the last | ||
195 | time it had a pending request and will not include the current delta. | ||
196 | |||
197 | - blkio.idle_time | ||
198 | - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. | ||
199 | This is the amount of time spent by the IO scheduler idling for a | ||
200 | given cgroup in anticipation of a better request than the existing ones |
201 | from other queues/cgroups. This is in nanoseconds. If this is read | ||
202 | when the cgroup is in an idling state, the stat will only report the | ||
203 | idle_time accumulated till the last idle period and will not include | ||
204 | the current delta. | ||
205 | |||
95 | - blkio.dequeue | 206 | - blkio.dequeue |
96 | - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This | 207 | - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This |
97 | gives the statistics about how many times a group was dequeued | 208 | gives the statistics about how many times a group was dequeued |
98 | from service tree of the device. First two fields specify the major | 209 | from service tree of the device. First two fields specify the major |
99 | and minor number of the device and third field specifies the number | 210 | and minor number of the device and third field specifies the number |
100 | of times a group was dequeued from a particular device. | 211 | of times a group was dequeued from a particular device. |
101 | 212 | ||
213 | - blkio.reset_stats | ||
214 | - Writing an int to this file will result in resetting all the stats | ||
215 | for that cgroup. | ||
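
For example, writing any integer resets the counters for this cgroup:

# echo 1 > blkio.reset_stats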
216 | |||
102 | CFQ sysfs tunable | 217 | CFQ sysfs tunable |
103 | ================= | 218 | ================= |
104 | /sys/block/<disk>/queue/iosched/group_isolation | 219 | /sys/block/<disk>/queue/iosched/group_isolation |
diff --git a/block/Kconfig b/block/Kconfig
index f9e89f4d94bb..9be0b56eaee1 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -77,29 +77,6 @@ config BLK_DEV_INTEGRITY | |||
77 | T10/SCSI Data Integrity Field or the T13/ATA External Path | 77 | T10/SCSI Data Integrity Field or the T13/ATA External Path |
78 | Protection. If in doubt, say N. | 78 | Protection. If in doubt, say N. |
79 | 79 | ||
80 | config BLK_CGROUP | ||
81 | tristate "Block cgroup support" | ||
82 | depends on CGROUPS | ||
83 | depends on CFQ_GROUP_IOSCHED | ||
84 | default n | ||
85 | ---help--- | ||
86 | Generic block IO controller cgroup interface. This is the common | ||
87 | cgroup interface which should be used by various IO controlling | ||
88 | policies. | ||
89 | |||
90 | Currently, CFQ IO scheduler uses it to recognize task groups and | ||
91 | control disk bandwidth allocation (proportional time slice allocation) | ||
92 | to such task groups. | ||
93 | |||
94 | config DEBUG_BLK_CGROUP | ||
95 | bool | ||
96 | depends on BLK_CGROUP | ||
97 | default n | ||
98 | ---help--- | ||
99 | Enable some debugging help. Currently it stores the cgroup path | ||
100 | in the blk group which can be used by cfq for tracing various | ||
101 | group related activity. | ||
102 | |||
103 | endif # BLOCK | 80 | endif # BLOCK |
104 | 81 | ||
105 | config BLOCK_COMPAT | 82 | config BLOCK_COMPAT |
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index fc71cf071fb2..3199b76f795d 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -23,7 +23,8 @@ config IOSCHED_DEADLINE | |||
23 | 23 | ||
24 | config IOSCHED_CFQ | 24 | config IOSCHED_CFQ |
25 | tristate "CFQ I/O scheduler" | 25 | tristate "CFQ I/O scheduler" |
26 | select BLK_CGROUP if CFQ_GROUP_IOSCHED | 26 | # If BLK_CGROUP is a module, CFQ has to be built as module. |
27 | depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y | ||
27 | default y | 28 | default y |
28 | ---help--- | 29 | ---help--- |
29 | The CFQ I/O scheduler tries to distribute bandwidth equally | 30 | The CFQ I/O scheduler tries to distribute bandwidth equally |
@@ -33,22 +34,15 @@ config IOSCHED_CFQ | |||
33 | 34 | ||
34 | This is the default I/O scheduler. | 35 | This is the default I/O scheduler. |
35 | 36 | ||
37 | Note: If BLK_CGROUP=m, then CFQ can be built only as module. | ||
38 | |||
36 | config CFQ_GROUP_IOSCHED | 39 | config CFQ_GROUP_IOSCHED |
37 | bool "CFQ Group Scheduling support" | 40 | bool "CFQ Group Scheduling support" |
38 | depends on IOSCHED_CFQ && CGROUPS | 41 | depends on IOSCHED_CFQ && BLK_CGROUP |
39 | default n | 42 | default n |
40 | ---help--- | 43 | ---help--- |
41 | Enable group IO scheduling in CFQ. | 44 | Enable group IO scheduling in CFQ. |
42 | 45 | ||
43 | config DEBUG_CFQ_IOSCHED | ||
44 | bool "Debug CFQ Scheduling" | ||
45 | depends on CFQ_GROUP_IOSCHED | ||
46 | select DEBUG_BLK_CGROUP | ||
47 | default n | ||
48 | ---help--- | ||
49 | Enable CFQ IO scheduling debugging in CFQ. Currently it makes | ||
50 | blktrace output more verbose. | ||
51 | |||
52 | choice | 46 | choice |
53 | prompt "Default I/O scheduler" | 47 | prompt "Default I/O scheduler" |
54 | default DEFAULT_CFQ | 48 | default DEFAULT_CFQ |
diff --git a/block/Makefile b/block/Makefile
index cb2d515ebd6e..0bb499a739cd 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@ | |||
5 | obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ | 5 | obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ |
6 | blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ | 6 | blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ |
7 | blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ | 7 | blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ |
8 | blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o | 8 | blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o |
9 | 9 | ||
10 | obj-$(CONFIG_BLK_DEV_BSG) += bsg.o | 10 | obj-$(CONFIG_BLK_DEV_BSG) += bsg.o |
11 | obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o | 11 | obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o |
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 6d88544b677f..0d710c9d403b 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -286,26 +286,31 @@ static void bio_end_empty_barrier(struct bio *bio, int err) | |||
286 | set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); | 286 | set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); |
287 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | 287 | clear_bit(BIO_UPTODATE, &bio->bi_flags); |
288 | } | 288 | } |
289 | 289 | if (bio->bi_private) | |
290 | complete(bio->bi_private); | 290 | complete(bio->bi_private); |
291 | bio_put(bio); | ||
291 | } | 292 | } |
292 | 293 | ||
293 | /** | 294 | /** |
294 | * blkdev_issue_flush - queue a flush | 295 | * blkdev_issue_flush - queue a flush |
295 | * @bdev: blockdev to issue flush for | 296 | * @bdev: blockdev to issue flush for |
297 | * @gfp_mask: memory allocation flags (for bio_alloc) | ||
296 | * @error_sector: error sector | 298 | * @error_sector: error sector |
299 | * @flags: BLKDEV_IFL_* flags to control behaviour | ||
297 | * | 300 | * |
298 | * Description: | 301 | * Description: |
299 | * Issue a flush for the block device in question. Caller can supply | 302 | * Issue a flush for the block device in question. Caller can supply |
300 | * room for storing the error offset in case of a flush error, if they | 303 | * room for storing the error offset in case of a flush error, if they |
301 | * wish to. | 304 | * wish to. If WAIT flag is not passed then caller may check only what |
305 | * request was pushed in some internal queue for later handling. | ||
302 | */ | 306 | */ |
303 | int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) | 307 | int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, |
308 | sector_t *error_sector, unsigned long flags) | ||
304 | { | 309 | { |
305 | DECLARE_COMPLETION_ONSTACK(wait); | 310 | DECLARE_COMPLETION_ONSTACK(wait); |
306 | struct request_queue *q; | 311 | struct request_queue *q; |
307 | struct bio *bio; | 312 | struct bio *bio; |
308 | int ret; | 313 | int ret = 0; |
309 | 314 | ||
310 | if (bdev->bd_disk == NULL) | 315 | if (bdev->bd_disk == NULL) |
311 | return -ENXIO; | 316 | return -ENXIO; |
@@ -314,23 +319,25 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) | |||
314 | if (!q) | 319 | if (!q) |
315 | return -ENXIO; | 320 | return -ENXIO; |
316 | 321 | ||
317 | bio = bio_alloc(GFP_KERNEL, 0); | 322 | bio = bio_alloc(gfp_mask, 0); |
318 | bio->bi_end_io = bio_end_empty_barrier; | 323 | bio->bi_end_io = bio_end_empty_barrier; |
319 | bio->bi_private = &wait; | ||
320 | bio->bi_bdev = bdev; | 324 | bio->bi_bdev = bdev; |
321 | submit_bio(WRITE_BARRIER, bio); | 325 | if (test_bit(BLKDEV_WAIT, &flags)) |
322 | 326 | bio->bi_private = &wait; | |
323 | wait_for_completion(&wait); | ||
324 | 327 | ||
325 | /* | 328 | bio_get(bio); |
326 | * The driver must store the error location in ->bi_sector, if | 329 | submit_bio(WRITE_BARRIER, bio); |
327 | * it supports it. For non-stacked drivers, this should be copied | 330 | if (test_bit(BLKDEV_WAIT, &flags)) { |
328 | * from blk_rq_pos(rq). | 331 | wait_for_completion(&wait); |
329 | */ | 332 | /* |
330 | if (error_sector) | 333 | * The driver must store the error location in ->bi_sector, if |
331 | *error_sector = bio->bi_sector; | 334 | * it supports it. For non-stacked drivers, this should be |
335 | * copied from blk_rq_pos(rq). | ||
336 | */ | ||
337 | if (error_sector) | ||
338 | *error_sector = bio->bi_sector; | ||
339 | } | ||
332 | 340 | ||
333 | ret = 0; | ||
334 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) | 341 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) |
335 | ret = -EOPNOTSUPP; | 342 | ret = -EOPNOTSUPP; |
336 | else if (!bio_flagged(bio, BIO_UPTODATE)) | 343 | else if (!bio_flagged(bio, BIO_UPTODATE)) |
@@ -340,107 +347,3 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) | |||
340 | return ret; | 347 | return ret; |
341 | } | 348 | } |
342 | EXPORT_SYMBOL(blkdev_issue_flush); | 349 | EXPORT_SYMBOL(blkdev_issue_flush); |
343 | |||
344 | static void blkdev_discard_end_io(struct bio *bio, int err) | ||
345 | { | ||
346 | if (err) { | ||
347 | if (err == -EOPNOTSUPP) | ||
348 | set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); | ||
349 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
350 | } | ||
351 | |||
352 | if (bio->bi_private) | ||
353 | complete(bio->bi_private); | ||
354 | __free_page(bio_page(bio)); | ||
355 | |||
356 | bio_put(bio); | ||
357 | } | ||
358 | |||
359 | /** | ||
360 | * blkdev_issue_discard - queue a discard | ||
361 | * @bdev: blockdev to issue discard for | ||
362 | * @sector: start sector | ||
363 | * @nr_sects: number of sectors to discard | ||
364 | * @gfp_mask: memory allocation flags (for bio_alloc) | ||
365 | * @flags: DISCARD_FL_* flags to control behaviour | ||
366 | * | ||
367 | * Description: | ||
368 | * Issue a discard request for the sectors in question. | ||
369 | */ | ||
370 | int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | ||
371 | sector_t nr_sects, gfp_t gfp_mask, int flags) | ||
372 | { | ||
373 | DECLARE_COMPLETION_ONSTACK(wait); | ||
374 | struct request_queue *q = bdev_get_queue(bdev); | ||
375 | int type = flags & DISCARD_FL_BARRIER ? | ||
376 | DISCARD_BARRIER : DISCARD_NOBARRIER; | ||
377 | struct bio *bio; | ||
378 | struct page *page; | ||
379 | int ret = 0; | ||
380 | |||
381 | if (!q) | ||
382 | return -ENXIO; | ||
383 | |||
384 | if (!blk_queue_discard(q)) | ||
385 | return -EOPNOTSUPP; | ||
386 | |||
387 | while (nr_sects && !ret) { | ||
388 | unsigned int sector_size = q->limits.logical_block_size; | ||
389 | unsigned int max_discard_sectors = | ||
390 | min(q->limits.max_discard_sectors, UINT_MAX >> 9); | ||
391 | |||
392 | bio = bio_alloc(gfp_mask, 1); | ||
393 | if (!bio) | ||
394 | goto out; | ||
395 | bio->bi_sector = sector; | ||
396 | bio->bi_end_io = blkdev_discard_end_io; | ||
397 | bio->bi_bdev = bdev; | ||
398 | if (flags & DISCARD_FL_WAIT) | ||
399 | bio->bi_private = &wait; | ||
400 | |||
401 | /* | ||
402 | * Add a zeroed one-sector payload as that's what | ||
403 | * our current implementations need. If we'll ever need | ||
404 | * more the interface will need revisiting. | ||
405 | */ | ||
406 | page = alloc_page(gfp_mask | __GFP_ZERO); | ||
407 | if (!page) | ||
408 | goto out_free_bio; | ||
409 | if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size) | ||
410 | goto out_free_page; | ||
411 | |||
412 | /* | ||
413 | * And override the bio size - the way discard works we | ||
414 | * touch many more blocks on disk than the actual payload | ||
415 | * length. | ||
416 | */ | ||
417 | if (nr_sects > max_discard_sectors) { | ||
418 | bio->bi_size = max_discard_sectors << 9; | ||
419 | nr_sects -= max_discard_sectors; | ||
420 | sector += max_discard_sectors; | ||
421 | } else { | ||
422 | bio->bi_size = nr_sects << 9; | ||
423 | nr_sects = 0; | ||
424 | } | ||
425 | |||
426 | bio_get(bio); | ||
427 | submit_bio(type, bio); | ||
428 | |||
429 | if (flags & DISCARD_FL_WAIT) | ||
430 | wait_for_completion(&wait); | ||
431 | |||
432 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) | ||
433 | ret = -EOPNOTSUPP; | ||
434 | else if (!bio_flagged(bio, BIO_UPTODATE)) | ||
435 | ret = -EIO; | ||
436 | bio_put(bio); | ||
437 | } | ||
438 | return ret; | ||
439 | out_free_page: | ||
440 | __free_page(page); | ||
441 | out_free_bio: | ||
442 | bio_put(bio); | ||
443 | out: | ||
444 | return -ENOMEM; | ||
445 | } | ||
446 | EXPORT_SYMBOL(blkdev_issue_discard); | ||
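
The discard helper removed above is not dropped outright: the Makefile hunk
earlier adds blk-lib.o, so blkdev_issue_discard() presumably moves to the new
block/blk-lib.c (not shown in this excerpt). For the flush side, a minimal
sketch of a caller adapted to the new blkdev_issue_flush() signature follows;
BLKDEV_IFL_WAIT is assumed here to be the BLKDEV_IFL_* flag that sets the
BLKDEV_WAIT bit tested above, defined elsewhere in this series rather than in
this hunk.

#include <linux/blkdev.h>

/* Flush bdev's write cache, wait for completion, report the error offset. */
static int example_flush(struct block_device *bdev, sector_t *error_sector)
{
	return blkdev_issue_flush(bdev, GFP_KERNEL, error_sector,
				  BLKDEV_IFL_WAIT);
}
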
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2cc682b860ea..a6809645d212 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -15,8 +15,12 @@ | |||
15 | #include <linux/kdev_t.h> | 15 | #include <linux/kdev_t.h> |
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
17 | #include <linux/err.h> | 17 | #include <linux/err.h> |
18 | #include <linux/blkdev.h> | ||
18 | #include <linux/slab.h> | 19 | #include <linux/slab.h> |
19 | #include "blk-cgroup.h" | 20 | #include "blk-cgroup.h" |
21 | #include <linux/genhd.h> | ||
22 | |||
23 | #define MAX_KEY_LEN 100 | ||
20 | 24 | ||
21 | static DEFINE_SPINLOCK(blkio_list_lock); | 25 | static DEFINE_SPINLOCK(blkio_list_lock); |
22 | static LIST_HEAD(blkio_list); | 26 | static LIST_HEAD(blkio_list); |
@@ -49,6 +53,32 @@ struct cgroup_subsys blkio_subsys = { | |||
49 | }; | 53 | }; |
50 | EXPORT_SYMBOL_GPL(blkio_subsys); | 54 | EXPORT_SYMBOL_GPL(blkio_subsys); |
51 | 55 | ||
56 | static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg, | ||
57 | struct blkio_policy_node *pn) | ||
58 | { | ||
59 | list_add(&pn->node, &blkcg->policy_list); | ||
60 | } | ||
61 | |||
62 | /* Must be called with blkcg->lock held */ | ||
63 | static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) | ||
64 | { | ||
65 | list_del(&pn->node); | ||
66 | } | ||
67 | |||
68 | /* Must be called with blkcg->lock held */ | ||
69 | static struct blkio_policy_node * | ||
70 | blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev) | ||
71 | { | ||
72 | struct blkio_policy_node *pn; | ||
73 | |||
74 | list_for_each_entry(pn, &blkcg->policy_list, node) { | ||
75 | if (pn->dev == dev) | ||
76 | return pn; | ||
77 | } | ||
78 | |||
79 | return NULL; | ||
80 | } | ||
81 | |||
52 | struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) | 82 | struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) |
53 | { | 83 | { |
54 | return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), | 84 | return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), |
@@ -56,13 +86,259 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) | |||
56 | } | 86 | } |
57 | EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); | 87 | EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); |
58 | 88 | ||
59 | void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, | 89 | /* |
60 | unsigned long time, unsigned long sectors) | 90 | * Add to the appropriate stat variable depending on the request type. |
91 | * This should be called with the blkg->stats_lock held. | ||
92 | */ | ||
93 | static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction, | ||
94 | bool sync) | ||
95 | { | ||
96 | if (direction) | ||
97 | stat[BLKIO_STAT_WRITE] += add; | ||
98 | else | ||
99 | stat[BLKIO_STAT_READ] += add; | ||
100 | if (sync) | ||
101 | stat[BLKIO_STAT_SYNC] += add; | ||
102 | else | ||
103 | stat[BLKIO_STAT_ASYNC] += add; | ||
104 | } | ||
105 | |||
106 | /* | ||
107 | * Decrements the appropriate stat variable if non-zero depending on the | ||
108 | * request type. Panics on value being zero. | ||
109 | * This should be called with the blkg->stats_lock held. | ||
110 | */ | ||
111 | static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync) | ||
112 | { | ||
113 | if (direction) { | ||
114 | BUG_ON(stat[BLKIO_STAT_WRITE] == 0); | ||
115 | stat[BLKIO_STAT_WRITE]--; | ||
116 | } else { | ||
117 | BUG_ON(stat[BLKIO_STAT_READ] == 0); | ||
118 | stat[BLKIO_STAT_READ]--; | ||
119 | } | ||
120 | if (sync) { | ||
121 | BUG_ON(stat[BLKIO_STAT_SYNC] == 0); | ||
122 | stat[BLKIO_STAT_SYNC]--; | ||
123 | } else { | ||
124 | BUG_ON(stat[BLKIO_STAT_ASYNC] == 0); | ||
125 | stat[BLKIO_STAT_ASYNC]--; | ||
126 | } | ||
127 | } | ||
128 | |||
129 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
130 | /* This should be called with the blkg->stats_lock held. */ | ||
131 | static void blkio_set_start_group_wait_time(struct blkio_group *blkg, | ||
132 | struct blkio_group *curr_blkg) | ||
133 | { | ||
134 | if (blkio_blkg_waiting(&blkg->stats)) | ||
135 | return; | ||
136 | if (blkg == curr_blkg) | ||
137 | return; | ||
138 | blkg->stats.start_group_wait_time = sched_clock(); | ||
139 | blkio_mark_blkg_waiting(&blkg->stats); | ||
140 | } | ||
141 | |||
142 | /* This should be called with the blkg->stats_lock held. */ | ||
143 | static void blkio_update_group_wait_time(struct blkio_group_stats *stats) | ||
144 | { | ||
145 | unsigned long long now; | ||
146 | |||
147 | if (!blkio_blkg_waiting(stats)) | ||
148 | return; | ||
149 | |||
150 | now = sched_clock(); | ||
151 | if (time_after64(now, stats->start_group_wait_time)) | ||
152 | stats->group_wait_time += now - stats->start_group_wait_time; | ||
153 | blkio_clear_blkg_waiting(stats); | ||
154 | } | ||
155 | |||
156 | /* This should be called with the blkg->stats_lock held. */ | ||
157 | static void blkio_end_empty_time(struct blkio_group_stats *stats) | ||
158 | { | ||
159 | unsigned long long now; | ||
160 | |||
161 | if (!blkio_blkg_empty(stats)) | ||
162 | return; | ||
163 | |||
164 | now = sched_clock(); | ||
165 | if (time_after64(now, stats->start_empty_time)) | ||
166 | stats->empty_time += now - stats->start_empty_time; | ||
167 | blkio_clear_blkg_empty(stats); | ||
168 | } | ||
169 | |||
170 | void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) | ||
171 | { | ||
172 | unsigned long flags; | ||
173 | |||
174 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
175 | BUG_ON(blkio_blkg_idling(&blkg->stats)); | ||
176 | blkg->stats.start_idle_time = sched_clock(); | ||
177 | blkio_mark_blkg_idling(&blkg->stats); | ||
178 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
179 | } | ||
180 | EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats); | ||
181 | |||
182 | void blkiocg_update_idle_time_stats(struct blkio_group *blkg) | ||
183 | { | ||
184 | unsigned long flags; | ||
185 | unsigned long long now; | ||
186 | struct blkio_group_stats *stats; | ||
187 | |||
188 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
189 | stats = &blkg->stats; | ||
190 | if (blkio_blkg_idling(stats)) { | ||
191 | now = sched_clock(); | ||
192 | if (time_after64(now, stats->start_idle_time)) | ||
193 | stats->idle_time += now - stats->start_idle_time; | ||
194 | blkio_clear_blkg_idling(stats); | ||
195 | } | ||
196 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
197 | } | ||
198 | EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats); | ||
199 | |||
200 | void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) | ||
201 | { | ||
202 | unsigned long flags; | ||
203 | struct blkio_group_stats *stats; | ||
204 | |||
205 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
206 | stats = &blkg->stats; | ||
207 | stats->avg_queue_size_sum += | ||
208 | stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] + | ||
209 | stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]; | ||
210 | stats->avg_queue_size_samples++; | ||
211 | blkio_update_group_wait_time(stats); | ||
212 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
213 | } | ||
214 | EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats); | ||
215 | |||
216 | void blkiocg_set_start_empty_time(struct blkio_group *blkg) | ||
217 | { | ||
218 | unsigned long flags; | ||
219 | struct blkio_group_stats *stats; | ||
220 | |||
221 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
222 | stats = &blkg->stats; | ||
223 | |||
224 | if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] || | ||
225 | stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) { | ||
226 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
227 | return; | ||
228 | } | ||
229 | |||
230 | /* | ||
231 | * group is already marked empty. This can happen if cfqq got new | ||
232 | * request in parent group and moved to this group while being added | ||
233 | * to service tree. Just ignore the event and move on. | ||
234 | */ | ||
235 | if (blkio_blkg_empty(stats)) { |
236 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
237 | return; | ||
238 | } | ||
239 | |||
240 | stats->start_empty_time = sched_clock(); | ||
241 | blkio_mark_blkg_empty(stats); | ||
242 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
243 | } | ||
244 | EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time); | ||
245 | |||
246 | void blkiocg_update_dequeue_stats(struct blkio_group *blkg, | ||
247 | unsigned long dequeue) | ||
248 | { | ||
249 | blkg->stats.dequeue += dequeue; | ||
250 | } | ||
251 | EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats); | ||
252 | #else | ||
253 | static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg, | ||
254 | struct blkio_group *curr_blkg) {} | ||
255 | static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {} | ||
256 | #endif | ||
257 | |||
258 | void blkiocg_update_io_add_stats(struct blkio_group *blkg, | ||
259 | struct blkio_group *curr_blkg, bool direction, | ||
260 | bool sync) | ||
261 | { | ||
262 | unsigned long flags; | ||
263 | |||
264 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
265 | blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction, | ||
266 | sync); | ||
267 | blkio_end_empty_time(&blkg->stats); | ||
268 | blkio_set_start_group_wait_time(blkg, curr_blkg); | ||
269 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
270 | } | ||
271 | EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats); | ||
272 | |||
273 | void blkiocg_update_io_remove_stats(struct blkio_group *blkg, | ||
274 | bool direction, bool sync) | ||
275 | { | ||
276 | unsigned long flags; | ||
277 | |||
278 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
279 | blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], | ||
280 | direction, sync); | ||
281 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
282 | } | ||
283 | EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); | ||
284 | |||
285 | void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) | ||
286 | { | ||
287 | unsigned long flags; | ||
288 | |||
289 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
290 | blkg->stats.time += time; | ||
291 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
292 | } | ||
293 | EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); | ||
294 | |||
295 | void blkiocg_update_dispatch_stats(struct blkio_group *blkg, | ||
296 | uint64_t bytes, bool direction, bool sync) | ||
61 | { | 297 | { |
62 | blkg->time += time; | 298 | struct blkio_group_stats *stats; |
63 | blkg->sectors += sectors; | 299 | unsigned long flags; |
300 | |||
301 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
302 | stats = &blkg->stats; | ||
303 | stats->sectors += bytes >> 9; | ||
304 | blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction, | ||
305 | sync); | ||
306 | blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes, | ||
307 | direction, sync); | ||
308 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
64 | } | 309 | } |
65 | EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats); | 310 | EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); |
311 | |||
312 | void blkiocg_update_completion_stats(struct blkio_group *blkg, | ||
313 | uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) | ||
314 | { | ||
315 | struct blkio_group_stats *stats; | ||
316 | unsigned long flags; | ||
317 | unsigned long long now = sched_clock(); | ||
318 | |||
319 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
320 | stats = &blkg->stats; | ||
321 | if (time_after64(now, io_start_time)) | ||
322 | blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME], | ||
323 | now - io_start_time, direction, sync); | ||
324 | if (time_after64(io_start_time, start_time)) | ||
325 | blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME], | ||
326 | io_start_time - start_time, direction, sync); | ||
327 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
328 | } | ||
329 | EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); | ||
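
A short sketch of the timeline blkiocg_update_completion_stats() assumes;
start_time and io_start_time are recorded by request-tracking code elsewhere
in this series, not in this hunk:

	/*
	 * start_time:    request queued into the scheduler
	 * io_start_time: request dispatched to the driver
	 * now:           request completed
	 *
	 * io_wait_time    += io_start_time - start_time
	 * io_service_time += now - io_start_time
	 */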
330 | |||
331 | void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, | ||
332 | bool sync) | ||
333 | { | ||
334 | unsigned long flags; | ||
335 | |||
336 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
337 | blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction, | ||
338 | sync); | ||
339 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
340 | } | ||
341 | EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); | ||
66 | 342 | ||
67 | void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | 343 | void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, |
68 | struct blkio_group *blkg, void *key, dev_t dev) | 344 | struct blkio_group *blkg, void *key, dev_t dev) |
@@ -70,14 +346,13 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | |||
70 | unsigned long flags; | 346 | unsigned long flags; |
71 | 347 | ||
72 | spin_lock_irqsave(&blkcg->lock, flags); | 348 | spin_lock_irqsave(&blkcg->lock, flags); |
349 | spin_lock_init(&blkg->stats_lock); | ||
73 | rcu_assign_pointer(blkg->key, key); | 350 | rcu_assign_pointer(blkg->key, key); |
74 | blkg->blkcg_id = css_id(&blkcg->css); | 351 | blkg->blkcg_id = css_id(&blkcg->css); |
75 | hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); | 352 | hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); |
76 | spin_unlock_irqrestore(&blkcg->lock, flags); | 353 | spin_unlock_irqrestore(&blkcg->lock, flags); |
77 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
78 | /* Need to take css reference ? */ | 354 | /* Need to take css reference ? */ |
79 | cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); | 355 | cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); |
80 | #endif | ||
81 | blkg->dev = dev; | 356 | blkg->dev = dev; |
82 | } | 357 | } |
83 | EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); | 358 | EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); |
@@ -101,17 +376,16 @@ int blkiocg_del_blkio_group(struct blkio_group *blkg) | |||
101 | 376 | ||
102 | rcu_read_lock(); | 377 | rcu_read_lock(); |
103 | css = css_lookup(&blkio_subsys, blkg->blkcg_id); | 378 | css = css_lookup(&blkio_subsys, blkg->blkcg_id); |
104 | if (!css) | 379 | if (css) { |
105 | goto out; | 380 | blkcg = container_of(css, struct blkio_cgroup, css); |
106 | 381 | spin_lock_irqsave(&blkcg->lock, flags); | |
107 | blkcg = container_of(css, struct blkio_cgroup, css); | 382 | if (!hlist_unhashed(&blkg->blkcg_node)) { |
108 | spin_lock_irqsave(&blkcg->lock, flags); | 383 | __blkiocg_del_blkio_group(blkg); |
109 | if (!hlist_unhashed(&blkg->blkcg_node)) { | 384 | ret = 0; |
110 | __blkiocg_del_blkio_group(blkg); | 385 | } |
111 | ret = 0; | 386 | spin_unlock_irqrestore(&blkcg->lock, flags); |
112 | } | 387 | } |
113 | spin_unlock_irqrestore(&blkcg->lock, flags); | 388 | |
114 | out: | ||
115 | rcu_read_unlock(); | 389 | rcu_read_unlock(); |
116 | return ret; | 390 | return ret; |
117 | } | 391 | } |
@@ -154,6 +428,7 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) | |||
154 | struct blkio_group *blkg; | 428 | struct blkio_group *blkg; |
155 | struct hlist_node *n; | 429 | struct hlist_node *n; |
156 | struct blkio_policy_type *blkiop; | 430 | struct blkio_policy_type *blkiop; |
431 | struct blkio_policy_node *pn; | ||
157 | 432 | ||
158 | if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) | 433 | if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) |
159 | return -EINVAL; | 434 | return -EINVAL; |
@@ -162,7 +437,13 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) | |||
162 | spin_lock(&blkio_list_lock); | 437 | spin_lock(&blkio_list_lock); |
163 | spin_lock_irq(&blkcg->lock); | 438 | spin_lock_irq(&blkcg->lock); |
164 | blkcg->weight = (unsigned int)val; | 439 | blkcg->weight = (unsigned int)val; |
440 | |||
165 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { | 441 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { |
442 | pn = blkio_policy_search_node(blkcg, blkg->dev); | ||
443 | |||
444 | if (pn) | ||
445 | continue; | ||
446 | |||
166 | list_for_each_entry(blkiop, &blkio_list, list) | 447 | list_for_each_entry(blkiop, &blkio_list, list) |
167 | blkiop->ops.blkio_update_group_weight_fn(blkg, | 448 | blkiop->ops.blkio_update_group_weight_fn(blkg, |
168 | blkcg->weight); | 449 | blkcg->weight); |
@@ -172,13 +453,154 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) | |||
172 | return 0; | 453 | return 0; |
173 | } | 454 | } |
174 | 455 | ||
175 | #define SHOW_FUNCTION_PER_GROUP(__VAR) \ | 456 | static int |
457 | blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) | ||
458 | { | ||
459 | struct blkio_cgroup *blkcg; | ||
460 | struct blkio_group *blkg; | ||
461 | struct blkio_group_stats *stats; | ||
462 | struct hlist_node *n; | ||
463 | uint64_t queued[BLKIO_STAT_TOTAL]; | ||
464 | int i; | ||
465 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
466 | bool idling, waiting, empty; | ||
467 | unsigned long long now = sched_clock(); | ||
468 | #endif | ||
469 | |||
470 | blkcg = cgroup_to_blkio_cgroup(cgroup); | ||
471 | spin_lock_irq(&blkcg->lock); | ||
472 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { | ||
473 | spin_lock(&blkg->stats_lock); | ||
474 | stats = &blkg->stats; | ||
475 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
476 | idling = blkio_blkg_idling(stats); | ||
477 | waiting = blkio_blkg_waiting(stats); | ||
478 | empty = blkio_blkg_empty(stats); | ||
479 | #endif | ||
480 | for (i = 0; i < BLKIO_STAT_TOTAL; i++) | ||
481 | queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i]; | ||
482 | memset(stats, 0, sizeof(struct blkio_group_stats)); | ||
483 | for (i = 0; i < BLKIO_STAT_TOTAL; i++) | ||
484 | stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i]; | ||
485 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
486 | if (idling) { | ||
487 | blkio_mark_blkg_idling(stats); | ||
488 | stats->start_idle_time = now; | ||
489 | } | ||
490 | if (waiting) { | ||
491 | blkio_mark_blkg_waiting(stats); | ||
492 | stats->start_group_wait_time = now; | ||
493 | } | ||
494 | if (empty) { | ||
495 | blkio_mark_blkg_empty(stats); | ||
496 | stats->start_empty_time = now; | ||
497 | } | ||
498 | #endif | ||
499 | spin_unlock(&blkg->stats_lock); | ||
500 | } | ||
501 | spin_unlock_irq(&blkcg->lock); | ||
502 | return 0; | ||
503 | } | ||
504 | |||
505 | static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str, | ||
506 | int chars_left, bool diskname_only) | ||
507 | { | ||
508 | snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev)); | ||
509 | chars_left -= strlen(str); | ||
510 | if (chars_left <= 0) { | ||
511 | printk(KERN_WARNING | ||
512 | "Possibly incorrect cgroup stat display format"); | ||
513 | return; | ||
514 | } | ||
515 | if (diskname_only) | ||
516 | return; | ||
517 | switch (type) { | ||
518 | case BLKIO_STAT_READ: | ||
519 | strlcat(str, " Read", chars_left); | ||
520 | break; | ||
521 | case BLKIO_STAT_WRITE: | ||
522 | strlcat(str, " Write", chars_left); | ||
523 | break; | ||
524 | case BLKIO_STAT_SYNC: | ||
525 | strlcat(str, " Sync", chars_left); | ||
526 | break; | ||
527 | case BLKIO_STAT_ASYNC: | ||
528 | strlcat(str, " Async", chars_left); | ||
529 | break; | ||
530 | case BLKIO_STAT_TOTAL: | ||
531 | strlcat(str, " Total", chars_left); | ||
532 | break; | ||
533 | default: | ||
534 | strlcat(str, " Invalid", chars_left); | ||
535 | } | ||
536 | } | ||
537 | |||
538 | static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val, | ||
539 | struct cgroup_map_cb *cb, dev_t dev) | ||
540 | { | ||
541 | blkio_get_key_name(0, dev, str, chars_left, true); | ||
542 | cb->fill(cb, str, val); | ||
543 | return val; | ||
544 | } | ||
545 | |||
546 | /* This should be called with blkg->stats_lock held */ | ||
547 | static uint64_t blkio_get_stat(struct blkio_group *blkg, | ||
548 | struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) | ||
549 | { | ||
550 | uint64_t disk_total; | ||
551 | char key_str[MAX_KEY_LEN]; | ||
552 | enum stat_sub_type sub_type; | ||
553 | |||
554 | if (type == BLKIO_STAT_TIME) | ||
555 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
556 | blkg->stats.time, cb, dev); | ||
557 | if (type == BLKIO_STAT_SECTORS) | ||
558 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
559 | blkg->stats.sectors, cb, dev); | ||
560 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
561 | if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { | ||
562 | uint64_t sum = blkg->stats.avg_queue_size_sum; | ||
563 | uint64_t samples = blkg->stats.avg_queue_size_samples; | ||
564 | if (samples) | ||
565 | do_div(sum, samples); | ||
566 | else | ||
567 | sum = 0; | ||
568 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev); | ||
569 | } | ||
570 | if (type == BLKIO_STAT_GROUP_WAIT_TIME) | ||
571 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
572 | blkg->stats.group_wait_time, cb, dev); | ||
573 | if (type == BLKIO_STAT_IDLE_TIME) | ||
574 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
575 | blkg->stats.idle_time, cb, dev); | ||
576 | if (type == BLKIO_STAT_EMPTY_TIME) | ||
577 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
578 | blkg->stats.empty_time, cb, dev); | ||
579 | if (type == BLKIO_STAT_DEQUEUE) | ||
580 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
581 | blkg->stats.dequeue, cb, dev); | ||
582 | #endif | ||
583 | |||
584 | for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; | ||
585 | sub_type++) { | ||
586 | blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); | ||
587 | cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]); | ||
588 | } | ||
589 | disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] + | ||
590 | blkg->stats.stat_arr[type][BLKIO_STAT_WRITE]; | ||
591 | blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); | ||
592 | cb->fill(cb, key_str, disk_total); | ||
593 | return disk_total; | ||
594 | } | ||
595 | |||
596 | #define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total) \ | ||
176 | static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ | 597 | static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ |
177 | struct cftype *cftype, struct seq_file *m) \ | 598 | struct cftype *cftype, struct cgroup_map_cb *cb) \ |
178 | { \ | 599 | { \ |
179 | struct blkio_cgroup *blkcg; \ | 600 | struct blkio_cgroup *blkcg; \ |
180 | struct blkio_group *blkg; \ | 601 | struct blkio_group *blkg; \ |
181 | struct hlist_node *n; \ | 602 | struct hlist_node *n; \ |
603 | uint64_t cgroup_total = 0; \ | ||
182 | \ | 604 | \ |
183 | if (!cgroup_lock_live_group(cgroup)) \ | 605 | if (!cgroup_lock_live_group(cgroup)) \ |
184 | return -ENODEV; \ | 606 | return -ENODEV; \ |
@@ -186,50 +608,293 @@ static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ | |||
186 | blkcg = cgroup_to_blkio_cgroup(cgroup); \ | 608 | blkcg = cgroup_to_blkio_cgroup(cgroup); \ |
187 | rcu_read_lock(); \ | 609 | rcu_read_lock(); \ |
188 | hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ | 610 | hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ |
189 | if (blkg->dev) \ | 611 | if (blkg->dev) { \ |
190 | seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \ | 612 | spin_lock_irq(&blkg->stats_lock); \ |
191 | MINOR(blkg->dev), blkg->__VAR); \ | 613 | cgroup_total += blkio_get_stat(blkg, cb, \ |
614 | blkg->dev, type); \ | ||
615 | spin_unlock_irq(&blkg->stats_lock); \ | ||
616 | } \ | ||
192 | } \ | 617 | } \ |
618 | if (show_total) \ | ||
619 | cb->fill(cb, "Total", cgroup_total); \ | ||
193 | rcu_read_unlock(); \ | 620 | rcu_read_unlock(); \ |
194 | cgroup_unlock(); \ | 621 | cgroup_unlock(); \ |
195 | return 0; \ | 622 | return 0; \ |
196 | } | 623 | } |
197 | 624 | ||
198 | SHOW_FUNCTION_PER_GROUP(time); | 625 | SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0); |
199 | SHOW_FUNCTION_PER_GROUP(sectors); | 626 | SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0); |
627 | SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1); | ||
628 | SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1); | ||
629 | SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1); | ||
630 | SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1); | ||
631 | SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1); | ||
632 | SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1); | ||
200 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 633 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
201 | SHOW_FUNCTION_PER_GROUP(dequeue); | 634 | SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0); |
635 | SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0); | ||
636 | SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0); | ||
637 | SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0); | ||
638 | SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0); | ||
202 | #endif | 639 | #endif |
203 | #undef SHOW_FUNCTION_PER_GROUP | 640 | #undef SHOW_FUNCTION_PER_GROUP |
204 | 641 | ||
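For readability, a rough sketch of what one instantiation of the macro above,
SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1), expands to:

static int blkiocg_io_serviced_read(struct cgroup *cgroup,
		struct cftype *cftype, struct cgroup_map_cb *cb)
{
	struct blkio_cgroup *blkcg;
	struct blkio_group *blkg;
	struct hlist_node *n;
	uint64_t cgroup_total = 0;

	if (!cgroup_lock_live_group(cgroup))
		return -ENODEV;

	blkcg = cgroup_to_blkio_cgroup(cgroup);
	rcu_read_lock();
	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
		if (blkg->dev) {
			spin_lock_irq(&blkg->stats_lock);
			cgroup_total += blkio_get_stat(blkg, cb, blkg->dev,
							BLKIO_STAT_SERVICED);
			spin_unlock_irq(&blkg->stats_lock);
		}
	}
	/* show_total is 1 for io_serviced, so emit the cgroup-wide sum too */
	cb->fill(cb, "Total", cgroup_total);
	rcu_read_unlock();
	cgroup_unlock();
	return 0;
}
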
205 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 642 | static int blkio_check_dev_num(dev_t dev) |
206 | void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg, | ||
207 | unsigned long dequeue) | ||
208 | { | 643 | { |
209 | blkg->dequeue += dequeue; | 644 | int part = 0; |
645 | struct gendisk *disk; | ||
646 | |||
647 | disk = get_gendisk(dev, &part); | ||
648 | if (!disk || part) | ||
649 | return -ENODEV; | ||
650 | |||
651 | return 0; | ||
652 | } | ||
653 | |||
654 | static int blkio_policy_parse_and_set(char *buf, | ||
655 | struct blkio_policy_node *newpn) | ||
656 | { | ||
657 | char *s[4], *p, *major_s = NULL, *minor_s = NULL; | ||
658 | int ret; | ||
659 | unsigned long major, minor, temp; | ||
660 | int i = 0; | ||
661 | dev_t dev; | ||
662 | |||
663 | memset(s, 0, sizeof(s)); | ||
664 | |||
665 | while ((p = strsep(&buf, " ")) != NULL) { | ||
666 | if (!*p) | ||
667 | continue; | ||
668 | |||
669 | s[i++] = p; | ||
670 | |||
671 | /* Prevent input of too many fields */ |
672 | if (i == 3) | ||
673 | break; | ||
674 | } | ||
675 | |||
676 | if (i != 2) | ||
677 | return -EINVAL; | ||
678 | |||
679 | p = strsep(&s[0], ":"); | ||
680 | if (p != NULL) | ||
681 | major_s = p; | ||
682 | else | ||
683 | return -EINVAL; | ||
684 | |||
685 | minor_s = s[0]; | ||
686 | if (!minor_s) | ||
687 | return -EINVAL; | ||
688 | |||
689 | ret = strict_strtoul(major_s, 10, &major); | ||
690 | if (ret) | ||
691 | return -EINVAL; | ||
692 | |||
693 | ret = strict_strtoul(minor_s, 10, &minor); | ||
694 | if (ret) | ||
695 | return -EINVAL; | ||
696 | |||
697 | dev = MKDEV(major, minor); | ||
698 | |||
699 | ret = blkio_check_dev_num(dev); | ||
700 | if (ret) | ||
701 | return ret; | ||
702 | |||
703 | newpn->dev = dev; | ||
704 | |||
705 | if (s[1] == NULL) | ||
706 | return -EINVAL; | ||
707 | |||
708 | ret = strict_strtoul(s[1], 10, &temp); | ||
709 | if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || | ||
710 | temp > BLKIO_WEIGHT_MAX) | ||
711 | return -EINVAL; | ||
712 | |||
713 | newpn->weight = temp; | ||
714 | |||
715 | return 0; | ||
716 | } | ||
717 | |||
718 | unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, | ||
719 | dev_t dev) | ||
720 | { | ||
721 | struct blkio_policy_node *pn; | ||
722 | |||
723 | pn = blkio_policy_search_node(blkcg, dev); | ||
724 | if (pn) | ||
725 | return pn->weight; | ||
726 | else | ||
727 | return blkcg->weight; | ||
728 | } | ||
729 | EXPORT_SYMBOL_GPL(blkcg_get_weight); | ||
730 | |||
731 | |||
732 | static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, | ||
733 | const char *buffer) | ||
734 | { | ||
735 | int ret = 0; | ||
736 | char *buf; | ||
737 | struct blkio_policy_node *newpn, *pn; | ||
738 | struct blkio_cgroup *blkcg; | ||
739 | struct blkio_group *blkg; | ||
740 | int keep_newpn = 0; | ||
741 | struct hlist_node *n; | ||
742 | struct blkio_policy_type *blkiop; | ||
743 | |||
744 | buf = kstrdup(buffer, GFP_KERNEL); | ||
745 | if (!buf) | ||
746 | return -ENOMEM; | ||
747 | |||
748 | newpn = kzalloc(sizeof(*newpn), GFP_KERNEL); | ||
749 | if (!newpn) { | ||
750 | ret = -ENOMEM; | ||
751 | goto free_buf; | ||
752 | } | ||
753 | |||
754 | ret = blkio_policy_parse_and_set(buf, newpn); | ||
755 | if (ret) | ||
756 | goto free_newpn; | ||
757 | |||
758 | blkcg = cgroup_to_blkio_cgroup(cgrp); | ||
759 | |||
760 | spin_lock_irq(&blkcg->lock); | ||
761 | |||
762 | pn = blkio_policy_search_node(blkcg, newpn->dev); | ||
763 | if (!pn) { | ||
764 | if (newpn->weight != 0) { | ||
765 | blkio_policy_insert_node(blkcg, newpn); | ||
766 | keep_newpn = 1; | ||
767 | } | ||
768 | spin_unlock_irq(&blkcg->lock); | ||
769 | goto update_io_group; | ||
770 | } | ||
771 | |||
772 | if (newpn->weight == 0) { | ||
773 | /* weight == 0 means deleting a specific weight */ |
774 | blkio_policy_delete_node(pn); | ||
775 | spin_unlock_irq(&blkcg->lock); | ||
776 | goto update_io_group; | ||
777 | } | ||
778 | spin_unlock_irq(&blkcg->lock); | ||
779 | |||
780 | pn->weight = newpn->weight; | ||
781 | |||
782 | update_io_group: | ||
783 | /* update weight for each cfqg */ | ||
784 | spin_lock(&blkio_list_lock); | ||
785 | spin_lock_irq(&blkcg->lock); | ||
786 | |||
787 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { | ||
788 | if (newpn->dev == blkg->dev) { | ||
789 | list_for_each_entry(blkiop, &blkio_list, list) | ||
790 | blkiop->ops.blkio_update_group_weight_fn(blkg, | ||
791 | newpn->weight ? | ||
792 | newpn->weight : | ||
793 | blkcg->weight); | ||
794 | } | ||
795 | } | ||
796 | |||
797 | spin_unlock_irq(&blkcg->lock); | ||
798 | spin_unlock(&blkio_list_lock); | ||
799 | |||
800 | free_newpn: | ||
801 | if (!keep_newpn) | ||
802 | kfree(newpn); | ||
803 | free_buf: | ||
804 | kfree(buf); | ||
805 | return ret; | ||
806 | } | ||
807 | |||
808 | static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, | ||
809 | struct seq_file *m) | ||
810 | { | ||
811 | struct blkio_cgroup *blkcg; | ||
812 | struct blkio_policy_node *pn; | ||
813 | |||
814 | seq_printf(m, "dev\tweight\n"); | ||
815 | |||
816 | blkcg = cgroup_to_blkio_cgroup(cgrp); | ||
817 | if (!list_empty(&blkcg->policy_list)) { | ||
818 | spin_lock_irq(&blkcg->lock); | ||
819 | list_for_each_entry(pn, &blkcg->policy_list, node) { | ||
820 | seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), | ||
821 | MINOR(pn->dev), pn->weight); | ||
822 | } | ||
823 | spin_unlock_irq(&blkcg->lock); | ||
824 | } | ||
825 | |||
826 | return 0; | ||
210 | } | 827 | } |
211 | EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats); | ||
212 | #endif | ||
213 | 828 | ||
214 | struct cftype blkio_files[] = { | 829 | struct cftype blkio_files[] = { |
215 | { | 830 | { |
831 | .name = "weight_device", | ||
832 | .read_seq_string = blkiocg_weight_device_read, | ||
833 | .write_string = blkiocg_weight_device_write, | ||
834 | .max_write_len = 256, | ||
835 | }, | ||
836 | { | ||
216 | .name = "weight", | 837 | .name = "weight", |
217 | .read_u64 = blkiocg_weight_read, | 838 | .read_u64 = blkiocg_weight_read, |
218 | .write_u64 = blkiocg_weight_write, | 839 | .write_u64 = blkiocg_weight_write, |
219 | }, | 840 | }, |
220 | { | 841 | { |
221 | .name = "time", | 842 | .name = "time", |
222 | .read_seq_string = blkiocg_time_read, | 843 | .read_map = blkiocg_time_read, |
223 | }, | 844 | }, |
224 | { | 845 | { |
225 | .name = "sectors", | 846 | .name = "sectors", |
226 | .read_seq_string = blkiocg_sectors_read, | 847 | .read_map = blkiocg_sectors_read, |
848 | }, | ||
849 | { | ||
850 | .name = "io_service_bytes", | ||
851 | .read_map = blkiocg_io_service_bytes_read, | ||
852 | }, | ||
853 | { | ||
854 | .name = "io_serviced", | ||
855 | .read_map = blkiocg_io_serviced_read, | ||
856 | }, | ||
857 | { | ||
858 | .name = "io_service_time", | ||
859 | .read_map = blkiocg_io_service_time_read, | ||
860 | }, | ||
861 | { | ||
862 | .name = "io_wait_time", | ||
863 | .read_map = blkiocg_io_wait_time_read, | ||
864 | }, | ||
865 | { | ||
866 | .name = "io_merged", | ||
867 | .read_map = blkiocg_io_merged_read, | ||
868 | }, | ||
869 | { | ||
870 | .name = "io_queued", | ||
871 | .read_map = blkiocg_io_queued_read, | ||
872 | }, | ||
873 | { | ||
874 | .name = "reset_stats", | ||
875 | .write_u64 = blkiocg_reset_stats, | ||
227 | }, | 876 | }, |
228 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 877 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
229 | { | 878 | { |
879 | .name = "avg_queue_size", | ||
880 | .read_map = blkiocg_avg_queue_size_read, | ||
881 | }, | ||
882 | { | ||
883 | .name = "group_wait_time", | ||
884 | .read_map = blkiocg_group_wait_time_read, | ||
885 | }, | ||
886 | { | ||
887 | .name = "idle_time", | ||
888 | .read_map = blkiocg_idle_time_read, | ||
889 | }, | ||
890 | { | ||
891 | .name = "empty_time", | ||
892 | .read_map = blkiocg_empty_time_read, | ||
893 | }, | ||
894 | { | ||
230 | .name = "dequeue", | 895 | .name = "dequeue", |
231 | .read_seq_string = blkiocg_dequeue_read, | 896 | .read_map = blkiocg_dequeue_read, |
232 | }, | 897 | }, |
233 | #endif | 898 | #endif |
234 | }; | 899 | }; |
235 | 900 | ||
@@ -246,37 +911,42 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) | |||
246 | struct blkio_group *blkg; | 911 | struct blkio_group *blkg; |
247 | void *key; | 912 | void *key; |
248 | struct blkio_policy_type *blkiop; | 913 | struct blkio_policy_type *blkiop; |
914 | struct blkio_policy_node *pn, *pntmp; | ||
249 | 915 | ||
250 | rcu_read_lock(); | 916 | rcu_read_lock(); |
251 | remove_entry: | 917 | do { |
252 | spin_lock_irqsave(&blkcg->lock, flags); | 918 | spin_lock_irqsave(&blkcg->lock, flags); |
919 | |||
920 | if (hlist_empty(&blkcg->blkg_list)) { | ||
921 | spin_unlock_irqrestore(&blkcg->lock, flags); | ||
922 | break; | ||
923 | } | ||
924 | |||
925 | blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group, | ||
926 | blkcg_node); | ||
927 | key = rcu_dereference(blkg->key); | ||
928 | __blkiocg_del_blkio_group(blkg); | ||
253 | 929 | ||
254 | if (hlist_empty(&blkcg->blkg_list)) { | ||
255 | spin_unlock_irqrestore(&blkcg->lock, flags); | 930 | spin_unlock_irqrestore(&blkcg->lock, flags); |
256 | goto done; | ||
257 | } | ||
258 | 931 | ||
259 | blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group, | 932 | /* |
260 | blkcg_node); | 933 | * This blkio_group is being unlinked as the associated cgroup is |
261 | key = rcu_dereference(blkg->key); | 934 | * going away. Let all the IO controlling policies know about |
262 | __blkiocg_del_blkio_group(blkg); | 935 | * this event. Currently this is a static call to one io |
936 | * controlling policy. Once we have more policies in place, we | ||
937 | * need some dynamic registration of callback function. | ||
938 | */ | ||
939 | spin_lock(&blkio_list_lock); | ||
940 | list_for_each_entry(blkiop, &blkio_list, list) | ||
941 | blkiop->ops.blkio_unlink_group_fn(key, blkg); | ||
942 | spin_unlock(&blkio_list_lock); | ||
943 | } while (1); | ||
263 | 944 | ||
264 | spin_unlock_irqrestore(&blkcg->lock, flags); | 945 | list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) { |
946 | blkio_policy_delete_node(pn); | ||
947 | kfree(pn); | ||
948 | } | ||
265 | 949 | ||
266 | /* | ||
267 | * This blkio_group is being unlinked as associated cgroup is going | ||
268 | * away. Let all the IO controlling policies know about this event. | ||
269 | * | ||
270 | * Currently this is static call to one io controlling policy. Once | ||
271 | * we have more policies in place, we need some dynamic registration | ||
272 | * of callback function. | ||
273 | */ | ||
274 | spin_lock(&blkio_list_lock); | ||
275 | list_for_each_entry(blkiop, &blkio_list, list) | ||
276 | blkiop->ops.blkio_unlink_group_fn(key, blkg); | ||
277 | spin_unlock(&blkio_list_lock); | ||
278 | goto remove_entry; | ||
279 | done: | ||
280 | free_css_id(&blkio_subsys, &blkcg->css); | 950 | free_css_id(&blkio_subsys, &blkcg->css); |
281 | rcu_read_unlock(); | 951 | rcu_read_unlock(); |
282 | if (blkcg != &blkio_root_cgroup) | 952 | if (blkcg != &blkio_root_cgroup) |
@@ -307,6 +977,7 @@ done: | |||
307 | spin_lock_init(&blkcg->lock); | 977 | spin_lock_init(&blkcg->lock); |
308 | INIT_HLIST_HEAD(&blkcg->blkg_list); | 978 | INIT_HLIST_HEAD(&blkcg->blkg_list); |
309 | 979 | ||
980 | INIT_LIST_HEAD(&blkcg->policy_list); | ||
310 | return &blkcg->css; | 981 | return &blkcg->css; |
311 | } | 982 | } |
312 | 983 | ||
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 8ccc20464dae..2b866ec1dcea 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h | |||
@@ -23,11 +23,84 @@ extern struct cgroup_subsys blkio_subsys; | |||
23 | #define blkio_subsys_id blkio_subsys.subsys_id | 23 | #define blkio_subsys_id blkio_subsys.subsys_id |
24 | #endif | 24 | #endif |
25 | 25 | ||
26 | enum stat_type { | ||
27 | /* Total time spent (in ns) between request dispatch to the driver and | ||
28 | * request completion for IOs done by this cgroup. This may not be | ||
29 | * accurate when NCQ is turned on. */ | ||
30 | BLKIO_STAT_SERVICE_TIME = 0, | ||
31 | /* Total bytes transferred */ | ||
32 | BLKIO_STAT_SERVICE_BYTES, | ||
33 | /* Total IOs serviced, post merge */ | ||
34 | BLKIO_STAT_SERVICED, | ||
35 | /* Total time spent waiting in scheduler queue in ns */ | ||
36 | BLKIO_STAT_WAIT_TIME, | ||
37 | /* Number of IOs merged */ | ||
38 | BLKIO_STAT_MERGED, | ||
39 | /* Number of IOs queued up */ | ||
40 | BLKIO_STAT_QUEUED, | ||
41 | /* All the single valued stats go below this */ | ||
42 | BLKIO_STAT_TIME, | ||
43 | BLKIO_STAT_SECTORS, | ||
44 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
45 | BLKIO_STAT_AVG_QUEUE_SIZE, | ||
46 | BLKIO_STAT_IDLE_TIME, | ||
47 | BLKIO_STAT_EMPTY_TIME, | ||
48 | BLKIO_STAT_GROUP_WAIT_TIME, | ||
49 | BLKIO_STAT_DEQUEUE | ||
50 | #endif | ||
51 | }; | ||
52 | |||
53 | enum stat_sub_type { | ||
54 | BLKIO_STAT_READ = 0, | ||
55 | BLKIO_STAT_WRITE, | ||
56 | BLKIO_STAT_SYNC, | ||
57 | BLKIO_STAT_ASYNC, | ||
58 | BLKIO_STAT_TOTAL | ||
59 | }; | ||
60 | |||
61 | /* blkg state flags */ | ||
62 | enum blkg_state_flags { | ||
63 | BLKG_waiting = 0, | ||
64 | BLKG_idling, | ||
65 | BLKG_empty, | ||
66 | }; | ||
67 | |||
26 | struct blkio_cgroup { | 68 | struct blkio_cgroup { |
27 | struct cgroup_subsys_state css; | 69 | struct cgroup_subsys_state css; |
28 | unsigned int weight; | 70 | unsigned int weight; |
29 | spinlock_t lock; | 71 | spinlock_t lock; |
30 | struct hlist_head blkg_list; | 72 | struct hlist_head blkg_list; |
73 | struct list_head policy_list; /* list of blkio_policy_node */ | ||
74 | }; | ||
75 | |||
76 | struct blkio_group_stats { | ||
77 | /* total disk time and nr sectors dispatched by this group */ | ||
78 | uint64_t time; | ||
79 | uint64_t sectors; | ||
80 | uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; | ||
81 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
82 | /* Sum of number of IOs queued across all samples */ | ||
83 | uint64_t avg_queue_size_sum; | ||
84 | /* Count of samples taken for average */ | ||
85 | uint64_t avg_queue_size_samples; | ||
86 | /* How many times this group has been removed from service tree */ | ||
87 | unsigned long dequeue; | ||
88 | |||
89 | /* Total time spent waiting for it to be assigned a timeslice. */ | ||
90 | uint64_t group_wait_time; | ||
91 | uint64_t start_group_wait_time; | ||
92 | |||
93 | /* Time spent idling for this blkio_group */ | ||
94 | uint64_t idle_time; | ||
95 | uint64_t start_idle_time; | ||
96 | /* | ||
97 | * Total time when we have requests queued and do not contain the | ||
98 | * current active queue. | ||
99 | */ | ||
100 | uint64_t empty_time; | ||
101 | uint64_t start_empty_time; | ||
102 | uint16_t flags; | ||
103 | #endif | ||
31 | }; | 104 | }; |
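
The stat_arr above is indexed first by one of the per-queue stat types (values up to BLKIO_STAT_QUEUED, per the array bound) and then by a stat_sub_type column. A minimal sketch of how a single sample might be bucketed follows; blkio_add_stat is an assumed helper name for illustration only, and the stats_lock serialization is omitted:

/*
 * Illustrative sketch only: bucket one sample into stat_arr[] by
 * direction and sync-ness.  blkio_add_stat() is a hypothetical name,
 * not something this header declares; locking is omitted.
 */
static void blkio_add_stat(struct blkio_group_stats *stats,
			   enum stat_type type, uint64_t add,
			   bool direction, bool sync)
{
	uint64_t *stat = stats->stat_arr[type];

	if (direction)
		stat[BLKIO_STAT_WRITE] += add;
	else
		stat[BLKIO_STAT_READ] += add;
	if (sync)
		stat[BLKIO_STAT_SYNC] += add;
	else
		stat[BLKIO_STAT_ASYNC] += add;
	stat[BLKIO_STAT_TOTAL] += add;
}
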
32 | 105 | ||
33 | struct blkio_group { | 106 | struct blkio_group { |
@@ -35,20 +108,25 @@ struct blkio_group { | |||
35 | void *key; | 108 | void *key; |
36 | struct hlist_node blkcg_node; | 109 | struct hlist_node blkcg_node; |
37 | unsigned short blkcg_id; | 110 | unsigned short blkcg_id; |
38 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
39 | /* Store cgroup path */ | 111 | /* Store cgroup path */ |
40 | char path[128]; | 112 | char path[128]; |
41 | /* How many times this group has been removed from service tree */ | ||
42 | unsigned long dequeue; | ||
43 | #endif | ||
44 | /* The device MKDEV(major, minor), this group has been created for */ | 113 | /* The device MKDEV(major, minor), this group has been created for */ |
45 | dev_t dev; | 114 | dev_t dev; |
46 | 115 | ||
47 | /* total disk time and nr sectors dispatched by this group */ | 116 | /* Need to serialize the stats in the case of reset/update */ |
48 | unsigned long time; | 117 | spinlock_t stats_lock; |
49 | unsigned long sectors; | 118 | struct blkio_group_stats stats; |
50 | }; | 119 | }; |
51 | 120 | ||
121 | struct blkio_policy_node { | ||
122 | struct list_head node; | ||
123 | dev_t dev; | ||
124 | unsigned int weight; | ||
125 | }; | ||
126 | |||
127 | extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, | ||
128 | dev_t dev); | ||
129 | |||
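
blkcg_get_weight() itself is implemented in blk-cgroup.c; conceptually it resolves a device against the per-cgroup rule list added above, roughly as in this sketch (illustrative only, locking of blkcg->lock omitted, function name is hypothetical):

/*
 * Illustrative sketch of the lookup blkcg_get_weight() performs:
 * prefer a matching per-device blkio_policy_node, otherwise fall back
 * to the cgroup-wide weight.
 */
static unsigned int example_get_weight(struct blkio_cgroup *blkcg, dev_t dev)
{
	struct blkio_policy_node *pn;

	list_for_each_entry(pn, &blkcg->policy_list, node) {
		if (pn->dev == dev)
			return pn->weight;
	}
	return blkcg->weight;
}
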
52 | typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); | 130 | typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); |
53 | typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, | 131 | typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, |
54 | unsigned int weight); | 132 | unsigned int weight); |
@@ -67,6 +145,11 @@ struct blkio_policy_type { | |||
67 | extern void blkio_policy_register(struct blkio_policy_type *); | 145 | extern void blkio_policy_register(struct blkio_policy_type *); |
68 | extern void blkio_policy_unregister(struct blkio_policy_type *); | 146 | extern void blkio_policy_unregister(struct blkio_policy_type *); |
69 | 147 | ||
148 | static inline char *blkg_path(struct blkio_group *blkg) | ||
149 | { | ||
150 | return blkg->path; | ||
151 | } | ||
152 | |||
70 | #else | 153 | #else |
71 | 154 | ||
72 | struct blkio_group { | 155 | struct blkio_group { |
@@ -78,6 +161,8 @@ struct blkio_policy_type { | |||
78 | static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { } | 161 | static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { } |
79 | static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } | 162 | static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } |
80 | 163 | ||
164 | static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } | ||
165 | |||
81 | #endif | 166 | #endif |
82 | 167 | ||
83 | #define BLKIO_WEIGHT_MIN 100 | 168 | #define BLKIO_WEIGHT_MIN 100 |
@@ -85,16 +170,42 @@ static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } | |||
85 | #define BLKIO_WEIGHT_DEFAULT 500 | 170 | #define BLKIO_WEIGHT_DEFAULT 500 |
86 | 171 | ||
87 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 172 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
88 | static inline char *blkg_path(struct blkio_group *blkg) | 173 | void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg); |
89 | { | 174 | void blkiocg_update_dequeue_stats(struct blkio_group *blkg, |
90 | return blkg->path; | ||
91 | } | ||
92 | void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg, | ||
93 | unsigned long dequeue); | 175 | unsigned long dequeue); |
176 | void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg); | ||
177 | void blkiocg_update_idle_time_stats(struct blkio_group *blkg); | ||
178 | void blkiocg_set_start_empty_time(struct blkio_group *blkg); | ||
179 | |||
180 | #define BLKG_FLAG_FNS(name) \ | ||
181 | static inline void blkio_mark_blkg_##name( \ | ||
182 | struct blkio_group_stats *stats) \ | ||
183 | { \ | ||
184 | stats->flags |= (1 << BLKG_##name); \ | ||
185 | } \ | ||
186 | static inline void blkio_clear_blkg_##name( \ | ||
187 | struct blkio_group_stats *stats) \ | ||
188 | { \ | ||
189 | stats->flags &= ~(1 << BLKG_##name); \ | ||
190 | } \ | ||
191 | static inline int blkio_blkg_##name(struct blkio_group_stats *stats) \ | ||
192 | { \ | ||
193 | return (stats->flags & (1 << BLKG_##name)) != 0; \ | ||
194 | } \ | ||
195 | |||
196 | BLKG_FLAG_FNS(waiting) | ||
197 | BLKG_FLAG_FNS(idling) | ||
198 | BLKG_FLAG_FNS(empty) | ||
199 | #undef BLKG_FLAG_FNS | ||
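
For readability, the expansion of BLKG_FLAG_FNS(waiting) above works out to the following (written out by hand; the idling and empty flags expand the same way):

/* Hand-expanded form of BLKG_FLAG_FNS(waiting), for illustration only. */
static inline void blkio_mark_blkg_waiting(struct blkio_group_stats *stats)
{
	stats->flags |= (1 << BLKG_waiting);
}
static inline void blkio_clear_blkg_waiting(struct blkio_group_stats *stats)
{
	stats->flags &= ~(1 << BLKG_waiting);
}
static inline int blkio_blkg_waiting(struct blkio_group_stats *stats)
{
	return (stats->flags & (1 << BLKG_waiting)) != 0;
}
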
94 | #else | 200 | #else |
95 | static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } | 201 | static inline void blkiocg_update_avg_queue_size_stats( |
96 | static inline void blkiocg_update_blkio_group_dequeue_stats( | 202 | struct blkio_group *blkg) {} |
97 | struct blkio_group *blkg, unsigned long dequeue) {} | 203 | static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg, |
204 | unsigned long dequeue) {} | ||
205 | static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) | ||
206 | {} | ||
207 | static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {} | ||
208 | static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {} | ||
98 | #endif | 209 | #endif |
99 | 210 | ||
100 | #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) | 211 | #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) |
@@ -105,26 +216,43 @@ extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | |||
105 | extern int blkiocg_del_blkio_group(struct blkio_group *blkg); | 216 | extern int blkiocg_del_blkio_group(struct blkio_group *blkg); |
106 | extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, | 217 | extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, |
107 | void *key); | 218 | void *key); |
108 | void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, | 219 | void blkiocg_update_timeslice_used(struct blkio_group *blkg, |
109 | unsigned long time, unsigned long sectors); | 220 | unsigned long time); |
221 | void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, | ||
222 | bool direction, bool sync); | ||
223 | void blkiocg_update_completion_stats(struct blkio_group *blkg, | ||
224 | uint64_t start_time, uint64_t io_start_time, bool direction, bool sync); | ||
225 | void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, | ||
226 | bool sync); | ||
227 | void blkiocg_update_io_add_stats(struct blkio_group *blkg, | ||
228 | struct blkio_group *curr_blkg, bool direction, bool sync); | ||
229 | void blkiocg_update_io_remove_stats(struct blkio_group *blkg, | ||
230 | bool direction, bool sync); | ||
110 | #else | 231 | #else |
111 | struct cgroup; | 232 | struct cgroup; |
112 | static inline struct blkio_cgroup * | 233 | static inline struct blkio_cgroup * |
113 | cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } | 234 | cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } |
114 | 235 | ||
115 | static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | 236 | static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, |
116 | struct blkio_group *blkg, void *key, dev_t dev) | 237 | struct blkio_group *blkg, void *key, dev_t dev) {} |
117 | { | ||
118 | } | ||
119 | 238 | ||
120 | static inline int | 239 | static inline int |
121 | blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } | 240 | blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } |
122 | 241 | ||
123 | static inline struct blkio_group * | 242 | static inline struct blkio_group * |
124 | blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } | 243 | blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } |
125 | static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, | 244 | static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg, |
126 | unsigned long time, unsigned long sectors) | 245 | unsigned long time) {} |
127 | { | 246 | static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg, |
128 | } | 247 | uint64_t bytes, bool direction, bool sync) {} |
248 | static inline void blkiocg_update_completion_stats(struct blkio_group *blkg, | ||
249 | uint64_t start_time, uint64_t io_start_time, bool direction, | ||
250 | bool sync) {} | ||
251 | static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg, | ||
252 | bool direction, bool sync) {} | ||
253 | static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg, | ||
254 | struct blkio_group *curr_blkg, bool direction, bool sync) {} | ||
255 | static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg, | ||
256 | bool direction, bool sync) {} | ||
129 | #endif | 257 | #endif |
130 | #endif /* _BLK_CGROUP_H */ | 258 | #endif /* _BLK_CGROUP_H */ |
diff --git a/block/blk-core.c b/block/blk-core.c index 9fe174dc74d1..3bc5579d6f54 100644 --- a/block/blk-core.c +++ b/block/blk-core.c | |||
@@ -127,6 +127,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) | |||
127 | rq->tag = -1; | 127 | rq->tag = -1; |
128 | rq->ref_count = 1; | 128 | rq->ref_count = 1; |
129 | rq->start_time = jiffies; | 129 | rq->start_time = jiffies; |
130 | set_start_time_ns(rq); | ||
130 | } | 131 | } |
131 | EXPORT_SYMBOL(blk_rq_init); | 132 | EXPORT_SYMBOL(blk_rq_init); |
132 | 133 | ||
@@ -450,6 +451,7 @@ void blk_cleanup_queue(struct request_queue *q) | |||
450 | */ | 451 | */ |
451 | blk_sync_queue(q); | 452 | blk_sync_queue(q); |
452 | 453 | ||
454 | del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); | ||
453 | mutex_lock(&q->sysfs_lock); | 455 | mutex_lock(&q->sysfs_lock); |
454 | queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); | 456 | queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); |
455 | mutex_unlock(&q->sysfs_lock); | 457 | mutex_unlock(&q->sysfs_lock); |
@@ -510,6 +512,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) | |||
510 | return NULL; | 512 | return NULL; |
511 | } | 513 | } |
512 | 514 | ||
515 | setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, | ||
516 | laptop_mode_timer_fn, (unsigned long) q); | ||
513 | init_timer(&q->unplug_timer); | 517 | init_timer(&q->unplug_timer); |
514 | setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); | 518 | setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); |
515 | INIT_LIST_HEAD(&q->timeout_list); | 519 | INIT_LIST_HEAD(&q->timeout_list); |
@@ -568,6 +572,22 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) | |||
568 | { | 572 | { |
569 | struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id); | 573 | struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id); |
570 | 574 | ||
575 | return blk_init_allocated_queue_node(q, rfn, lock, node_id); | ||
576 | } | ||
577 | EXPORT_SYMBOL(blk_init_queue_node); | ||
578 | |||
579 | struct request_queue * | ||
580 | blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, | ||
581 | spinlock_t *lock) | ||
582 | { | ||
583 | return blk_init_allocated_queue_node(q, rfn, lock, -1); | ||
584 | } | ||
585 | EXPORT_SYMBOL(blk_init_allocated_queue); | ||
586 | |||
587 | struct request_queue * | ||
588 | blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, | ||
589 | spinlock_t *lock, int node_id) | ||
590 | { | ||
571 | if (!q) | 591 | if (!q) |
572 | return NULL; | 592 | return NULL; |
573 | 593 | ||
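
The split into blk_init_allocated_queue{,_node}() lets a driver allocate its queue early and attach the request function later. A minimal sketch of the intended calling pattern (names and setup steps are illustrative; on failure the helper itself puts the queue reference and returns NULL, as the hunk below shows):

/*
 * Illustrative calling pattern: allocate the queue up front, do
 * driver-specific setup, then attach the request_fn via the new helper.
 */
static struct request_queue *example_create_queue(request_fn_proc *fn,
						  spinlock_t *lock)
{
	struct request_queue *q = blk_alloc_queue(GFP_KERNEL);

	if (!q)
		return NULL;

	/* ... driver-specific initialisation of q could go here ... */

	return blk_init_allocated_queue(q, fn, lock);
}
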
@@ -601,7 +621,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) | |||
601 | blk_put_queue(q); | 621 | blk_put_queue(q); |
602 | return NULL; | 622 | return NULL; |
603 | } | 623 | } |
604 | EXPORT_SYMBOL(blk_init_queue_node); | 624 | EXPORT_SYMBOL(blk_init_allocated_queue_node); |
605 | 625 | ||
606 | int blk_get_queue(struct request_queue *q) | 626 | int blk_get_queue(struct request_queue *q) |
607 | { | 627 | { |
@@ -1198,6 +1218,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) | |||
1198 | if (!blk_rq_cpu_valid(req)) | 1218 | if (!blk_rq_cpu_valid(req)) |
1199 | req->cpu = bio->bi_comp_cpu; | 1219 | req->cpu = bio->bi_comp_cpu; |
1200 | drive_stat_acct(req, 0); | 1220 | drive_stat_acct(req, 0); |
1221 | elv_bio_merged(q, req, bio); | ||
1201 | if (!attempt_back_merge(q, req)) | 1222 | if (!attempt_back_merge(q, req)) |
1202 | elv_merged_request(q, req, el_ret); | 1223 | elv_merged_request(q, req, el_ret); |
1203 | goto out; | 1224 | goto out; |
@@ -1231,6 +1252,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) | |||
1231 | if (!blk_rq_cpu_valid(req)) | 1252 | if (!blk_rq_cpu_valid(req)) |
1232 | req->cpu = bio->bi_comp_cpu; | 1253 | req->cpu = bio->bi_comp_cpu; |
1233 | drive_stat_acct(req, 0); | 1254 | drive_stat_acct(req, 0); |
1255 | elv_bio_merged(q, req, bio); | ||
1234 | if (!attempt_front_merge(q, req)) | 1256 | if (!attempt_front_merge(q, req)) |
1235 | elv_merged_request(q, req, el_ret); | 1257 | elv_merged_request(q, req, el_ret); |
1236 | goto out; | 1258 | goto out; |
@@ -1855,8 +1877,10 @@ void blk_dequeue_request(struct request *rq) | |||
1855 | * and to it is freed is accounted as io that is in progress at | 1877 | * and to it is freed is accounted as io that is in progress at |
1856 | * the driver side. | 1878 | * the driver side. |
1857 | */ | 1879 | */ |
1858 | if (blk_account_rq(rq)) | 1880 | if (blk_account_rq(rq)) { |
1859 | q->in_flight[rq_is_sync(rq)]++; | 1881 | q->in_flight[rq_is_sync(rq)]++; |
1882 | set_io_start_time_ns(rq); | ||
1883 | } | ||
1860 | } | 1884 | } |
1861 | 1885 | ||
1862 | /** | 1886 | /** |
@@ -2098,7 +2122,7 @@ static void blk_finish_request(struct request *req, int error) | |||
2098 | BUG_ON(blk_queued_rq(req)); | 2122 | BUG_ON(blk_queued_rq(req)); |
2099 | 2123 | ||
2100 | if (unlikely(laptop_mode) && blk_fs_request(req)) | 2124 | if (unlikely(laptop_mode) && blk_fs_request(req)) |
2101 | laptop_io_completion(); | 2125 | laptop_io_completion(&req->q->backing_dev_info); |
2102 | 2126 | ||
2103 | blk_delete_timer(req); | 2127 | blk_delete_timer(req); |
2104 | 2128 | ||
@@ -2517,4 +2541,3 @@ int __init blk_dev_init(void) | |||
2517 | 2541 | ||
2518 | return 0; | 2542 | return 0; |
2519 | } | 2543 | } |
2520 | |||
diff --git a/block/blk-lib.c b/block/blk-lib.c new file mode 100644 index 000000000000..d0216b9f22d4 --- /dev/null +++ b/block/blk-lib.c | |||
@@ -0,0 +1,233 @@ | |||
1 | /* | ||
2 | * Functions related to generic helper functions | ||
3 | */ | ||
4 | #include <linux/kernel.h> | ||
5 | #include <linux/module.h> | ||
6 | #include <linux/bio.h> | ||
7 | #include <linux/blkdev.h> | ||
8 | #include <linux/scatterlist.h> | ||
9 | |||
10 | #include "blk.h" | ||
11 | |||
12 | static void blkdev_discard_end_io(struct bio *bio, int err) | ||
13 | { | ||
14 | if (err) { | ||
15 | if (err == -EOPNOTSUPP) | ||
16 | set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); | ||
17 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
18 | } | ||
19 | |||
20 | if (bio->bi_private) | ||
21 | complete(bio->bi_private); | ||
22 | __free_page(bio_page(bio)); | ||
23 | |||
24 | bio_put(bio); | ||
25 | } | ||
26 | |||
27 | /** | ||
28 | * blkdev_issue_discard - queue a discard | ||
29 | * @bdev: blockdev to issue discard for | ||
30 | * @sector: start sector | ||
31 | * @nr_sects: number of sectors to discard | ||
32 | * @gfp_mask: memory allocation flags (for bio_alloc) | ||
33 | * @flags: BLKDEV_IFL_* flags to control behaviour | ||
34 | * | ||
35 | * Description: | ||
36 | * Issue a discard request for the sectors in question. | ||
37 | */ | ||
38 | int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | ||
39 | sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) | ||
40 | { | ||
41 | DECLARE_COMPLETION_ONSTACK(wait); | ||
42 | struct request_queue *q = bdev_get_queue(bdev); | ||
43 | int type = flags & BLKDEV_IFL_BARRIER ? | ||
44 | DISCARD_BARRIER : DISCARD_NOBARRIER; | ||
45 | struct bio *bio; | ||
46 | struct page *page; | ||
47 | int ret = 0; | ||
48 | |||
49 | if (!q) | ||
50 | return -ENXIO; | ||
51 | |||
52 | if (!blk_queue_discard(q)) | ||
53 | return -EOPNOTSUPP; | ||
54 | |||
55 | while (nr_sects && !ret) { | ||
56 | unsigned int sector_size = q->limits.logical_block_size; | ||
57 | unsigned int max_discard_sectors = | ||
58 | min(q->limits.max_discard_sectors, UINT_MAX >> 9); | ||
59 | |||
60 | bio = bio_alloc(gfp_mask, 1); | ||
61 | if (!bio) | ||
62 | goto out; | ||
63 | bio->bi_sector = sector; | ||
64 | bio->bi_end_io = blkdev_discard_end_io; | ||
65 | bio->bi_bdev = bdev; | ||
66 | if (flags & BLKDEV_IFL_WAIT) | ||
67 | bio->bi_private = &wait; | ||
68 | |||
69 | /* | ||
70 | * Add a zeroed one-sector payload as that's what | ||
71 | * our current implementations need. If we ever need | ||
72 | * more, the interface will need revisiting. | ||
73 | */ | ||
74 | page = alloc_page(gfp_mask | __GFP_ZERO); | ||
75 | if (!page) | ||
76 | goto out_free_bio; | ||
77 | if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size) | ||
78 | goto out_free_page; | ||
79 | |||
80 | /* | ||
81 | * And override the bio size - the way discard works we | ||
82 | * touch many more blocks on disk than the actual payload | ||
83 | * length. | ||
84 | */ | ||
85 | if (nr_sects > max_discard_sectors) { | ||
86 | bio->bi_size = max_discard_sectors << 9; | ||
87 | nr_sects -= max_discard_sectors; | ||
88 | sector += max_discard_sectors; | ||
89 | } else { | ||
90 | bio->bi_size = nr_sects << 9; | ||
91 | nr_sects = 0; | ||
92 | } | ||
93 | |||
94 | bio_get(bio); | ||
95 | submit_bio(type, bio); | ||
96 | |||
97 | if (flags & BLKDEV_IFL_WAIT) | ||
98 | wait_for_completion(&wait); | ||
99 | |||
100 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) | ||
101 | ret = -EOPNOTSUPP; | ||
102 | else if (!bio_flagged(bio, BIO_UPTODATE)) | ||
103 | ret = -EIO; | ||
104 | bio_put(bio); | ||
105 | } | ||
106 | return ret; | ||
107 | out_free_page: | ||
108 | __free_page(page); | ||
109 | out_free_bio: | ||
110 | bio_put(bio); | ||
111 | out: | ||
112 | return -ENOMEM; | ||
113 | } | ||
114 | EXPORT_SYMBOL(blkdev_issue_discard); | ||
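
A caller wanting a synchronous discard passes BLKDEV_IFL_WAIT, as the updated ioctl path later in this patch does. A small usage sketch with illustrative values:

/* Sketch with illustrative values: synchronously discard 2048 sectors
 * starting at sector 0, honouring the return codes documented above. */
static int example_discard_range(struct block_device *bdev)
{
	int ret;

	ret = blkdev_issue_discard(bdev, 0, 2048, GFP_KERNEL,
				   BLKDEV_IFL_WAIT);
	if (ret == -EOPNOTSUPP)
		ret = 0;	/* treat "no discard support" as non-fatal */
	return ret;
}
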
115 | |||
116 | struct bio_batch | ||
117 | { | ||
118 | atomic_t done; | ||
119 | unsigned long flags; | ||
120 | struct completion *wait; | ||
121 | bio_end_io_t *end_io; | ||
122 | }; | ||
123 | |||
124 | static void bio_batch_end_io(struct bio *bio, int err) | ||
125 | { | ||
126 | struct bio_batch *bb = bio->bi_private; | ||
127 | |||
128 | if (err) { | ||
129 | if (err == -EOPNOTSUPP) | ||
130 | set_bit(BIO_EOPNOTSUPP, &bb->flags); | ||
131 | else | ||
132 | clear_bit(BIO_UPTODATE, &bb->flags); | ||
133 | } | ||
134 | if (bb) { | ||
135 | if (bb->end_io) | ||
136 | bb->end_io(bio, err); | ||
137 | atomic_inc(&bb->done); | ||
138 | complete(bb->wait); | ||
139 | } | ||
140 | bio_put(bio); | ||
141 | } | ||
142 | |||
143 | /** | ||
144 | * blkdev_issue_zeroout - generate a number of zero-filled write bios | ||
145 | * @bdev: blockdev to issue | ||
146 | * @sector: start sector | ||
147 | * @nr_sects: number of sectors to write | ||
148 | * @gfp_mask: memory allocation flags (for bio_alloc) | ||
149 | * @flags: BLKDEV_IFL_* flags to control behaviour | ||
150 | * | ||
151 | * Description: | ||
152 | * Generate and issue a number of bios with zero-filled pages. | ||
153 | * Send a barrier at the beginning and at the end if requested. This guarantees | ||
154 | * correct request ordering. An empty barrier lets us avoid a post-queue flush. | ||
155 | */ | ||
156 | |||
157 | int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, | ||
158 | sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) | ||
159 | { | ||
160 | int ret = 0; | ||
161 | struct bio *bio; | ||
162 | struct bio_batch bb; | ||
163 | unsigned int sz, issued = 0; | ||
164 | DECLARE_COMPLETION_ONSTACK(wait); | ||
165 | |||
166 | atomic_set(&bb.done, 0); | ||
167 | bb.flags = 1 << BIO_UPTODATE; | ||
168 | bb.wait = &wait; | ||
169 | bb.end_io = NULL; | ||
170 | |||
171 | if (flags & BLKDEV_IFL_BARRIER) { | ||
172 | /* issue async barrier before the data */ | ||
173 | ret = blkdev_issue_flush(bdev, gfp_mask, NULL, 0); | ||
174 | if (ret) | ||
175 | return ret; | ||
176 | } | ||
177 | submit: | ||
178 | while (nr_sects != 0) { | ||
179 | bio = bio_alloc(gfp_mask, | ||
180 | min(nr_sects, (sector_t)BIO_MAX_PAGES)); | ||
181 | if (!bio) | ||
182 | break; | ||
183 | |||
184 | bio->bi_sector = sector; | ||
185 | bio->bi_bdev = bdev; | ||
186 | bio->bi_end_io = bio_batch_end_io; | ||
187 | if (flags & BLKDEV_IFL_WAIT) | ||
188 | bio->bi_private = &bb; | ||
189 | |||
190 | while (nr_sects != 0) { | ||
191 | sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); | ||
192 | if (sz == 0) | ||
193 | /* bio has maximum size possible */ | ||
194 | break; | ||
195 | ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); | ||
196 | nr_sects -= ret >> 9; | ||
197 | sector += ret >> 9; | ||
198 | if (ret < (sz << 9)) | ||
199 | break; | ||
200 | } | ||
201 | issued++; | ||
202 | submit_bio(WRITE, bio); | ||
203 | } | ||
204 | /* | ||
205 | * When all data bios are in flight, send the final barrier if requested. | ||
206 | */ | ||
207 | if (nr_sects == 0 && flags & BLKDEV_IFL_BARRIER) | ||
208 | ret = blkdev_issue_flush(bdev, gfp_mask, NULL, | ||
209 | flags & BLKDEV_IFL_WAIT); | ||
210 | |||
211 | |||
212 | if (flags & BLKDEV_IFL_WAIT) | ||
213 | /* Wait for bios in-flight */ | ||
214 | while ( issued != atomic_read(&bb.done)) | ||
215 | wait_for_completion(&wait); | ||
216 | |||
217 | if (!test_bit(BIO_UPTODATE, &bb.flags)) | ||
218 | /* One of bios in the batch was completed with error.*/ | ||
219 | ret = -EIO; | ||
220 | |||
221 | if (ret) | ||
222 | goto out; | ||
223 | |||
224 | if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) { | ||
225 | ret = -EOPNOTSUPP; | ||
226 | goto out; | ||
227 | } | ||
228 | if (nr_sects != 0) | ||
229 | goto submit; | ||
230 | out: | ||
231 | return ret; | ||
232 | } | ||
233 | EXPORT_SYMBOL(blkdev_issue_zeroout); | ||
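
As with discard, the new zeroout helper takes BLKDEV_IFL_* flags; a hedged usage sketch with illustrative values:

/* Sketch: zero eight sectors synchronously, with barriers issued
 * before and after the data as described above. */
static int example_zero_range(struct block_device *bdev, sector_t sector)
{
	return blkdev_issue_zeroout(bdev, sector, 8, GFP_KERNEL,
				    BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
}
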
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5f127cfb2e92..ed897b5ef315 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c | |||
@@ -55,6 +55,7 @@ static const int cfq_hist_divisor = 4; | |||
55 | #define RQ_CIC(rq) \ | 55 | #define RQ_CIC(rq) \ |
56 | ((struct cfq_io_context *) (rq)->elevator_private) | 56 | ((struct cfq_io_context *) (rq)->elevator_private) |
57 | #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) | 57 | #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) |
58 | #define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3) | ||
58 | 59 | ||
59 | static struct kmem_cache *cfq_pool; | 60 | static struct kmem_cache *cfq_pool; |
60 | static struct kmem_cache *cfq_ioc_pool; | 61 | static struct kmem_cache *cfq_ioc_pool; |
@@ -143,8 +144,6 @@ struct cfq_queue { | |||
143 | struct cfq_queue *new_cfqq; | 144 | struct cfq_queue *new_cfqq; |
144 | struct cfq_group *cfqg; | 145 | struct cfq_group *cfqg; |
145 | struct cfq_group *orig_cfqg; | 146 | struct cfq_group *orig_cfqg; |
146 | /* Sectors dispatched in current dispatch round */ | ||
147 | unsigned long nr_sectors; | ||
148 | }; | 147 | }; |
149 | 148 | ||
150 | /* | 149 | /* |
@@ -346,7 +345,7 @@ CFQ_CFQQ_FNS(deep); | |||
346 | CFQ_CFQQ_FNS(wait_busy); | 345 | CFQ_CFQQ_FNS(wait_busy); |
347 | #undef CFQ_CFQQ_FNS | 346 | #undef CFQ_CFQQ_FNS |
348 | 347 | ||
349 | #ifdef CONFIG_DEBUG_CFQ_IOSCHED | 348 | #ifdef CONFIG_CFQ_GROUP_IOSCHED |
350 | #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ | 349 | #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ |
351 | blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ | 350 | blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ |
352 | cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ | 351 | cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ |
@@ -858,7 +857,7 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
858 | if (!RB_EMPTY_NODE(&cfqg->rb_node)) | 857 | if (!RB_EMPTY_NODE(&cfqg->rb_node)) |
859 | cfq_rb_erase(&cfqg->rb_node, st); | 858 | cfq_rb_erase(&cfqg->rb_node, st); |
860 | cfqg->saved_workload_slice = 0; | 859 | cfqg->saved_workload_slice = 0; |
861 | blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1); | 860 | blkiocg_update_dequeue_stats(&cfqg->blkg, 1); |
862 | } | 861 | } |
863 | 862 | ||
864 | static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) | 863 | static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) |
@@ -884,8 +883,7 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) | |||
884 | slice_used = cfqq->allocated_slice; | 883 | slice_used = cfqq->allocated_slice; |
885 | } | 884 | } |
886 | 885 | ||
887 | cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used, | 886 | cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u", slice_used); |
888 | cfqq->nr_sectors); | ||
889 | return slice_used; | 887 | return slice_used; |
890 | } | 888 | } |
891 | 889 | ||
@@ -919,8 +917,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, | |||
919 | 917 | ||
920 | cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, | 918 | cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, |
921 | st->min_vdisktime); | 919 | st->min_vdisktime); |
922 | blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl, | 920 | blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); |
923 | cfqq->nr_sectors); | 921 | blkiocg_set_start_empty_time(&cfqg->blkg); |
924 | } | 922 | } |
925 | 923 | ||
926 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | 924 | #ifdef CONFIG_CFQ_GROUP_IOSCHED |
@@ -961,7 +959,6 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) | |||
961 | if (!cfqg) | 959 | if (!cfqg) |
962 | goto done; | 960 | goto done; |
963 | 961 | ||
964 | cfqg->weight = blkcg->weight; | ||
965 | for_each_cfqg_st(cfqg, i, j, st) | 962 | for_each_cfqg_st(cfqg, i, j, st) |
966 | *st = CFQ_RB_ROOT; | 963 | *st = CFQ_RB_ROOT; |
967 | RB_CLEAR_NODE(&cfqg->rb_node); | 964 | RB_CLEAR_NODE(&cfqg->rb_node); |
@@ -978,6 +975,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) | |||
978 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | 975 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); |
979 | blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, | 976 | blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, |
980 | MKDEV(major, minor)); | 977 | MKDEV(major, minor)); |
978 | cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); | ||
981 | 979 | ||
982 | /* Add group on cfqd list */ | 980 | /* Add group on cfqd list */ |
983 | hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); | 981 | hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); |
@@ -1004,6 +1002,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) | |||
1004 | return cfqg; | 1002 | return cfqg; |
1005 | } | 1003 | } |
1006 | 1004 | ||
1005 | static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) | ||
1006 | { | ||
1007 | atomic_inc(&cfqg->ref); | ||
1008 | return cfqg; | ||
1009 | } | ||
1010 | |||
1007 | static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) | 1011 | static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) |
1008 | { | 1012 | { |
1009 | /* Currently, all async queues are mapped to root group */ | 1013 | /* Currently, all async queues are mapped to root group */ |
@@ -1087,6 +1091,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) | |||
1087 | { | 1091 | { |
1088 | return &cfqd->root_group; | 1092 | return &cfqd->root_group; |
1089 | } | 1093 | } |
1094 | |||
1095 | static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) | ||
1096 | { | ||
1097 | return cfqg; | ||
1098 | } | ||
1099 | |||
1090 | static inline void | 1100 | static inline void |
1091 | cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { | 1101 | cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { |
1092 | cfqq->cfqg = cfqg; | 1102 | cfqq->cfqg = cfqg; |
@@ -1389,7 +1399,12 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) | |||
1389 | { | 1399 | { |
1390 | elv_rb_del(&cfqq->sort_list, rq); | 1400 | elv_rb_del(&cfqq->sort_list, rq); |
1391 | cfqq->queued[rq_is_sync(rq)]--; | 1401 | cfqq->queued[rq_is_sync(rq)]--; |
1402 | blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq), | ||
1403 | rq_is_sync(rq)); | ||
1392 | cfq_add_rq_rb(rq); | 1404 | cfq_add_rq_rb(rq); |
1405 | blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, | ||
1406 | &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq), | ||
1407 | rq_is_sync(rq)); | ||
1393 | } | 1408 | } |
1394 | 1409 | ||
1395 | static struct request * | 1410 | static struct request * |
@@ -1445,6 +1460,8 @@ static void cfq_remove_request(struct request *rq) | |||
1445 | cfq_del_rq_rb(rq); | 1460 | cfq_del_rq_rb(rq); |
1446 | 1461 | ||
1447 | cfqq->cfqd->rq_queued--; | 1462 | cfqq->cfqd->rq_queued--; |
1463 | blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq), | ||
1464 | rq_is_sync(rq)); | ||
1448 | if (rq_is_meta(rq)) { | 1465 | if (rq_is_meta(rq)) { |
1449 | WARN_ON(!cfqq->meta_pending); | 1466 | WARN_ON(!cfqq->meta_pending); |
1450 | cfqq->meta_pending--; | 1467 | cfqq->meta_pending--; |
@@ -1476,6 +1493,13 @@ static void cfq_merged_request(struct request_queue *q, struct request *req, | |||
1476 | } | 1493 | } |
1477 | } | 1494 | } |
1478 | 1495 | ||
1496 | static void cfq_bio_merged(struct request_queue *q, struct request *req, | ||
1497 | struct bio *bio) | ||
1498 | { | ||
1499 | blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, bio_data_dir(bio), | ||
1500 | cfq_bio_sync(bio)); | ||
1501 | } | ||
1502 | |||
1479 | static void | 1503 | static void |
1480 | cfq_merged_requests(struct request_queue *q, struct request *rq, | 1504 | cfq_merged_requests(struct request_queue *q, struct request *rq, |
1481 | struct request *next) | 1505 | struct request *next) |
@@ -1493,6 +1517,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, | |||
1493 | if (cfqq->next_rq == next) | 1517 | if (cfqq->next_rq == next) |
1494 | cfqq->next_rq = rq; | 1518 | cfqq->next_rq = rq; |
1495 | cfq_remove_request(next); | 1519 | cfq_remove_request(next); |
1520 | blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(next), | ||
1521 | rq_is_sync(next)); | ||
1496 | } | 1522 | } |
1497 | 1523 | ||
1498 | static int cfq_allow_merge(struct request_queue *q, struct request *rq, | 1524 | static int cfq_allow_merge(struct request_queue *q, struct request *rq, |
@@ -1520,18 +1546,24 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, | |||
1520 | return cfqq == RQ_CFQQ(rq); | 1546 | return cfqq == RQ_CFQQ(rq); |
1521 | } | 1547 | } |
1522 | 1548 | ||
1549 | static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
1550 | { | ||
1551 | del_timer(&cfqd->idle_slice_timer); | ||
1552 | blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg); | ||
1553 | } | ||
1554 | |||
1523 | static void __cfq_set_active_queue(struct cfq_data *cfqd, | 1555 | static void __cfq_set_active_queue(struct cfq_data *cfqd, |
1524 | struct cfq_queue *cfqq) | 1556 | struct cfq_queue *cfqq) |
1525 | { | 1557 | { |
1526 | if (cfqq) { | 1558 | if (cfqq) { |
1527 | cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", | 1559 | cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", |
1528 | cfqd->serving_prio, cfqd->serving_type); | 1560 | cfqd->serving_prio, cfqd->serving_type); |
1561 | blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg); | ||
1529 | cfqq->slice_start = 0; | 1562 | cfqq->slice_start = 0; |
1530 | cfqq->dispatch_start = jiffies; | 1563 | cfqq->dispatch_start = jiffies; |
1531 | cfqq->allocated_slice = 0; | 1564 | cfqq->allocated_slice = 0; |
1532 | cfqq->slice_end = 0; | 1565 | cfqq->slice_end = 0; |
1533 | cfqq->slice_dispatch = 0; | 1566 | cfqq->slice_dispatch = 0; |
1534 | cfqq->nr_sectors = 0; | ||
1535 | 1567 | ||
1536 | cfq_clear_cfqq_wait_request(cfqq); | 1568 | cfq_clear_cfqq_wait_request(cfqq); |
1537 | cfq_clear_cfqq_must_dispatch(cfqq); | 1569 | cfq_clear_cfqq_must_dispatch(cfqq); |
@@ -1539,7 +1571,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, | |||
1539 | cfq_clear_cfqq_fifo_expire(cfqq); | 1571 | cfq_clear_cfqq_fifo_expire(cfqq); |
1540 | cfq_mark_cfqq_slice_new(cfqq); | 1572 | cfq_mark_cfqq_slice_new(cfqq); |
1541 | 1573 | ||
1542 | del_timer(&cfqd->idle_slice_timer); | 1574 | cfq_del_timer(cfqd, cfqq); |
1543 | } | 1575 | } |
1544 | 1576 | ||
1545 | cfqd->active_queue = cfqq; | 1577 | cfqd->active_queue = cfqq; |
@@ -1555,7 +1587,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
1555 | cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); | 1587 | cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); |
1556 | 1588 | ||
1557 | if (cfq_cfqq_wait_request(cfqq)) | 1589 | if (cfq_cfqq_wait_request(cfqq)) |
1558 | del_timer(&cfqd->idle_slice_timer); | 1590 | cfq_del_timer(cfqd, cfqq); |
1559 | 1591 | ||
1560 | cfq_clear_cfqq_wait_request(cfqq); | 1592 | cfq_clear_cfqq_wait_request(cfqq); |
1561 | cfq_clear_cfqq_wait_busy(cfqq); | 1593 | cfq_clear_cfqq_wait_busy(cfqq); |
@@ -1857,6 +1889,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) | |||
1857 | sl = cfqd->cfq_slice_idle; | 1889 | sl = cfqd->cfq_slice_idle; |
1858 | 1890 | ||
1859 | mod_timer(&cfqd->idle_slice_timer, jiffies + sl); | 1891 | mod_timer(&cfqd->idle_slice_timer, jiffies + sl); |
1892 | blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg); | ||
1860 | cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); | 1893 | cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); |
1861 | } | 1894 | } |
1862 | 1895 | ||
@@ -1876,7 +1909,8 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) | |||
1876 | elv_dispatch_sort(q, rq); | 1909 | elv_dispatch_sort(q, rq); |
1877 | 1910 | ||
1878 | cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; | 1911 | cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; |
1879 | cfqq->nr_sectors += blk_rq_sectors(rq); | 1912 | blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq), |
1913 | rq_data_dir(rq), rq_is_sync(rq)); | ||
1880 | } | 1914 | } |
1881 | 1915 | ||
1882 | /* | 1916 | /* |
@@ -3185,11 +3219,14 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
3185 | if (cfq_cfqq_wait_request(cfqq)) { | 3219 | if (cfq_cfqq_wait_request(cfqq)) { |
3186 | if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || | 3220 | if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || |
3187 | cfqd->busy_queues > 1) { | 3221 | cfqd->busy_queues > 1) { |
3188 | del_timer(&cfqd->idle_slice_timer); | 3222 | cfq_del_timer(cfqd, cfqq); |
3189 | cfq_clear_cfqq_wait_request(cfqq); | 3223 | cfq_clear_cfqq_wait_request(cfqq); |
3190 | __blk_run_queue(cfqd->queue); | 3224 | __blk_run_queue(cfqd->queue); |
3191 | } else | 3225 | } else { |
3226 | blkiocg_update_idle_time_stats( | ||
3227 | &cfqq->cfqg->blkg); | ||
3192 | cfq_mark_cfqq_must_dispatch(cfqq); | 3228 | cfq_mark_cfqq_must_dispatch(cfqq); |
3229 | } | ||
3193 | } | 3230 | } |
3194 | } else if (cfq_should_preempt(cfqd, cfqq, rq)) { | 3231 | } else if (cfq_should_preempt(cfqd, cfqq, rq)) { |
3195 | /* | 3232 | /* |
@@ -3214,7 +3251,9 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) | |||
3214 | rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); | 3251 | rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); |
3215 | list_add_tail(&rq->queuelist, &cfqq->fifo); | 3252 | list_add_tail(&rq->queuelist, &cfqq->fifo); |
3216 | cfq_add_rq_rb(rq); | 3253 | cfq_add_rq_rb(rq); |
3217 | 3254 | blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, | |
3255 | &cfqd->serving_group->blkg, rq_data_dir(rq), | ||
3256 | rq_is_sync(rq)); | ||
3218 | cfq_rq_enqueued(cfqd, cfqq, rq); | 3257 | cfq_rq_enqueued(cfqd, cfqq, rq); |
3219 | } | 3258 | } |
3220 | 3259 | ||
@@ -3300,6 +3339,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) | |||
3300 | WARN_ON(!cfqq->dispatched); | 3339 | WARN_ON(!cfqq->dispatched); |
3301 | cfqd->rq_in_driver--; | 3340 | cfqd->rq_in_driver--; |
3302 | cfqq->dispatched--; | 3341 | cfqq->dispatched--; |
3342 | blkiocg_update_completion_stats(&cfqq->cfqg->blkg, rq_start_time_ns(rq), | ||
3343 | rq_io_start_time_ns(rq), rq_data_dir(rq), | ||
3344 | rq_is_sync(rq)); | ||
3303 | 3345 | ||
3304 | cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; | 3346 | cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; |
3305 | 3347 | ||
@@ -3440,6 +3482,10 @@ static void cfq_put_request(struct request *rq) | |||
3440 | rq->elevator_private = NULL; | 3482 | rq->elevator_private = NULL; |
3441 | rq->elevator_private2 = NULL; | 3483 | rq->elevator_private2 = NULL; |
3442 | 3484 | ||
3485 | /* Put down rq reference on cfqg */ | ||
3486 | cfq_put_cfqg(RQ_CFQG(rq)); | ||
3487 | rq->elevator_private3 = NULL; | ||
3488 | |||
3443 | cfq_put_queue(cfqq); | 3489 | cfq_put_queue(cfqq); |
3444 | } | 3490 | } |
3445 | } | 3491 | } |
@@ -3528,6 +3574,7 @@ new_queue: | |||
3528 | 3574 | ||
3529 | rq->elevator_private = cic; | 3575 | rq->elevator_private = cic; |
3530 | rq->elevator_private2 = cfqq; | 3576 | rq->elevator_private2 = cfqq; |
3577 | rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg); | ||
3531 | return 0; | 3578 | return 0; |
3532 | 3579 | ||
3533 | queue_fail: | 3580 | queue_fail: |
@@ -3743,7 +3790,6 @@ static void *cfq_init_queue(struct request_queue *q) | |||
3743 | * second, in order to have larger depth for async operations. | 3790 | * second, in order to have larger depth for async operations. |
3744 | */ | 3791 | */ |
3745 | cfqd->last_delayed_sync = jiffies - HZ; | 3792 | cfqd->last_delayed_sync = jiffies - HZ; |
3746 | INIT_RCU_HEAD(&cfqd->rcu); | ||
3747 | return cfqd; | 3793 | return cfqd; |
3748 | } | 3794 | } |
3749 | 3795 | ||
@@ -3872,6 +3918,7 @@ static struct elevator_type iosched_cfq = { | |||
3872 | .elevator_merged_fn = cfq_merged_request, | 3918 | .elevator_merged_fn = cfq_merged_request, |
3873 | .elevator_merge_req_fn = cfq_merged_requests, | 3919 | .elevator_merge_req_fn = cfq_merged_requests, |
3874 | .elevator_allow_merge_fn = cfq_allow_merge, | 3920 | .elevator_allow_merge_fn = cfq_allow_merge, |
3921 | .elevator_bio_merged_fn = cfq_bio_merged, | ||
3875 | .elevator_dispatch_fn = cfq_dispatch_requests, | 3922 | .elevator_dispatch_fn = cfq_dispatch_requests, |
3876 | .elevator_add_req_fn = cfq_insert_request, | 3923 | .elevator_add_req_fn = cfq_insert_request, |
3877 | .elevator_activate_req_fn = cfq_activate_request, | 3924 | .elevator_activate_req_fn = cfq_activate_request, |
diff --git a/block/elevator.c b/block/elevator.c index 76e3702d5381..6df2b5056b51 100644 --- a/block/elevator.c +++ b/block/elevator.c | |||
@@ -539,6 +539,15 @@ void elv_merge_requests(struct request_queue *q, struct request *rq, | |||
539 | q->last_merge = rq; | 539 | q->last_merge = rq; |
540 | } | 540 | } |
541 | 541 | ||
542 | void elv_bio_merged(struct request_queue *q, struct request *rq, | ||
543 | struct bio *bio) | ||
544 | { | ||
545 | struct elevator_queue *e = q->elevator; | ||
546 | |||
547 | if (e->ops->elevator_bio_merged_fn) | ||
548 | e->ops->elevator_bio_merged_fn(q, rq, bio); | ||
549 | } | ||
550 | |||
542 | void elv_requeue_request(struct request_queue *q, struct request *rq) | 551 | void elv_requeue_request(struct request_queue *q, struct request *rq) |
543 | { | 552 | { |
544 | /* | 553 | /* |
@@ -921,6 +930,7 @@ int elv_register_queue(struct request_queue *q) | |||
921 | } | 930 | } |
922 | return error; | 931 | return error; |
923 | } | 932 | } |
933 | EXPORT_SYMBOL(elv_register_queue); | ||
924 | 934 | ||
925 | static void __elv_unregister_queue(struct elevator_queue *e) | 935 | static void __elv_unregister_queue(struct elevator_queue *e) |
926 | { | 936 | { |
@@ -933,6 +943,7 @@ void elv_unregister_queue(struct request_queue *q) | |||
933 | if (q) | 943 | if (q) |
934 | __elv_unregister_queue(q->elevator); | 944 | __elv_unregister_queue(q->elevator); |
935 | } | 945 | } |
946 | EXPORT_SYMBOL(elv_unregister_queue); | ||
936 | 947 | ||
937 | void elv_register(struct elevator_type *e) | 948 | void elv_register(struct elevator_type *e) |
938 | { | 949 | { |
diff --git a/block/genhd.c b/block/genhd.c index d13ba76a169c..59a2db6fecef 100644 --- a/block/genhd.c +++ b/block/genhd.c | |||
@@ -596,6 +596,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) | |||
596 | 596 | ||
597 | return disk; | 597 | return disk; |
598 | } | 598 | } |
599 | EXPORT_SYMBOL(get_gendisk); | ||
599 | 600 | ||
600 | /** | 601 | /** |
601 | * bdget_disk - do bdget() by gendisk and partition number | 602 | * bdget_disk - do bdget() by gendisk and partition number |
@@ -987,7 +988,6 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) | |||
987 | if (!new_ptbl) | 988 | if (!new_ptbl) |
988 | return -ENOMEM; | 989 | return -ENOMEM; |
989 | 990 | ||
990 | INIT_RCU_HEAD(&new_ptbl->rcu_head); | ||
991 | new_ptbl->len = target; | 991 | new_ptbl->len = target; |
992 | 992 | ||
993 | for (i = 0; i < len; i++) | 993 | for (i = 0; i < len; i++) |
diff --git a/block/ioctl.c b/block/ioctl.c index 8905d2a2a717..e8eb679f2f9b 100644 --- a/block/ioctl.c +++ b/block/ioctl.c | |||
@@ -126,7 +126,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, | |||
126 | if (start + len > (bdev->bd_inode->i_size >> 9)) | 126 | if (start + len > (bdev->bd_inode->i_size >> 9)) |
127 | return -EINVAL; | 127 | return -EINVAL; |
128 | return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, | 128 | return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, |
129 | DISCARD_FL_WAIT); | 129 | BLKDEV_IFL_WAIT); |
130 | } | 130 | } |
131 | 131 | ||
132 | static int put_ushort(unsigned long arg, unsigned short val) | 132 | static int put_ushort(unsigned long arg, unsigned short val) |
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 77bfce52e9ca..de277689da61 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig | |||
@@ -76,6 +76,17 @@ config BLK_DEV_XD | |||
76 | 76 | ||
77 | It's pretty unlikely that you have one of these: say N. | 77 | It's pretty unlikely that you have one of these: say N. |
78 | 78 | ||
79 | config GDROM | ||
80 | tristate "SEGA Dreamcast GD-ROM drive" | ||
81 | depends on SH_DREAMCAST | ||
82 | help | ||
83 | A standard SEGA Dreamcast comes with a modified CD ROM drive called a | ||
84 | "GD-ROM" by SEGA to signify it is capable of reading special disks | ||
85 | with up to 1 GB of data. This drive will also read standard CD ROM | ||
86 | disks. Select this option to access any disks in your GD ROM drive. | ||
87 | Most users will want to say "Y" here. | ||
88 | You can also build this as a module which will be called gdrom. | ||
89 | |||
79 | config PARIDE | 90 | config PARIDE |
80 | tristate "Parallel port IDE device support" | 91 | tristate "Parallel port IDE device support" |
81 | depends on PARPORT_PC | 92 | depends on PARPORT_PC |
@@ -103,17 +114,6 @@ config PARIDE | |||
103 | "MicroSolutions backpack protocol", "DataStor Commuter protocol" | 114 | "MicroSolutions backpack protocol", "DataStor Commuter protocol" |
104 | etc.). | 115 | etc.). |
105 | 116 | ||
106 | config GDROM | ||
107 | tristate "SEGA Dreamcast GD-ROM drive" | ||
108 | depends on SH_DREAMCAST | ||
109 | help | ||
110 | A standard SEGA Dreamcast comes with a modified CD ROM drive called a | ||
111 | "GD-ROM" by SEGA to signify it is capable of reading special disks | ||
112 | with up to 1 GB of data. This drive will also read standard CD ROM | ||
113 | disks. Select this option to access any disks in your GD ROM drive. | ||
114 | Most users will want to say "Y" here. | ||
115 | You can also build this as a module which will be called gdrom. | ||
116 | |||
117 | source "drivers/block/paride/Kconfig" | 117 | source "drivers/block/paride/Kconfig" |
118 | 118 | ||
119 | config BLK_CPQ_DA | 119 | config BLK_CPQ_DA |
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 3390716898d5..e3f88d6e1412 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c | |||
@@ -84,6 +84,9 @@ struct drbd_bitmap { | |||
84 | #define BM_MD_IO_ERROR 1 | 84 | #define BM_MD_IO_ERROR 1 |
85 | #define BM_P_VMALLOCED 2 | 85 | #define BM_P_VMALLOCED 2 |
86 | 86 | ||
87 | static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, | ||
88 | unsigned long e, int val, const enum km_type km); | ||
89 | |||
87 | static int bm_is_locked(struct drbd_bitmap *b) | 90 | static int bm_is_locked(struct drbd_bitmap *b) |
88 | { | 91 | { |
89 | return test_bit(BM_LOCKED, &b->bm_flags); | 92 | return test_bit(BM_LOCKED, &b->bm_flags); |
@@ -441,7 +444,7 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len) | |||
441 | * In case this is actually a resize, we copy the old bitmap into the new one. | 444 | * In case this is actually a resize, we copy the old bitmap into the new one. |
442 | * Otherwise, the bitmap is initialized to all bits set. | 445 | * Otherwise, the bitmap is initialized to all bits set. |
443 | */ | 446 | */ |
444 | int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity) | 447 | int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits) |
445 | { | 448 | { |
446 | struct drbd_bitmap *b = mdev->bitmap; | 449 | struct drbd_bitmap *b = mdev->bitmap; |
447 | unsigned long bits, words, owords, obits, *p_addr, *bm; | 450 | unsigned long bits, words, owords, obits, *p_addr, *bm; |
@@ -516,7 +519,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity) | |||
516 | obits = b->bm_bits; | 519 | obits = b->bm_bits; |
517 | 520 | ||
518 | growing = bits > obits; | 521 | growing = bits > obits; |
519 | if (opages) | 522 | if (opages && growing && set_new_bits) |
520 | bm_set_surplus(b); | 523 | bm_set_surplus(b); |
521 | 524 | ||
522 | b->bm_pages = npages; | 525 | b->bm_pages = npages; |
@@ -526,8 +529,12 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity) | |||
526 | b->bm_dev_capacity = capacity; | 529 | b->bm_dev_capacity = capacity; |
527 | 530 | ||
528 | if (growing) { | 531 | if (growing) { |
529 | bm_memset(b, owords, 0xff, words-owords); | 532 | if (set_new_bits) { |
530 | b->bm_set += bits - obits; | 533 | bm_memset(b, owords, 0xff, words-owords); |
534 | b->bm_set += bits - obits; | ||
535 | } else | ||
536 | bm_memset(b, owords, 0x00, words-owords); | ||
537 | |||
531 | } | 538 | } |
532 | 539 | ||
533 | if (want < have) { | 540 | if (want < have) { |
@@ -773,7 +780,7 @@ static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int | |||
773 | /* nothing to do, on disk == in memory */ | 780 | /* nothing to do, on disk == in memory */ |
774 | # define bm_cpu_to_lel(x) ((void)0) | 781 | # define bm_cpu_to_lel(x) ((void)0) |
775 | # else | 782 | # else |
776 | void bm_cpu_to_lel(struct drbd_bitmap *b) | 783 | static void bm_cpu_to_lel(struct drbd_bitmap *b) |
777 | { | 784 | { |
778 | /* need to cpu_to_lel all the pages ... | 785 | /* need to cpu_to_lel all the pages ... |
779 | * this may be optimized by using | 786 | * this may be optimized by using |
@@ -1015,7 +1022,7 @@ unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_f | |||
1015 | * wants bitnr, not sector. | 1022 | * wants bitnr, not sector. |
1016 | * expected to be called for only a few bits (e - s about BITS_PER_LONG). | 1023 | * expected to be called for only a few bits (e - s about BITS_PER_LONG). |
1017 | * Must hold bitmap lock already. */ | 1024 | * Must hold bitmap lock already. */ |
1018 | int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, | 1025 | static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, |
1019 | unsigned long e, int val, const enum km_type km) | 1026 | unsigned long e, int val, const enum km_type km) |
1020 | { | 1027 | { |
1021 | struct drbd_bitmap *b = mdev->bitmap; | 1028 | struct drbd_bitmap *b = mdev->bitmap; |
@@ -1053,7 +1060,7 @@ int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, | |||
1053 | * for val != 0, we change 0 -> 1, return code positive | 1060 | * for val != 0, we change 0 -> 1, return code positive |
1054 | * for val == 0, we change 1 -> 0, return code negative | 1061 | * for val == 0, we change 1 -> 0, return code negative |
1055 | * wants bitnr, not sector */ | 1062 | * wants bitnr, not sector */ |
1056 | int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, | 1063 | static int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, |
1057 | const unsigned long e, int val) | 1064 | const unsigned long e, int val) |
1058 | { | 1065 | { |
1059 | unsigned long flags; | 1066 | unsigned long flags; |
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index e5e86a781820..e9654c8d5b62 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h | |||
@@ -132,6 +132,7 @@ enum { | |||
132 | DRBD_FAULT_DT_RA = 6, /* data read ahead */ | 132 | DRBD_FAULT_DT_RA = 6, /* data read ahead */ |
133 | DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */ | 133 | DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */ |
134 | DRBD_FAULT_AL_EE = 8, /* alloc ee */ | 134 | DRBD_FAULT_AL_EE = 8, /* alloc ee */ |
135 | DRBD_FAULT_RECEIVE = 9, /* Changes some bytes upon receiving a [rs]data block */ | ||
135 | 136 | ||
136 | DRBD_FAULT_MAX, | 137 | DRBD_FAULT_MAX, |
137 | }; | 138 | }; |
@@ -208,8 +209,11 @@ enum drbd_packets { | |||
208 | P_RS_IS_IN_SYNC = 0x22, /* meta socket */ | 209 | P_RS_IS_IN_SYNC = 0x22, /* meta socket */ |
209 | P_SYNC_PARAM89 = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */ | 210 | P_SYNC_PARAM89 = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */ |
210 | P_COMPRESSED_BITMAP = 0x24, /* compressed or otherwise encoded bitmap transfer */ | 211 | P_COMPRESSED_BITMAP = 0x24, /* compressed or otherwise encoded bitmap transfer */ |
212 | /* P_CKPT_FENCE_REQ = 0x25, * currently reserved for protocol D */ | ||
213 | /* P_CKPT_DISABLE_REQ = 0x26, * currently reserved for protocol D */ | ||
214 | P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */ | ||
211 | 215 | ||
212 | P_MAX_CMD = 0x25, | 216 | P_MAX_CMD = 0x28, |
213 | P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ | 217 | P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ |
214 | P_MAX_OPT_CMD = 0x101, | 218 | P_MAX_OPT_CMD = 0x101, |
215 | 219 | ||
@@ -264,6 +268,7 @@ static inline const char *cmdname(enum drbd_packets cmd) | |||
264 | [P_CSUM_RS_REQUEST] = "CsumRSRequest", | 268 | [P_CSUM_RS_REQUEST] = "CsumRSRequest", |
265 | [P_RS_IS_IN_SYNC] = "CsumRSIsInSync", | 269 | [P_RS_IS_IN_SYNC] = "CsumRSIsInSync", |
266 | [P_COMPRESSED_BITMAP] = "CBitmap", | 270 | [P_COMPRESSED_BITMAP] = "CBitmap", |
271 | [P_DELAY_PROBE] = "DelayProbe", | ||
267 | [P_MAX_CMD] = NULL, | 272 | [P_MAX_CMD] = NULL, |
268 | }; | 273 | }; |
269 | 274 | ||
@@ -481,7 +486,8 @@ struct p_sizes { | |||
481 | u64 u_size; /* user requested size */ | 486 | u64 u_size; /* user requested size */ |
482 | u64 c_size; /* current exported size */ | 487 | u64 c_size; /* current exported size */ |
483 | u32 max_segment_size; /* Maximal size of a BIO */ | 488 | u32 max_segment_size; /* Maximal size of a BIO */ |
484 | u32 queue_order_type; | 489 | u16 queue_order_type; /* not yet implemented in DRBD*/ |
490 | u16 dds_flags; /* use enum dds_flags here. */ | ||
485 | } __packed; | 491 | } __packed; |
486 | 492 | ||
487 | struct p_state { | 493 | struct p_state { |
@@ -538,6 +544,18 @@ struct p_compressed_bm { | |||
538 | u8 code[0]; | 544 | u8 code[0]; |
539 | } __packed; | 545 | } __packed; |
540 | 546 | ||
547 | struct p_delay_probe { | ||
548 | struct p_header head; | ||
549 | u32 seq_num; /* sequence number to match the two probe packets */ | ||
550 | u32 offset; /* usecs the probe got sent after the reference time point */ | ||
551 | } __packed; | ||
552 | |||
553 | struct delay_probe { | ||
554 | struct list_head list; | ||
555 | unsigned int seq_num; | ||
556 | struct timeval time; | ||
557 | }; | ||
558 | |||
541 | /* DCBP: Drbd Compressed Bitmap Packet ... */ | 559 | /* DCBP: Drbd Compressed Bitmap Packet ... */ |
542 | static inline enum drbd_bitmap_code | 560 | static inline enum drbd_bitmap_code |
543 | DCBP_get_code(struct p_compressed_bm *p) | 561 | DCBP_get_code(struct p_compressed_bm *p) |
@@ -722,22 +740,6 @@ enum epoch_event { | |||
722 | EV_CLEANUP = 32, /* used as flag */ | 740 | EV_CLEANUP = 32, /* used as flag */ |
723 | }; | 741 | }; |
724 | 742 | ||
725 | struct drbd_epoch_entry { | ||
726 | struct drbd_work w; | ||
727 | struct drbd_conf *mdev; | ||
728 | struct bio *private_bio; | ||
729 | struct hlist_node colision; | ||
730 | sector_t sector; | ||
731 | unsigned int size; | ||
732 | struct drbd_epoch *epoch; | ||
733 | |||
734 | /* up to here, the struct layout is identical to drbd_request; | ||
735 | * we might be able to use that to our advantage... */ | ||
736 | |||
737 | unsigned int flags; | ||
738 | u64 block_id; | ||
739 | }; | ||
740 | |||
741 | struct drbd_wq_barrier { | 743 | struct drbd_wq_barrier { |
742 | struct drbd_work w; | 744 | struct drbd_work w; |
743 | struct completion done; | 745 | struct completion done; |
@@ -748,17 +750,49 @@ struct digest_info { | |||
748 | void *digest; | 750 | void *digest; |
749 | }; | 751 | }; |
750 | 752 | ||
751 | /* ee flag bits */ | 753 | struct drbd_epoch_entry { |
754 | struct drbd_work w; | ||
755 | struct hlist_node colision; | ||
756 | struct drbd_epoch *epoch; | ||
757 | struct drbd_conf *mdev; | ||
758 | struct page *pages; | ||
759 | atomic_t pending_bios; | ||
760 | unsigned int size; | ||
761 | /* see comments on ee flag bits below */ | ||
762 | unsigned long flags; | ||
763 | sector_t sector; | ||
764 | u64 block_id; | ||
765 | }; | ||
766 | |||
767 | /* ee flag bits. | ||
768 | * While corresponding bios are in flight, the only modification will be | ||
769 | * set_bit WAS_ERROR, which has to be atomic. | ||
770 | * If no bios are in flight yet, or all have been completed, | ||
771 | * non-atomic modification to ee->flags is ok. | ||
772 | */ | ||
752 | enum { | 773 | enum { |
753 | __EE_CALL_AL_COMPLETE_IO, | 774 | __EE_CALL_AL_COMPLETE_IO, |
754 | __EE_CONFLICT_PENDING, | ||
755 | __EE_MAY_SET_IN_SYNC, | 775 | __EE_MAY_SET_IN_SYNC, |
776 | |||
777 | /* This epoch entry closes an epoch using a barrier. | ||
778 | * On successful completion, the epoch is released, | ||
779 | * and the P_BARRIER_ACK sent. */ | ||
756 | __EE_IS_BARRIER, | 780 | __EE_IS_BARRIER, |
781 | |||
782 | /* In case a barrier failed, | ||
783 | * we need to resubmit without the barrier flag. */ | ||
784 | __EE_RESUBMITTED, | ||
785 | |||
786 | /* we may have several bios per epoch entry. | ||
787 | * if any of those fail, we set this flag atomically | ||
788 | * from the endio callback */ | ||
789 | __EE_WAS_ERROR, | ||
757 | }; | 790 | }; |
758 | #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) | 791 | #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) |
759 | #define EE_CONFLICT_PENDING (1<<__EE_CONFLICT_PENDING) | ||
760 | #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) | 792 | #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) |
761 | #define EE_IS_BARRIER (1<<__EE_IS_BARRIER) | 793 | #define EE_IS_BARRIER (1<<__EE_IS_BARRIER) |
794 | #define EE_RESUBMITTED (1<<__EE_RESUBMITTED) | ||
795 | #define EE_WAS_ERROR (1<<__EE_WAS_ERROR) | ||
762 | 796 | ||
763 | /* global flag bits */ | 797 | /* global flag bits */ |
764 | enum { | 798 | enum { |
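The new comment block above the ee flag bits spells out a small concurrency contract: while bios of an epoch entry are still in flight, the only permitted modification is setting __EE_WAS_ERROR, and that one must be an atomic read-modify-write because several completions may race; once nothing is in flight, plain non-atomic updates to e->flags are fine. A userspace sketch of that split, with C11 atomics standing in for set_bit(); the EX_* names and struct entry are illustrative, not DRBD's:

    #include <stdatomic.h>
    #include <stdio.h>

    enum {
        __EX_MAY_SET_IN_SYNC,
        __EX_WAS_ERROR,          /* may be set concurrently from completion context */
    };
    #define EX_MAY_SET_IN_SYNC (1UL << __EX_MAY_SET_IN_SYNC)
    #define EX_WAS_ERROR       (1UL << __EX_WAS_ERROR)

    struct entry {
        atomic_ulong flags;      /* plays the role of e->flags */
    };

    /* Completion path: several bios may finish concurrently, so the error
     * bit is set with an atomic read-modify-write. */
    static void endio(struct entry *e, int error)
    {
        if (error)
            atomic_fetch_or(&e->flags, EX_WAS_ERROR);
    }

    /* Submission path: nothing is in flight yet, a plain update is enough. */
    static void prepare(struct entry *e)
    {
        unsigned long f = atomic_load_explicit(&e->flags, memory_order_relaxed);

        atomic_store_explicit(&e->flags, f | EX_MAY_SET_IN_SYNC, memory_order_relaxed);
    }

    int main(void)
    {
        struct entry e;

        atomic_init(&e.flags, 0);
        prepare(&e);
        endio(&e, -5);
        printf("flags = 0x%lx\n", (unsigned long)atomic_load(&e.flags));  /* 0x3 */
        return 0;
    }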
@@ -908,9 +942,12 @@ struct drbd_conf { | |||
908 | unsigned int ko_count; | 942 | unsigned int ko_count; |
909 | struct drbd_work resync_work, | 943 | struct drbd_work resync_work, |
910 | unplug_work, | 944 | unplug_work, |
911 | md_sync_work; | 945 | md_sync_work, |
946 | delay_probe_work, | ||
947 | uuid_work; | ||
912 | struct timer_list resync_timer; | 948 | struct timer_list resync_timer; |
913 | struct timer_list md_sync_timer; | 949 | struct timer_list md_sync_timer; |
950 | struct timer_list delay_probe_timer; | ||
914 | 951 | ||
915 | /* Used after attach while negotiating new disk state. */ | 952 | /* Used after attach while negotiating new disk state. */ |
916 | union drbd_state new_state_tmp; | 953 | union drbd_state new_state_tmp; |
@@ -1026,6 +1063,13 @@ struct drbd_conf { | |||
1026 | u64 ed_uuid; /* UUID of the exposed data */ | 1063 | u64 ed_uuid; /* UUID of the exposed data */ |
1027 | struct mutex state_mutex; | 1064 | struct mutex state_mutex; |
1028 | char congestion_reason; /* Why we where congested... */ | 1065 | char congestion_reason; /* Why we where congested... */ |
1066 | struct list_head delay_probes; /* protected by peer_seq_lock */ | ||
1067 | int data_delay; /* Delay of packets on the data-sock behind meta-sock */ | ||
1068 | unsigned int delay_seq; /* To generate sequence numbers of delay probes */ | ||
1069 | struct timeval dps_time; /* delay-probes-start-time */ | ||
1070 | unsigned int dp_volume_last; /* send_cnt of last delay probe */ | ||
1071 | int c_sync_rate; /* current resync rate after delay_probe magic */ | ||
1072 | atomic_t new_c_uuid; | ||
1029 | }; | 1073 | }; |
1030 | 1074 | ||
1031 | static inline struct drbd_conf *minor_to_mdev(unsigned int minor) | 1075 | static inline struct drbd_conf *minor_to_mdev(unsigned int minor) |
@@ -1081,6 +1125,11 @@ enum chg_state_flags { | |||
1081 | CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE, | 1125 | CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE, |
1082 | }; | 1126 | }; |
1083 | 1127 | ||
1128 | enum dds_flags { | ||
1129 | DDSF_FORCED = 1, | ||
1130 | DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */ | ||
1131 | }; | ||
1132 | |||
1084 | extern void drbd_init_set_defaults(struct drbd_conf *mdev); | 1133 | extern void drbd_init_set_defaults(struct drbd_conf *mdev); |
1085 | extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, | 1134 | extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, |
1086 | union drbd_state mask, union drbd_state val); | 1135 | union drbd_state mask, union drbd_state val); |
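Two pieces of this header change belong together: struct p_sizes splits the former u32 queue_order_type into a u16 plus the new u16 dds_flags (keeping the wire struct the same size), and enum dds_flags defines the two bits carried there. A small sketch of how such flags are built and byte-swapped for the wire, with htons() standing in for cpu_to_be16() and _EX-suffixed names marking what is illustrative rather than DRBD's:

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>    /* htons() as a userspace stand-in for cpu_to_be16() */

    enum dds_flags_ex {
        DDSF_FORCED_EX    = 1,
        DDSF_NO_RESYNC_EX = 2,   /* do not run a resync for the new space */
    };

    int main(void)
    {
        int resize_force = 1, no_resync = 1;

        /* Same shape as the expression in drbd_nl_resize(): OR the options. */
        uint16_t ddsf = (resize_force ? DDSF_FORCED_EX : 0) |
                        (no_resync    ? DDSF_NO_RESYNC_EX : 0);

        uint16_t wire = htons(ddsf);   /* both u16 halves travel big-endian */

        printf("host 0x%04x -> wire 0x%04x\n", ddsf, wire);
        return 0;
    }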
@@ -1113,7 +1162,7 @@ extern int drbd_send_protocol(struct drbd_conf *mdev); | |||
1113 | extern int drbd_send_uuids(struct drbd_conf *mdev); | 1162 | extern int drbd_send_uuids(struct drbd_conf *mdev); |
1114 | extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); | 1163 | extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); |
1115 | extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val); | 1164 | extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val); |
1116 | extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply); | 1165 | extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags); |
1117 | extern int _drbd_send_state(struct drbd_conf *mdev); | 1166 | extern int _drbd_send_state(struct drbd_conf *mdev); |
1118 | extern int drbd_send_state(struct drbd_conf *mdev); | 1167 | extern int drbd_send_state(struct drbd_conf *mdev); |
1119 | extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, | 1168 | extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, |
@@ -1311,7 +1360,7 @@ struct bm_extent { | |||
1311 | #define APP_R_HSIZE 15 | 1360 | #define APP_R_HSIZE 15 |
1312 | 1361 | ||
1313 | extern int drbd_bm_init(struct drbd_conf *mdev); | 1362 | extern int drbd_bm_init(struct drbd_conf *mdev); |
1314 | extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors); | 1363 | extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new_bits); |
1315 | extern void drbd_bm_cleanup(struct drbd_conf *mdev); | 1364 | extern void drbd_bm_cleanup(struct drbd_conf *mdev); |
1316 | extern void drbd_bm_set_all(struct drbd_conf *mdev); | 1365 | extern void drbd_bm_set_all(struct drbd_conf *mdev); |
1317 | extern void drbd_bm_clear_all(struct drbd_conf *mdev); | 1366 | extern void drbd_bm_clear_all(struct drbd_conf *mdev); |
@@ -1383,7 +1432,7 @@ extern void drbd_resume_io(struct drbd_conf *mdev); | |||
1383 | extern char *ppsize(char *buf, unsigned long long size); | 1432 | extern char *ppsize(char *buf, unsigned long long size); |
1384 | extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int); | 1433 | extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int); |
1385 | enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; | 1434 | enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; |
1386 | extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, int force) __must_hold(local); | 1435 | extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); |
1387 | extern void resync_after_online_grow(struct drbd_conf *); | 1436 | extern void resync_after_online_grow(struct drbd_conf *); |
1388 | extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local); | 1437 | extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local); |
1389 | extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, | 1438 | extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, |
@@ -1414,7 +1463,8 @@ static inline void ov_oos_print(struct drbd_conf *mdev) | |||
1414 | } | 1463 | } |
1415 | 1464 | ||
1416 | 1465 | ||
1417 | extern void drbd_csum(struct drbd_conf *, struct crypto_hash *, struct bio *, void *); | 1466 | extern void drbd_csum_bio(struct drbd_conf *, struct crypto_hash *, struct bio *, void *); |
1467 | extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *, struct drbd_epoch_entry *, void *); | ||
1418 | /* worker callbacks */ | 1468 | /* worker callbacks */ |
1419 | extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int); | 1469 | extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int); |
1420 | extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int); | 1470 | extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int); |
@@ -1438,6 +1488,8 @@ extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int); | |||
1438 | extern void resync_timer_fn(unsigned long data); | 1488 | extern void resync_timer_fn(unsigned long data); |
1439 | 1489 | ||
1440 | /* drbd_receiver.c */ | 1490 | /* drbd_receiver.c */ |
1491 | extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, | ||
1492 | const unsigned rw, const int fault_type); | ||
1441 | extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list); | 1493 | extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list); |
1442 | extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, | 1494 | extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, |
1443 | u64 id, | 1495 | u64 id, |
@@ -1593,6 +1645,41 @@ void drbd_bcast_ee(struct drbd_conf *mdev, | |||
1593 | * inline helper functions | 1645 | * inline helper functions |
1594 | *************************/ | 1646 | *************************/ |
1595 | 1647 | ||
1648 | /* see also page_chain_add and friends in drbd_receiver.c */ | ||
1649 | static inline struct page *page_chain_next(struct page *page) | ||
1650 | { | ||
1651 | return (struct page *)page_private(page); | ||
1652 | } | ||
1653 | #define page_chain_for_each(page) \ | ||
1654 | for (; page && ({ prefetch(page_chain_next(page)); 1; }); \ | ||
1655 | page = page_chain_next(page)) | ||
1656 | #define page_chain_for_each_safe(page, n) \ | ||
1657 | for (; page && ({ n = page_chain_next(page); 1; }); page = n) | ||
1658 | |||
1659 | static inline int drbd_bio_has_active_page(struct bio *bio) | ||
1660 | { | ||
1661 | struct bio_vec *bvec; | ||
1662 | int i; | ||
1663 | |||
1664 | __bio_for_each_segment(bvec, bio, i, 0) { | ||
1665 | if (page_count(bvec->bv_page) > 1) | ||
1666 | return 1; | ||
1667 | } | ||
1668 | |||
1669 | return 0; | ||
1670 | } | ||
1671 | |||
1672 | static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e) | ||
1673 | { | ||
1674 | struct page *page = e->pages; | ||
1675 | page_chain_for_each(page) { | ||
1676 | if (page_count(page) > 1) | ||
1677 | return 1; | ||
1678 | } | ||
1679 | return 0; | ||
1680 | } | ||
1681 | |||
1682 | |||
1596 | static inline void drbd_state_lock(struct drbd_conf *mdev) | 1683 | static inline void drbd_state_lock(struct drbd_conf *mdev) |
1597 | { | 1684 | { |
1598 | wait_event(mdev->misc_wait, | 1685 | wait_event(mdev->misc_wait, |
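These inline helpers introduce the page chain that replaces the per-entry bio: pages are strung together through page->private, page_chain_next() follows that link, and the two for-each macros walk the chain (the _safe variant keeps the next pointer so the current page may be freed mid-walk). A compact userspace model of the same idea; fake_page and the chain_* names are illustrative stand-ins:

    #include <stdio.h>

    /* Stand-in for struct page: the private_ field threads the chain,
     * count plays the role of page_count(). */
    struct fake_page {
        unsigned long private_;
        int count;
    };

    static struct fake_page *chain_next(struct fake_page *p)
    {
        return (struct fake_page *)p->private_;
    }

    #define chain_for_each(p) for (; (p); (p) = chain_next(p))

    /* Analogue of drbd_ee_has_active_page(): any element that is still
     * referenced elsewhere (count > 1) keeps the whole chain "active". */
    static int chain_has_active_page(struct fake_page *p)
    {
        chain_for_each(p)
            if (p->count > 1)
                return 1;
        return 0;
    }

    int main(void)
    {
        struct fake_page a = { 0, 1 }, b = { 0, 2 }, c = { 0, 1 };

        a.private_ = (unsigned long)&b;    /* a -> b -> c -> NULL */
        b.private_ = (unsigned long)&c;

        printf("active: %d\n", chain_has_active_page(&a));  /* 1, because of b */
        return 0;
    }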
@@ -2132,13 +2219,15 @@ static inline int __inc_ap_bio_cond(struct drbd_conf *mdev) | |||
2132 | return 0; | 2219 | return 0; |
2133 | if (test_bit(BITMAP_IO, &mdev->flags)) | 2220 | if (test_bit(BITMAP_IO, &mdev->flags)) |
2134 | return 0; | 2221 | return 0; |
2222 | if (atomic_read(&mdev->new_c_uuid)) | ||
2223 | return 0; | ||
2135 | return 1; | 2224 | return 1; |
2136 | } | 2225 | } |
2137 | 2226 | ||
2138 | /* I'd like to use wait_event_lock_irq, | 2227 | /* I'd like to use wait_event_lock_irq, |
2139 | * but I'm not sure when it got introduced, | 2228 | * but I'm not sure when it got introduced, |
2140 | * and not sure when it has 3 or 4 arguments */ | 2229 | * and not sure when it has 3 or 4 arguments */ |
2141 | static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two) | 2230 | static inline void inc_ap_bio(struct drbd_conf *mdev, int count) |
2142 | { | 2231 | { |
2143 | /* compare with after_state_ch, | 2232 | /* compare with after_state_ch, |
2144 | * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */ | 2233 | * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */ |
@@ -2152,6 +2241,9 @@ static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two) | |||
2152 | * to avoid races with the reconnect code, | 2241 | * to avoid races with the reconnect code, |
2153 | * we need to atomic_inc within the spinlock. */ | 2242 | * we need to atomic_inc within the spinlock. */ |
2154 | 2243 | ||
2244 | if (atomic_read(&mdev->new_c_uuid) && atomic_add_unless(&mdev->new_c_uuid, -1, 1)) | ||
2245 | drbd_queue_work_front(&mdev->data.work, &mdev->uuid_work); | ||
2246 | |||
2155 | spin_lock_irq(&mdev->req_lock); | 2247 | spin_lock_irq(&mdev->req_lock); |
2156 | while (!__inc_ap_bio_cond(mdev)) { | 2248 | while (!__inc_ap_bio_cond(mdev)) { |
2157 | prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE); | 2249 | prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE); |
@@ -2160,7 +2252,7 @@ static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two) | |||
2160 | finish_wait(&mdev->misc_wait, &wait); | 2252 | finish_wait(&mdev->misc_wait, &wait); |
2161 | spin_lock_irq(&mdev->req_lock); | 2253 | spin_lock_irq(&mdev->req_lock); |
2162 | } | 2254 | } |
2163 | atomic_add(one_or_two, &mdev->ap_bio_cnt); | 2255 | atomic_add(count, &mdev->ap_bio_cnt); |
2164 | spin_unlock_irq(&mdev->req_lock); | 2256 | spin_unlock_irq(&mdev->req_lock); |
2165 | } | 2257 | } |
2166 | 2258 | ||
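The hunks above keep the existing shape of inc_ap_bio(): take req_lock, re-test __inc_ap_bio_cond(), and if it fails, sleep on misc_wait and re-check after being woken, only then bumping ap_bio_cnt by the requested count (note the parameter rename from one_or_two to count). A userspace sketch of that sleep-until-condition loop, using a pthread mutex/condvar pair in place of the spinlock and wait queue; all names here are hypothetical:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;
    static int suspended = 1;    /* stands in for __inc_ap_bio_cond() failing */
    static int ap_bio_cnt;

    static void inc_ap_bio_like(int count)
    {
        pthread_mutex_lock(&lock);
        while (suspended)                  /* condition re-checked under the lock */
            pthread_cond_wait(&wake, &lock);
        ap_bio_cnt += count;
        pthread_mutex_unlock(&lock);
    }

    static void *resume_io(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&lock);
        suspended = 0;
        pthread_cond_broadcast(&wake);     /* plays the role of wake_up(&misc_wait) */
        pthread_mutex_unlock(&lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, resume_io, NULL);
        inc_ap_bio_like(1);
        pthread_join(t, NULL);
        printf("ap_bio_cnt = %d\n", ap_bio_cnt);   /* 1 */
        return 0;
    }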
@@ -2251,7 +2343,8 @@ static inline void drbd_md_flush(struct drbd_conf *mdev) | |||
2251 | if (test_bit(MD_NO_BARRIER, &mdev->flags)) | 2343 | if (test_bit(MD_NO_BARRIER, &mdev->flags)) |
2252 | return; | 2344 | return; |
2253 | 2345 | ||
2254 | r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL); | 2346 | r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL, |
2347 | BLKDEV_IFL_WAIT); | ||
2255 | if (r) { | 2348 | if (r) { |
2256 | set_bit(MD_NO_BARRIER, &mdev->flags); | 2349 | set_bit(MD_NO_BARRIER, &mdev->flags); |
2257 | dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); | 2350 | dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); |
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 93d1f9b469d4..be2d2da9cdba 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c | |||
@@ -684,6 +684,9 @@ static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns) | |||
684 | else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) | 684 | else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) |
685 | rv = SS_NO_REMOTE_DISK; | 685 | rv = SS_NO_REMOTE_DISK; |
686 | 686 | ||
687 | else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) | ||
688 | rv = SS_NO_UP_TO_DATE_DISK; | ||
689 | |||
687 | else if ((ns.conn == C_CONNECTED || | 690 | else if ((ns.conn == C_CONNECTED || |
688 | ns.conn == C_WF_BITMAP_S || | 691 | ns.conn == C_WF_BITMAP_S || |
689 | ns.conn == C_SYNC_SOURCE || | 692 | ns.conn == C_SYNC_SOURCE || |
@@ -840,7 +843,12 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state | |||
840 | break; | 843 | break; |
841 | case C_WF_BITMAP_S: | 844 | case C_WF_BITMAP_S: |
842 | case C_PAUSED_SYNC_S: | 845 | case C_PAUSED_SYNC_S: |
843 | ns.pdsk = D_OUTDATED; | 846 | /* remap any consistent state to D_OUTDATED, |
847 | * but disallow "upgrade" of not even consistent states. | ||
848 | */ | ||
849 | ns.pdsk = | ||
850 | (D_DISKLESS < os.pdsk && os.pdsk < D_OUTDATED) | ||
851 | ? os.pdsk : D_OUTDATED; | ||
844 | break; | 852 | break; |
845 | case C_SYNC_SOURCE: | 853 | case C_SYNC_SOURCE: |
846 | ns.pdsk = D_INCONSISTENT; | 854 | ns.pdsk = D_INCONSISTENT; |
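The replacement for the plain ns.pdsk = D_OUTDATED assignment is a guarded ternary, and the new comment states the rule: a peer-disk state that is at least consistent gets remapped down to D_OUTDATED, while states that are not even consistent must not be "upgraded" and are kept as they are. A worked example of the same expression, assuming only the ordering the expression itself relies on (D_DISKLESS below the inconsistent states, which sit below D_OUTDATED); the _EX names and the reduced enum are illustrative:

    #include <stdio.h>

    /* Simplified ordering: higher value = better peer-disk state. */
    enum pdsk_ex { D_DISKLESS_EX, D_INCONSISTENT_EX, D_OUTDATED_EX,
                   D_CONSISTENT_EX, D_UP_TO_DATE_EX };

    static enum pdsk_ex sanitize_pdsk(enum pdsk_ex os_pdsk)
    {
        /* Same shape as the ternary in sanitize_state(): keep states strictly
         * between diskless and outdated, remap everything else to outdated. */
        return (D_DISKLESS_EX < os_pdsk && os_pdsk < D_OUTDATED_EX)
                ? os_pdsk : D_OUTDATED_EX;
    }

    int main(void)
    {
        printf("%d\n", sanitize_pdsk(D_UP_TO_DATE_EX));    /* -> D_OUTDATED_EX */
        printf("%d\n", sanitize_pdsk(D_INCONSISTENT_EX));  /* unchanged */
        printf("%d\n", sanitize_pdsk(D_DISKLESS_EX));      /* -> D_OUTDATED_EX */
        return 0;
    }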
@@ -1205,21 +1213,20 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | |||
1205 | && (ns.pdsk < D_INCONSISTENT || | 1213 | && (ns.pdsk < D_INCONSISTENT || |
1206 | ns.pdsk == D_UNKNOWN || | 1214 | ns.pdsk == D_UNKNOWN || |
1207 | ns.pdsk == D_OUTDATED)) { | 1215 | ns.pdsk == D_OUTDATED)) { |
1208 | kfree(mdev->p_uuid); | ||
1209 | mdev->p_uuid = NULL; | ||
1210 | if (get_ldev(mdev)) { | 1216 | if (get_ldev(mdev)) { |
1211 | if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && | 1217 | if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && |
1212 | mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { | 1218 | mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE && |
1213 | drbd_uuid_new_current(mdev); | 1219 | !atomic_read(&mdev->new_c_uuid)) |
1214 | drbd_send_uuids(mdev); | 1220 | atomic_set(&mdev->new_c_uuid, 2); |
1215 | } | ||
1216 | put_ldev(mdev); | 1221 | put_ldev(mdev); |
1217 | } | 1222 | } |
1218 | } | 1223 | } |
1219 | 1224 | ||
1220 | if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { | 1225 | if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { |
1221 | if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) | 1226 | /* Diskless peer becomes primary or got connected to diskless, primary peer. */ |
1222 | drbd_uuid_new_current(mdev); | 1227 | if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0 && |
1228 | !atomic_read(&mdev->new_c_uuid)) | ||
1229 | atomic_set(&mdev->new_c_uuid, 2); | ||
1223 | 1230 | ||
1224 | /* D_DISKLESS Peer becomes secondary */ | 1231 | /* D_DISKLESS Peer becomes secondary */ |
1225 | if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) | 1232 | if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) |
@@ -1232,7 +1239,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | |||
1232 | os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { | 1239 | os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { |
1233 | kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */ | 1240 | kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */ |
1234 | mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */ | 1241 | mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */ |
1235 | drbd_send_sizes(mdev, 0); /* to start sync... */ | 1242 | drbd_send_sizes(mdev, 0, 0); /* to start sync... */ |
1236 | drbd_send_uuids(mdev); | 1243 | drbd_send_uuids(mdev); |
1237 | drbd_send_state(mdev); | 1244 | drbd_send_state(mdev); |
1238 | } | 1245 | } |
@@ -1343,6 +1350,24 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | |||
1343 | drbd_md_sync(mdev); | 1350 | drbd_md_sync(mdev); |
1344 | } | 1351 | } |
1345 | 1352 | ||
1353 | static int w_new_current_uuid(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1354 | { | ||
1355 | if (get_ldev(mdev)) { | ||
1356 | if (mdev->ldev->md.uuid[UI_BITMAP] == 0) { | ||
1357 | drbd_uuid_new_current(mdev); | ||
1358 | if (get_net_conf(mdev)) { | ||
1359 | drbd_send_uuids(mdev); | ||
1360 | put_net_conf(mdev); | ||
1361 | } | ||
1362 | drbd_md_sync(mdev); | ||
1363 | } | ||
1364 | put_ldev(mdev); | ||
1365 | } | ||
1366 | atomic_dec(&mdev->new_c_uuid); | ||
1367 | wake_up(&mdev->misc_wait); | ||
1368 | |||
1369 | return 1; | ||
1370 | } | ||
1346 | 1371 | ||
1347 | static int drbd_thread_setup(void *arg) | 1372 | static int drbd_thread_setup(void *arg) |
1348 | { | 1373 | { |
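Generating a new current UUID is now deferred instead of being done directly in after_state_ch(): the state change only sets mdev->new_c_uuid to 2, the first application I/O passing through inc_ap_bio() decrements it with atomic_add_unless() and queues w_new_current_uuid(), and the worker above does the actual UUID bump, decrements the counter to 0 and wakes misc_wait, at which point __inc_ap_bio_cond() stops blocking. The load-bearing primitive is atomic_add_unless(v, a, u), "add a unless the value is u". A small userspace sketch of the same counter protocol with C11 atomics; add_unless() here is a hand-rolled model of the kernel helper, not its implementation:

    #include <stdatomic.h>
    #include <stdio.h>

    /* Add @a to *@v unless it currently equals @u; return 1 if it did add. */
    static int add_unless(atomic_int *v, int a, int u)
    {
        int old = atomic_load(v);

        do {
            if (old == u)
                return 0;
        } while (!atomic_compare_exchange_weak(v, &old, old + a));
        return 1;
    }

    int main(void)
    {
        atomic_int new_c_uuid;

        atomic_init(&new_c_uuid, 2);              /* "a new current UUID is wanted" */

        /* First I/O: 2 -> 1 succeeds, so this caller queues the uuid work. */
        if (atomic_load(&new_c_uuid) && add_unless(&new_c_uuid, -1, 1))
            printf("queue w_new_current_uuid()\n");

        /* Concurrent I/O while the worker runs: value is 1, nothing queued twice. */
        if (atomic_load(&new_c_uuid) && add_unless(&new_c_uuid, -1, 1))
            printf("queued again (should not happen)\n");

        atomic_fetch_sub(&new_c_uuid, 1);         /* worker's final decrement */
        printf("new_c_uuid = %d\n", atomic_load(&new_c_uuid));  /* 0: I/O unblocked */
        return 0;
    }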
@@ -1755,7 +1780,7 @@ int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val) | |||
1755 | (struct p_header *)&p, sizeof(p)); | 1780 | (struct p_header *)&p, sizeof(p)); |
1756 | } | 1781 | } |
1757 | 1782 | ||
1758 | int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply) | 1783 | int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags) |
1759 | { | 1784 | { |
1760 | struct p_sizes p; | 1785 | struct p_sizes p; |
1761 | sector_t d_size, u_size; | 1786 | sector_t d_size, u_size; |
@@ -1767,7 +1792,6 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply) | |||
1767 | d_size = drbd_get_max_capacity(mdev->ldev); | 1792 | d_size = drbd_get_max_capacity(mdev->ldev); |
1768 | u_size = mdev->ldev->dc.disk_size; | 1793 | u_size = mdev->ldev->dc.disk_size; |
1769 | q_order_type = drbd_queue_order_type(mdev); | 1794 | q_order_type = drbd_queue_order_type(mdev); |
1770 | p.queue_order_type = cpu_to_be32(drbd_queue_order_type(mdev)); | ||
1771 | put_ldev(mdev); | 1795 | put_ldev(mdev); |
1772 | } else { | 1796 | } else { |
1773 | d_size = 0; | 1797 | d_size = 0; |
@@ -1779,7 +1803,8 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply) | |||
1779 | p.u_size = cpu_to_be64(u_size); | 1803 | p.u_size = cpu_to_be64(u_size); |
1780 | p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); | 1804 | p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); |
1781 | p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue)); | 1805 | p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue)); |
1782 | p.queue_order_type = cpu_to_be32(q_order_type); | 1806 | p.queue_order_type = cpu_to_be16(q_order_type); |
1807 | p.dds_flags = cpu_to_be16(flags); | ||
1783 | 1808 | ||
1784 | ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, | 1809 | ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, |
1785 | (struct p_header *)&p, sizeof(p)); | 1810 | (struct p_header *)&p, sizeof(p)); |
@@ -2180,6 +2205,43 @@ int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size) | |||
2180 | return ok; | 2205 | return ok; |
2181 | } | 2206 | } |
2182 | 2207 | ||
2208 | static int drbd_send_delay_probe(struct drbd_conf *mdev, struct drbd_socket *ds) | ||
2209 | { | ||
2210 | struct p_delay_probe dp; | ||
2211 | int offset, ok = 0; | ||
2212 | struct timeval now; | ||
2213 | |||
2214 | mutex_lock(&ds->mutex); | ||
2215 | if (likely(ds->socket)) { | ||
2216 | do_gettimeofday(&now); | ||
2217 | offset = now.tv_usec - mdev->dps_time.tv_usec + | ||
2218 | (now.tv_sec - mdev->dps_time.tv_sec) * 1000000; | ||
2219 | dp.seq_num = cpu_to_be32(mdev->delay_seq); | ||
2220 | dp.offset = cpu_to_be32(offset); | ||
2221 | |||
2222 | ok = _drbd_send_cmd(mdev, ds->socket, P_DELAY_PROBE, | ||
2223 | (struct p_header *)&dp, sizeof(dp), 0); | ||
2224 | } | ||
2225 | mutex_unlock(&ds->mutex); | ||
2226 | |||
2227 | return ok; | ||
2228 | } | ||
2229 | |||
2230 | static int drbd_send_delay_probes(struct drbd_conf *mdev) | ||
2231 | { | ||
2232 | int ok; | ||
2233 | |||
2234 | mdev->delay_seq++; | ||
2235 | do_gettimeofday(&mdev->dps_time); | ||
2236 | ok = drbd_send_delay_probe(mdev, &mdev->meta); | ||
2237 | ok = ok && drbd_send_delay_probe(mdev, &mdev->data); | ||
2238 | |||
2239 | mdev->dp_volume_last = mdev->send_cnt; | ||
2240 | mod_timer(&mdev->delay_probe_timer, jiffies + mdev->sync_conf.dp_interval * HZ / 10); | ||
2241 | |||
2242 | return ok; | ||
2243 | } | ||
2244 | |||
2183 | /* called on sndtimeo | 2245 | /* called on sndtimeo |
2184 | * returns FALSE if we should retry, | 2246 | * returns FALSE if we should retry, |
2185 | * TRUE if we think connection is dead | 2247 | * TRUE if we think connection is dead |
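A delay probe is a tiny packet carrying a sequence number and an offset in microseconds since a shared reference time (mdev->dps_time); drbd_send_delay_probes() stamps the reference, sends one probe on the meta socket and one with the same sequence number on the data socket, and rearms the timer dp_interval tenths of a second later. Presumably the receiving side compares the two offsets of a pair to estimate how far the data socket lags behind the meta socket (the data_delay field); that part is not in this hunk. A sketch of the offset arithmetic, with made-up timestamps:

    #include <stdio.h>
    #include <sys/time.h>

    /* Microseconds elapsed from @start to @now -- the same arithmetic
     * drbd_send_delay_probe() uses to fill dp.offset. */
    static long usec_offset(const struct timeval *start, const struct timeval *now)
    {
        return (now->tv_sec - start->tv_sec) * 1000000L
             + (now->tv_usec - start->tv_usec);
    }

    int main(void)
    {
        struct timeval start = { .tv_sec = 100, .tv_usec = 990000 };  /* dps_time */
        struct timeval meta  = { .tv_sec = 101, .tv_usec =  10000 };
        struct timeval data  = { .tv_sec = 101, .tv_usec = 250000 };

        long off_meta = usec_offset(&start, &meta);   /*  20000 us */
        long off_data = usec_offset(&start, &data);   /* 260000 us */

        /* Difference of the two offsets of one probe pair. */
        printf("data_delay ~ %ld us\n", off_data - off_meta);   /* 240000 us */
        return 0;
    }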
@@ -2309,6 +2371,44 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) | |||
2309 | return 1; | 2371 | return 1; |
2310 | } | 2372 | } |
2311 | 2373 | ||
2374 | static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) | ||
2375 | { | ||
2376 | struct page *page = e->pages; | ||
2377 | unsigned len = e->size; | ||
2378 | page_chain_for_each(page) { | ||
2379 | unsigned l = min_t(unsigned, len, PAGE_SIZE); | ||
2380 | if (!_drbd_send_page(mdev, page, 0, l)) | ||
2381 | return 0; | ||
2382 | len -= l; | ||
2383 | } | ||
2384 | return 1; | ||
2385 | } | ||
2386 | |||
2387 | static void consider_delay_probes(struct drbd_conf *mdev) | ||
2388 | { | ||
2389 | if (mdev->state.conn != C_SYNC_SOURCE || mdev->agreed_pro_version < 93) | ||
2390 | return; | ||
2391 | |||
2392 | if (mdev->dp_volume_last + mdev->sync_conf.dp_volume * 2 < mdev->send_cnt) | ||
2393 | drbd_send_delay_probes(mdev); | ||
2394 | } | ||
2395 | |||
2396 | static int w_delay_probes(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
2397 | { | ||
2398 | if (!cancel && mdev->state.conn == C_SYNC_SOURCE) | ||
2399 | drbd_send_delay_probes(mdev); | ||
2400 | |||
2401 | return 1; | ||
2402 | } | ||
2403 | |||
2404 | static void delay_probe_timer_fn(unsigned long data) | ||
2405 | { | ||
2406 | struct drbd_conf *mdev = (struct drbd_conf *) data; | ||
2407 | |||
2408 | if (list_empty(&mdev->delay_probe_work.list)) | ||
2409 | drbd_queue_work(&mdev->data.work, &mdev->delay_probe_work); | ||
2410 | } | ||
2411 | |||
2312 | /* Used to send write requests | 2412 | /* Used to send write requests |
2313 | * R_PRIMARY -> Peer (P_DATA) | 2413 | * R_PRIMARY -> Peer (P_DATA) |
2314 | */ | 2414 | */ |
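consider_delay_probes() adds a second, volume-based trigger next to the timer: while this node is SyncSource and the peer speaks protocol 93 or newer, a fresh probe pair goes out once send_cnt has advanced by more than twice sync_conf.dp_volume since the last probe (drbd_send_delay_probes() records that point in dp_volume_last). A sketch of just that threshold, treating send_cnt as abstract units:

    #include <stdio.h>

    struct probe_state {
        unsigned int send_cnt;
        unsigned int dp_volume_last;   /* send_cnt at the previous probe */
        unsigned int dp_volume;        /* from sync_conf */
    };

    static int should_probe(const struct probe_state *p)
    {
        /* Same comparison as in consider_delay_probes(). */
        return p->dp_volume_last + p->dp_volume * 2 < p->send_cnt;
    }

    int main(void)
    {
        struct probe_state p = { 0, 0, 1024 };

        for (p.send_cnt = 0; p.send_cnt <= 4096; p.send_cnt += 1024) {
            if (should_probe(&p)) {
                printf("probe at send_cnt=%u\n", p.send_cnt);   /* fires at 3072 */
                p.dp_volume_last = p.send_cnt;   /* as drbd_send_delay_probes() does */
            }
        }
        return 0;
    }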
@@ -2360,7 +2460,7 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) | |||
2360 | drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE)); | 2460 | drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE)); |
2361 | if (ok && dgs) { | 2461 | if (ok && dgs) { |
2362 | dgb = mdev->int_dig_out; | 2462 | dgb = mdev->int_dig_out; |
2363 | drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb); | 2463 | drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb); |
2364 | ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); | 2464 | ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); |
2365 | } | 2465 | } |
2366 | if (ok) { | 2466 | if (ok) { |
@@ -2371,6 +2471,10 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) | |||
2371 | } | 2471 | } |
2372 | 2472 | ||
2373 | drbd_put_data_sock(mdev); | 2473 | drbd_put_data_sock(mdev); |
2474 | |||
2475 | if (ok) | ||
2476 | consider_delay_probes(mdev); | ||
2477 | |||
2374 | return ok; | 2478 | return ok; |
2375 | } | 2479 | } |
2376 | 2480 | ||
@@ -2409,13 +2513,17 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, | |||
2409 | sizeof(p), MSG_MORE); | 2513 | sizeof(p), MSG_MORE); |
2410 | if (ok && dgs) { | 2514 | if (ok && dgs) { |
2411 | dgb = mdev->int_dig_out; | 2515 | dgb = mdev->int_dig_out; |
2412 | drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb); | 2516 | drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb); |
2413 | ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); | 2517 | ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); |
2414 | } | 2518 | } |
2415 | if (ok) | 2519 | if (ok) |
2416 | ok = _drbd_send_zc_bio(mdev, e->private_bio); | 2520 | ok = _drbd_send_zc_ee(mdev, e); |
2417 | 2521 | ||
2418 | drbd_put_data_sock(mdev); | 2522 | drbd_put_data_sock(mdev); |
2523 | |||
2524 | if (ok) | ||
2525 | consider_delay_probes(mdev); | ||
2526 | |||
2419 | return ok; | 2527 | return ok; |
2420 | } | 2528 | } |
2421 | 2529 | ||
@@ -2600,6 +2708,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) | |||
2600 | atomic_set(&mdev->net_cnt, 0); | 2708 | atomic_set(&mdev->net_cnt, 0); |
2601 | atomic_set(&mdev->packet_seq, 0); | 2709 | atomic_set(&mdev->packet_seq, 0); |
2602 | atomic_set(&mdev->pp_in_use, 0); | 2710 | atomic_set(&mdev->pp_in_use, 0); |
2711 | atomic_set(&mdev->new_c_uuid, 0); | ||
2603 | 2712 | ||
2604 | mutex_init(&mdev->md_io_mutex); | 2713 | mutex_init(&mdev->md_io_mutex); |
2605 | mutex_init(&mdev->data.mutex); | 2714 | mutex_init(&mdev->data.mutex); |
@@ -2628,16 +2737,26 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) | |||
2628 | INIT_LIST_HEAD(&mdev->unplug_work.list); | 2737 | INIT_LIST_HEAD(&mdev->unplug_work.list); |
2629 | INIT_LIST_HEAD(&mdev->md_sync_work.list); | 2738 | INIT_LIST_HEAD(&mdev->md_sync_work.list); |
2630 | INIT_LIST_HEAD(&mdev->bm_io_work.w.list); | 2739 | INIT_LIST_HEAD(&mdev->bm_io_work.w.list); |
2740 | INIT_LIST_HEAD(&mdev->delay_probes); | ||
2741 | INIT_LIST_HEAD(&mdev->delay_probe_work.list); | ||
2742 | INIT_LIST_HEAD(&mdev->uuid_work.list); | ||
2743 | |||
2631 | mdev->resync_work.cb = w_resync_inactive; | 2744 | mdev->resync_work.cb = w_resync_inactive; |
2632 | mdev->unplug_work.cb = w_send_write_hint; | 2745 | mdev->unplug_work.cb = w_send_write_hint; |
2633 | mdev->md_sync_work.cb = w_md_sync; | 2746 | mdev->md_sync_work.cb = w_md_sync; |
2634 | mdev->bm_io_work.w.cb = w_bitmap_io; | 2747 | mdev->bm_io_work.w.cb = w_bitmap_io; |
2748 | mdev->delay_probe_work.cb = w_delay_probes; | ||
2749 | mdev->uuid_work.cb = w_new_current_uuid; | ||
2635 | init_timer(&mdev->resync_timer); | 2750 | init_timer(&mdev->resync_timer); |
2636 | init_timer(&mdev->md_sync_timer); | 2751 | init_timer(&mdev->md_sync_timer); |
2752 | init_timer(&mdev->delay_probe_timer); | ||
2637 | mdev->resync_timer.function = resync_timer_fn; | 2753 | mdev->resync_timer.function = resync_timer_fn; |
2638 | mdev->resync_timer.data = (unsigned long) mdev; | 2754 | mdev->resync_timer.data = (unsigned long) mdev; |
2639 | mdev->md_sync_timer.function = md_sync_timer_fn; | 2755 | mdev->md_sync_timer.function = md_sync_timer_fn; |
2640 | mdev->md_sync_timer.data = (unsigned long) mdev; | 2756 | mdev->md_sync_timer.data = (unsigned long) mdev; |
2757 | mdev->delay_probe_timer.function = delay_probe_timer_fn; | ||
2758 | mdev->delay_probe_timer.data = (unsigned long) mdev; | ||
2759 | |||
2641 | 2760 | ||
2642 | init_waitqueue_head(&mdev->misc_wait); | 2761 | init_waitqueue_head(&mdev->misc_wait); |
2643 | init_waitqueue_head(&mdev->state_wait); | 2762 | init_waitqueue_head(&mdev->state_wait); |
@@ -2680,7 +2799,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev) | |||
2680 | drbd_set_my_capacity(mdev, 0); | 2799 | drbd_set_my_capacity(mdev, 0); |
2681 | if (mdev->bitmap) { | 2800 | if (mdev->bitmap) { |
2682 | /* maybe never allocated. */ | 2801 | /* maybe never allocated. */ |
2683 | drbd_bm_resize(mdev, 0); | 2802 | drbd_bm_resize(mdev, 0, 1); |
2684 | drbd_bm_cleanup(mdev); | 2803 | drbd_bm_cleanup(mdev); |
2685 | } | 2804 | } |
2686 | 2805 | ||
@@ -3129,7 +3248,7 @@ int __init drbd_init(void) | |||
3129 | if (err) | 3248 | if (err) |
3130 | goto Enomem; | 3249 | goto Enomem; |
3131 | 3250 | ||
3132 | drbd_proc = proc_create("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops); | 3251 | drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL); |
3133 | if (!drbd_proc) { | 3252 | if (!drbd_proc) { |
3134 | printk(KERN_ERR "drbd: unable to register proc file\n"); | 3253 | printk(KERN_ERR "drbd: unable to register proc file\n"); |
3135 | goto Enomem; | 3254 | goto Enomem; |
@@ -3660,7 +3779,8 @@ _drbd_fault_str(unsigned int type) { | |||
3660 | [DRBD_FAULT_DT_RD] = "Data read", | 3779 | [DRBD_FAULT_DT_RD] = "Data read", |
3661 | [DRBD_FAULT_DT_RA] = "Data read ahead", | 3780 | [DRBD_FAULT_DT_RA] = "Data read ahead", |
3662 | [DRBD_FAULT_BM_ALLOC] = "BM allocation", | 3781 | [DRBD_FAULT_BM_ALLOC] = "BM allocation", |
3663 | [DRBD_FAULT_AL_EE] = "EE allocation" | 3782 | [DRBD_FAULT_AL_EE] = "EE allocation", |
3783 | [DRBD_FAULT_RECEIVE] = "receive data corruption", | ||
3664 | }; | 3784 | }; |
3665 | 3785 | ||
3666 | return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**"; | 3786 | return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**"; |
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 6429d2b19e06..632e3245d1bb 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c | |||
@@ -510,7 +510,7 @@ void drbd_resume_io(struct drbd_conf *mdev) | |||
510 | * Returns 0 on success, negative return values indicate errors. | 510 | * Returns 0 on success, negative return values indicate errors. |
511 | * You should call drbd_md_sync() after calling this function. | 511 | * You should call drbd_md_sync() after calling this function. |
512 | */ | 512 | */ |
513 | enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, int force) __must_hold(local) | 513 | enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) |
514 | { | 514 | { |
515 | sector_t prev_first_sect, prev_size; /* previous meta location */ | 515 | sector_t prev_first_sect, prev_size; /* previous meta location */ |
516 | sector_t la_size; | 516 | sector_t la_size; |
@@ -541,12 +541,12 @@ enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, int force | |||
541 | /* TODO: should only be some assert here, not (re)init... */ | 541 | /* TODO: should only be some assert here, not (re)init... */ |
542 | drbd_md_set_sector_offsets(mdev, mdev->ldev); | 542 | drbd_md_set_sector_offsets(mdev, mdev->ldev); |
543 | 543 | ||
544 | size = drbd_new_dev_size(mdev, mdev->ldev, force); | 544 | size = drbd_new_dev_size(mdev, mdev->ldev, flags & DDSF_FORCED); |
545 | 545 | ||
546 | if (drbd_get_capacity(mdev->this_bdev) != size || | 546 | if (drbd_get_capacity(mdev->this_bdev) != size || |
547 | drbd_bm_capacity(mdev) != size) { | 547 | drbd_bm_capacity(mdev) != size) { |
548 | int err; | 548 | int err; |
549 | err = drbd_bm_resize(mdev, size); | 549 | err = drbd_bm_resize(mdev, size, !(flags & DDSF_NO_RESYNC)); |
550 | if (unlikely(err)) { | 550 | if (unlikely(err)) { |
551 | /* currently there is only one error: ENOMEM! */ | 551 | /* currently there is only one error: ENOMEM! */ |
552 | size = drbd_bm_capacity(mdev)>>1; | 552 | size = drbd_bm_capacity(mdev)>>1; |
@@ -704,9 +704,6 @@ void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __mu | |||
704 | struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; | 704 | struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; |
705 | int max_segments = mdev->ldev->dc.max_bio_bvecs; | 705 | int max_segments = mdev->ldev->dc.max_bio_bvecs; |
706 | 706 | ||
707 | if (b->merge_bvec_fn && !mdev->ldev->dc.use_bmbv) | ||
708 | max_seg_s = PAGE_SIZE; | ||
709 | |||
710 | max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s); | 707 | max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s); |
711 | 708 | ||
712 | blk_queue_max_hw_sectors(q, max_seg_s >> 9); | 709 | blk_queue_max_hw_sectors(q, max_seg_s >> 9); |
@@ -1199,13 +1196,12 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | |||
1199 | } | 1196 | } |
1200 | 1197 | ||
1201 | /* allocation not in the IO path, cqueue thread context */ | 1198 | /* allocation not in the IO path, cqueue thread context */ |
1202 | new_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL); | 1199 | new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); |
1203 | if (!new_conf) { | 1200 | if (!new_conf) { |
1204 | retcode = ERR_NOMEM; | 1201 | retcode = ERR_NOMEM; |
1205 | goto fail; | 1202 | goto fail; |
1206 | } | 1203 | } |
1207 | 1204 | ||
1208 | memset(new_conf, 0, sizeof(struct net_conf)); | ||
1209 | new_conf->timeout = DRBD_TIMEOUT_DEF; | 1205 | new_conf->timeout = DRBD_TIMEOUT_DEF; |
1210 | new_conf->try_connect_int = DRBD_CONNECT_INT_DEF; | 1206 | new_conf->try_connect_int = DRBD_CONNECT_INT_DEF; |
1211 | new_conf->ping_int = DRBD_PING_INT_DEF; | 1207 | new_conf->ping_int = DRBD_PING_INT_DEF; |
@@ -1477,8 +1473,8 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | |||
1477 | { | 1473 | { |
1478 | struct resize rs; | 1474 | struct resize rs; |
1479 | int retcode = NO_ERROR; | 1475 | int retcode = NO_ERROR; |
1480 | int ldsc = 0; /* local disk size changed */ | ||
1481 | enum determine_dev_size dd; | 1476 | enum determine_dev_size dd; |
1477 | enum dds_flags ddsf; | ||
1482 | 1478 | ||
1483 | memset(&rs, 0, sizeof(struct resize)); | 1479 | memset(&rs, 0, sizeof(struct resize)); |
1484 | if (!resize_from_tags(mdev, nlp->tag_list, &rs)) { | 1480 | if (!resize_from_tags(mdev, nlp->tag_list, &rs)) { |
@@ -1502,13 +1498,17 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | |||
1502 | goto fail; | 1498 | goto fail; |
1503 | } | 1499 | } |
1504 | 1500 | ||
1505 | if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { | 1501 | if (rs.no_resync && mdev->agreed_pro_version < 93) { |
1506 | mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); | 1502 | retcode = ERR_NEED_APV_93; |
1507 | ldsc = 1; | 1503 | goto fail; |
1508 | } | 1504 | } |
1509 | 1505 | ||
1506 | if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) | ||
1507 | mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); | ||
1508 | |||
1510 | mdev->ldev->dc.disk_size = (sector_t)rs.resize_size; | 1509 | mdev->ldev->dc.disk_size = (sector_t)rs.resize_size; |
1511 | dd = drbd_determin_dev_size(mdev, rs.resize_force); | 1510 | ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); |
1511 | dd = drbd_determin_dev_size(mdev, ddsf); | ||
1512 | drbd_md_sync(mdev); | 1512 | drbd_md_sync(mdev); |
1513 | put_ldev(mdev); | 1513 | put_ldev(mdev); |
1514 | if (dd == dev_size_error) { | 1514 | if (dd == dev_size_error) { |
@@ -1516,12 +1516,12 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | |||
1516 | goto fail; | 1516 | goto fail; |
1517 | } | 1517 | } |
1518 | 1518 | ||
1519 | if (mdev->state.conn == C_CONNECTED && (dd != unchanged || ldsc)) { | 1519 | if (mdev->state.conn == C_CONNECTED) { |
1520 | if (dd == grew) | 1520 | if (dd == grew) |
1521 | set_bit(RESIZE_PENDING, &mdev->flags); | 1521 | set_bit(RESIZE_PENDING, &mdev->flags); |
1522 | 1522 | ||
1523 | drbd_send_uuids(mdev); | 1523 | drbd_send_uuids(mdev); |
1524 | drbd_send_sizes(mdev, 1); | 1524 | drbd_send_sizes(mdev, 1, ddsf); |
1525 | } | 1525 | } |
1526 | 1526 | ||
1527 | fail: | 1527 | fail: |
@@ -1551,6 +1551,10 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n | |||
1551 | sc.rate = DRBD_RATE_DEF; | 1551 | sc.rate = DRBD_RATE_DEF; |
1552 | sc.after = DRBD_AFTER_DEF; | 1552 | sc.after = DRBD_AFTER_DEF; |
1553 | sc.al_extents = DRBD_AL_EXTENTS_DEF; | 1553 | sc.al_extents = DRBD_AL_EXTENTS_DEF; |
1554 | sc.dp_volume = DRBD_DP_VOLUME_DEF; | ||
1555 | sc.dp_interval = DRBD_DP_INTERVAL_DEF; | ||
1556 | sc.throttle_th = DRBD_RS_THROTTLE_TH_DEF; | ||
1557 | sc.hold_off_th = DRBD_RS_HOLD_OFF_TH_DEF; | ||
1554 | } else | 1558 | } else |
1555 | memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf)); | 1559 | memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf)); |
1556 | 1560 | ||
@@ -2207,9 +2211,9 @@ void drbd_bcast_ee(struct drbd_conf *mdev, | |||
2207 | { | 2211 | { |
2208 | struct cn_msg *cn_reply; | 2212 | struct cn_msg *cn_reply; |
2209 | struct drbd_nl_cfg_reply *reply; | 2213 | struct drbd_nl_cfg_reply *reply; |
2210 | struct bio_vec *bvec; | ||
2211 | unsigned short *tl; | 2214 | unsigned short *tl; |
2212 | int i; | 2215 | struct page *page; |
2216 | unsigned len; | ||
2213 | 2217 | ||
2214 | if (!e) | 2218 | if (!e) |
2215 | return; | 2219 | return; |
@@ -2247,11 +2251,15 @@ void drbd_bcast_ee(struct drbd_conf *mdev, | |||
2247 | put_unaligned(T_ee_data, tl++); | 2251 | put_unaligned(T_ee_data, tl++); |
2248 | put_unaligned(e->size, tl++); | 2252 | put_unaligned(e->size, tl++); |
2249 | 2253 | ||
2250 | __bio_for_each_segment(bvec, e->private_bio, i, 0) { | 2254 | len = e->size; |
2251 | void *d = kmap(bvec->bv_page); | 2255 | page = e->pages; |
2252 | memcpy(tl, d + bvec->bv_offset, bvec->bv_len); | 2256 | page_chain_for_each(page) { |
2253 | kunmap(bvec->bv_page); | 2257 | void *d = kmap_atomic(page, KM_USER0); |
2254 | tl=(unsigned short*)((char*)tl + bvec->bv_len); | 2258 | unsigned l = min_t(unsigned, len, PAGE_SIZE); |
2259 | memcpy(tl, d, l); | ||
2260 | kunmap_atomic(d, KM_USER0); | ||
2261 | tl = (unsigned short*)((char*)tl + l); | ||
2262 | len -= l; | ||
2255 | } | 2263 | } |
2256 | put_unaligned(TT_END, tl++); /* Close the tag list */ | 2264 | put_unaligned(TT_END, tl++); /* Close the tag list */ |
2257 | 2265 | ||
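The drbd_bcast_ee() hunk shows the pattern every consumer of the new page chain follows: walk the chain, map one page at a time (kmap_atomic here), copy at most PAGE_SIZE bytes per step, and keep a running remaining-length counter so the last page is only partially used; the same bookkeeping appears in _drbd_send_zc_ee() above. A plain userspace version of that loop, with an 8-byte page size so the chunking is visible; names are illustrative:

    #include <stdio.h>
    #include <string.h>

    #define PAGE_SIZE_EX 8

    struct buf_page {
        struct buf_page *next;
        char data[PAGE_SIZE_EX];
    };

    /* Copy @len bytes spread over a chain of pages into @dst, one page
     * worth at a time. */
    static void copy_chain(char *dst, const struct buf_page *p, unsigned len)
    {
        for (; p && len; p = p->next) {
            unsigned l = len < PAGE_SIZE_EX ? len : PAGE_SIZE_EX;

            memcpy(dst, p->data, l);
            dst += l;
            len -= l;
        }
    }

    int main(void)
    {
        struct buf_page last  = { NULL, {0} };
        struct buf_page first = { &last, {0} };
        char out[17] = { 0 };

        memcpy(first.data, "ABCDEFGH", 8);   /* full first page */
        memcpy(last.data,  "IJKL", 4);       /* partially used last page */

        copy_chain(out, &first, 12);
        printf("%s\n", out);                 /* ABCDEFGHIJKL */
        return 0;
    }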
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index be3374b68460..d0f1767ea4c3 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c | |||
@@ -73,14 +73,21 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq) | |||
73 | seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10); | 73 | seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10); |
74 | /* if more than 1 GB display in MB */ | 74 | /* if more than 1 GB display in MB */ |
75 | if (mdev->rs_total > 0x100000L) | 75 | if (mdev->rs_total > 0x100000L) |
76 | seq_printf(seq, "(%lu/%lu)M\n\t", | 76 | seq_printf(seq, "(%lu/%lu)M", |
77 | (unsigned long) Bit2KB(rs_left >> 10), | 77 | (unsigned long) Bit2KB(rs_left >> 10), |
78 | (unsigned long) Bit2KB(mdev->rs_total >> 10)); | 78 | (unsigned long) Bit2KB(mdev->rs_total >> 10)); |
79 | else | 79 | else |
80 | seq_printf(seq, "(%lu/%lu)K\n\t", | 80 | seq_printf(seq, "(%lu/%lu)K", |
81 | (unsigned long) Bit2KB(rs_left), | 81 | (unsigned long) Bit2KB(rs_left), |
82 | (unsigned long) Bit2KB(mdev->rs_total)); | 82 | (unsigned long) Bit2KB(mdev->rs_total)); |
83 | 83 | ||
84 | if (mdev->state.conn == C_SYNC_TARGET) | ||
85 | seq_printf(seq, " queue_delay: %d.%d ms\n\t", | ||
86 | mdev->data_delay / 1000, | ||
87 | (mdev->data_delay % 1000) / 100); | ||
88 | else if (mdev->state.conn == C_SYNC_SOURCE) | ||
89 | seq_printf(seq, " delay_probe: %u\n\t", mdev->delay_seq); | ||
90 | |||
84 | /* see drivers/md/md.c | 91 | /* see drivers/md/md.c |
85 | * We do not want to overflow, so the order of operands and | 92 | * We do not want to overflow, so the order of operands and |
86 | * the * 100 / 100 trick are important. We do a +1 to be | 93 | * the * 100 / 100 trick are important. We do a +1 to be |
@@ -128,6 +135,14 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq) | |||
128 | else | 135 | else |
129 | seq_printf(seq, " (%ld)", dbdt); | 136 | seq_printf(seq, " (%ld)", dbdt); |
130 | 137 | ||
138 | if (mdev->state.conn == C_SYNC_TARGET) { | ||
139 | if (mdev->c_sync_rate > 1000) | ||
140 | seq_printf(seq, " want: %d,%03d", | ||
141 | mdev->c_sync_rate / 1000, mdev->c_sync_rate % 1000); | ||
142 | else | ||
143 | seq_printf(seq, " want: %d", mdev->c_sync_rate); | ||
144 | } | ||
145 | |||
131 | seq_printf(seq, " K/sec\n"); | 146 | seq_printf(seq, " K/sec\n"); |
132 | } | 147 | } |
133 | 148 | ||
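The /proc/drbd additions print two of the new delay-probe numbers: on a SyncTarget, data_delay (apparently kept in microseconds) is shown as milliseconds with one decimal, and the throttled resync rate c_sync_rate (K/sec) gets a thousands separator once it exceeds 1000. A tiny sketch of both format tricks with made-up values; the unit assumptions follow from how the format strings divide:

    #include <stdio.h>

    int main(void)
    {
        int data_delay  = 12345;     /* usecs, as mdev->data_delay appears to be */
        int c_sync_rate = 250000;    /* K/sec, as mdev->c_sync_rate */

        /* "queue_delay": microseconds rendered as ms with one decimal place */
        printf("queue_delay: %d.%d ms\n",
               data_delay / 1000, (data_delay % 1000) / 100);    /* 12.3 ms */

        /* "want": thousands separator only for large rates */
        if (c_sync_rate > 1000)
            printf("want: %d,%03d K/sec\n",
                   c_sync_rate / 1000, c_sync_rate % 1000);      /* 250,000 K/sec */
        else
            printf("want: %d K/sec\n", c_sync_rate);
        return 0;
    }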
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 3f096e7959b4..bc9ab7fb2cc7 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c | |||
@@ -80,30 +80,128 @@ static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epo | |||
80 | 80 | ||
81 | #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) | 81 | #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) |
82 | 82 | ||
83 | static struct page *drbd_pp_first_page_or_try_alloc(struct drbd_conf *mdev) | 83 | /* |
84 | * some helper functions to deal with single linked page lists, | ||
85 | * page->private being our "next" pointer. | ||
86 | */ | ||
87 | |||
88 | /* If at least n pages are linked at head, get n pages off. | ||
89 | * Otherwise, don't modify head, and return NULL. | ||
90 | * Locking is the responsibility of the caller. | ||
91 | */ | ||
92 | static struct page *page_chain_del(struct page **head, int n) | ||
93 | { | ||
94 | struct page *page; | ||
95 | struct page *tmp; | ||
96 | |||
97 | BUG_ON(!n); | ||
98 | BUG_ON(!head); | ||
99 | |||
100 | page = *head; | ||
101 | |||
102 | if (!page) | ||
103 | return NULL; | ||
104 | |||
105 | while (page) { | ||
106 | tmp = page_chain_next(page); | ||
107 | if (--n == 0) | ||
108 | break; /* found sufficient pages */ | ||
109 | if (tmp == NULL) | ||
110 | /* insufficient pages, don't use any of them. */ | ||
111 | return NULL; | ||
112 | page = tmp; | ||
113 | } | ||
114 | |||
115 | /* add end of list marker for the returned list */ | ||
116 | set_page_private(page, 0); | ||
117 | /* actual return value, and adjustment of head */ | ||
118 | page = *head; | ||
119 | *head = tmp; | ||
120 | return page; | ||
121 | } | ||
122 | |||
123 | /* may be used outside of locks to find the tail of a (usually short) | ||
124 | * "private" page chain, before adding it back to a global chain head | ||
125 | * with page_chain_add() under a spinlock. */ | ||
126 | static struct page *page_chain_tail(struct page *page, int *len) | ||
127 | { | ||
128 | struct page *tmp; | ||
129 | int i = 1; | ||
130 | while ((tmp = page_chain_next(page))) | ||
131 | ++i, page = tmp; | ||
132 | if (len) | ||
133 | *len = i; | ||
134 | return page; | ||
135 | } | ||
136 | |||
137 | static int page_chain_free(struct page *page) | ||
138 | { | ||
139 | struct page *tmp; | ||
140 | int i = 0; | ||
141 | page_chain_for_each_safe(page, tmp) { | ||
142 | put_page(page); | ||
143 | ++i; | ||
144 | } | ||
145 | return i; | ||
146 | } | ||
147 | |||
148 | static void page_chain_add(struct page **head, | ||
149 | struct page *chain_first, struct page *chain_last) | ||
150 | { | ||
151 | #if 1 | ||
152 | struct page *tmp; | ||
153 | tmp = page_chain_tail(chain_first, NULL); | ||
154 | BUG_ON(tmp != chain_last); | ||
155 | #endif | ||
156 | |||
157 | /* add chain to head */ | ||
158 | set_page_private(chain_last, (unsigned long)*head); | ||
159 | *head = chain_first; | ||
160 | } | ||
161 | |||
162 | static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int number) | ||
84 | { | 163 | { |
85 | struct page *page = NULL; | 164 | struct page *page = NULL; |
165 | struct page *tmp = NULL; | ||
166 | int i = 0; | ||
86 | 167 | ||
87 | /* Yes, testing drbd_pp_vacant outside the lock is racy. | 168 | /* Yes, testing drbd_pp_vacant outside the lock is racy. |
88 | * So what. It saves a spin_lock. */ | 169 | * So what. It saves a spin_lock. */ |
89 | if (drbd_pp_vacant > 0) { | 170 | if (drbd_pp_vacant >= number) { |
90 | spin_lock(&drbd_pp_lock); | 171 | spin_lock(&drbd_pp_lock); |
91 | page = drbd_pp_pool; | 172 | page = page_chain_del(&drbd_pp_pool, number); |
92 | if (page) { | 173 | if (page) |
93 | drbd_pp_pool = (struct page *)page_private(page); | 174 | drbd_pp_vacant -= number; |
94 | set_page_private(page, 0); /* just to be polite */ | ||
95 | drbd_pp_vacant--; | ||
96 | } | ||
97 | spin_unlock(&drbd_pp_lock); | 175 | spin_unlock(&drbd_pp_lock); |
176 | if (page) | ||
177 | return page; | ||
98 | } | 178 | } |
179 | |||
99 | /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD | 180 | /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD |
100 | * "criss-cross" setup, that might cause write-out on some other DRBD, | 181 | * "criss-cross" setup, that might cause write-out on some other DRBD, |
101 | * which in turn might block on the other node at this very place. */ | 182 | * which in turn might block on the other node at this very place. */ |
102 | if (!page) | 183 | for (i = 0; i < number; i++) { |
103 | page = alloc_page(GFP_TRY); | 184 | tmp = alloc_page(GFP_TRY); |
104 | if (page) | 185 | if (!tmp) |
105 | atomic_inc(&mdev->pp_in_use); | 186 | break; |
106 | return page; | 187 | set_page_private(tmp, (unsigned long)page); |
188 | page = tmp; | ||
189 | } | ||
190 | |||
191 | if (i == number) | ||
192 | return page; | ||
193 | |||
194 | /* Not enough pages immediately available this time. | ||
195 | * No need to jump around here, drbd_pp_alloc will retry this | ||
196 | * function "soon". */ | ||
197 | if (page) { | ||
198 | tmp = page_chain_tail(page, NULL); | ||
199 | spin_lock(&drbd_pp_lock); | ||
200 | page_chain_add(&drbd_pp_pool, page, tmp); | ||
201 | drbd_pp_vacant += i; | ||
202 | spin_unlock(&drbd_pp_lock); | ||
203 | } | ||
204 | return NULL; | ||
107 | } | 205 | } |
108 | 206 | ||
109 | /* kick lower level device, if we have more than (arbitrary number) | 207 | /* kick lower level device, if we have more than (arbitrary number) |
@@ -127,7 +225,7 @@ static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed | |||
127 | 225 | ||
128 | list_for_each_safe(le, tle, &mdev->net_ee) { | 226 | list_for_each_safe(le, tle, &mdev->net_ee) { |
129 | e = list_entry(le, struct drbd_epoch_entry, w.list); | 227 | e = list_entry(le, struct drbd_epoch_entry, w.list); |
130 | if (drbd_bio_has_active_page(e->private_bio)) | 228 | if (drbd_ee_has_active_page(e)) |
131 | break; | 229 | break; |
132 | list_move(le, to_be_freed); | 230 | list_move(le, to_be_freed); |
133 | } | 231 | } |
@@ -148,32 +246,34 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev) | |||
148 | } | 246 | } |
149 | 247 | ||
150 | /** | 248 | /** |
151 | * drbd_pp_alloc() - Returns a page, fails only if a signal comes in | 249 | * drbd_pp_alloc() - Returns @number pages, retries forever (or until signalled) |
152 | * @mdev: DRBD device. | 250 | * @mdev: DRBD device. |
153 | * @retry: whether or not to retry allocation forever (or until signalled) | 251 | * @number: number of pages requested |
252 | * @retry: whether to retry, if not enough pages are available right now | ||
253 | * | ||
254 | * Tries to allocate number pages, first from our own page pool, then from | ||
255 | * the kernel, unless this allocation would exceed the max_buffers setting. | ||
256 | * Possibly retry until DRBD frees sufficient pages somewhere else. | ||
154 | * | 257 | * |
155 | * Tries to allocate a page, first from our own page pool, then from the | 258 | * Returns a page chain linked via page->private. |
156 | * kernel, unless this allocation would exceed the max_buffers setting. | ||
157 | * If @retry is non-zero, retry until DRBD frees a page somewhere else. | ||
158 | */ | 259 | */ |
159 | static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry) | 260 | static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool retry) |
160 | { | 261 | { |
161 | struct page *page = NULL; | 262 | struct page *page = NULL; |
162 | DEFINE_WAIT(wait); | 263 | DEFINE_WAIT(wait); |
163 | 264 | ||
164 | if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { | 265 | /* Yes, we may run up to @number over max_buffers. If we |
165 | page = drbd_pp_first_page_or_try_alloc(mdev); | 266 | * follow it strictly, the admin will get it wrong anyways. */ |
166 | if (page) | 267 | if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) |
167 | return page; | 268 | page = drbd_pp_first_pages_or_try_alloc(mdev, number); |
168 | } | ||
169 | 269 | ||
170 | for (;;) { | 270 | while (page == NULL) { |
171 | prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); | 271 | prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); |
172 | 272 | ||
173 | drbd_kick_lo_and_reclaim_net(mdev); | 273 | drbd_kick_lo_and_reclaim_net(mdev); |
174 | 274 | ||
175 | if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { | 275 | if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { |
176 | page = drbd_pp_first_page_or_try_alloc(mdev); | 276 | page = drbd_pp_first_pages_or_try_alloc(mdev, number); |
177 | if (page) | 277 | if (page) |
178 | break; | 278 | break; |
179 | } | 279 | } |
@@ -190,62 +290,32 @@ static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry) | |||
190 | } | 290 | } |
191 | finish_wait(&drbd_pp_wait, &wait); | 291 | finish_wait(&drbd_pp_wait, &wait); |
192 | 292 | ||
293 | if (page) | ||
294 | atomic_add(number, &mdev->pp_in_use); | ||
193 | return page; | 295 | return page; |
194 | } | 296 | } |
195 | 297 | ||
196 | /* Must not be used from irq, as that may deadlock: see drbd_pp_alloc. | 298 | /* Must not be used from irq, as that may deadlock: see drbd_pp_alloc. |
197 | * Is also used from inside an other spin_lock_irq(&mdev->req_lock) */ | 299 | * Is also used from inside an other spin_lock_irq(&mdev->req_lock); |
300 | * Either links the page chain back to the global pool, | ||
301 | * or returns all pages to the system. */ | ||
198 | static void drbd_pp_free(struct drbd_conf *mdev, struct page *page) | 302 | static void drbd_pp_free(struct drbd_conf *mdev, struct page *page) |
199 | { | 303 | { |
200 | int free_it; | ||
201 | |||
202 | spin_lock(&drbd_pp_lock); | ||
203 | if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) { | ||
204 | free_it = 1; | ||
205 | } else { | ||
206 | set_page_private(page, (unsigned long)drbd_pp_pool); | ||
207 | drbd_pp_pool = page; | ||
208 | drbd_pp_vacant++; | ||
209 | free_it = 0; | ||
210 | } | ||
211 | spin_unlock(&drbd_pp_lock); | ||
212 | |||
213 | atomic_dec(&mdev->pp_in_use); | ||
214 | |||
215 | if (free_it) | ||
216 | __free_page(page); | ||
217 | |||
218 | wake_up(&drbd_pp_wait); | ||
219 | } | ||
220 | |||
221 | static void drbd_pp_free_bio_pages(struct drbd_conf *mdev, struct bio *bio) | ||
222 | { | ||
223 | struct page *p_to_be_freed = NULL; | ||
224 | struct page *page; | ||
225 | struct bio_vec *bvec; | ||
226 | int i; | 304 | int i; |
227 | 305 | if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) | |
228 | spin_lock(&drbd_pp_lock); | 306 | i = page_chain_free(page); |
229 | __bio_for_each_segment(bvec, bio, i, 0) { | 307 | else { |
230 | if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) { | 308 | struct page *tmp; |
231 | set_page_private(bvec->bv_page, (unsigned long)p_to_be_freed); | 309 | tmp = page_chain_tail(page, &i); |
232 | p_to_be_freed = bvec->bv_page; | 310 | spin_lock(&drbd_pp_lock); |
233 | } else { | 311 | page_chain_add(&drbd_pp_pool, page, tmp); |
234 | set_page_private(bvec->bv_page, (unsigned long)drbd_pp_pool); | 312 | drbd_pp_vacant += i; |
235 | drbd_pp_pool = bvec->bv_page; | 313 | spin_unlock(&drbd_pp_lock); |
236 | drbd_pp_vacant++; | ||
237 | } | ||
238 | } | ||
239 | spin_unlock(&drbd_pp_lock); | ||
240 | atomic_sub(bio->bi_vcnt, &mdev->pp_in_use); | ||
241 | |||
242 | while (p_to_be_freed) { | ||
243 | page = p_to_be_freed; | ||
244 | p_to_be_freed = (struct page *)page_private(page); | ||
245 | set_page_private(page, 0); /* just to be polite */ | ||
246 | put_page(page); | ||
247 | } | 314 | } |
248 | 315 | atomic_sub(i, &mdev->pp_in_use); | |
316 | i = atomic_read(&mdev->pp_in_use); | ||
317 | if (i < 0) | ||
318 | dev_warn(DEV, "ASSERTION FAILED: pp_in_use: %d < 0\n", i); | ||
249 | wake_up(&drbd_pp_wait); | 319 | wake_up(&drbd_pp_wait); |
250 | } | 320 | } |
251 | 321 | ||
@@ -270,11 +340,9 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, | |||
270 | unsigned int data_size, | 340 | unsigned int data_size, |
271 | gfp_t gfp_mask) __must_hold(local) | 341 | gfp_t gfp_mask) __must_hold(local) |
272 | { | 342 | { |
273 | struct request_queue *q; | ||
274 | struct drbd_epoch_entry *e; | 343 | struct drbd_epoch_entry *e; |
275 | struct page *page; | 344 | struct page *page; |
276 | struct bio *bio; | 345 | unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; |
277 | unsigned int ds; | ||
278 | 346 | ||
279 | if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE)) | 347 | if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE)) |
280 | return NULL; | 348 | return NULL; |
@@ -286,84 +354,32 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, | |||
286 | return NULL; | 354 | return NULL; |
287 | } | 355 | } |
288 | 356 | ||
289 | bio = bio_alloc(gfp_mask & ~__GFP_HIGHMEM, div_ceil(data_size, PAGE_SIZE)); | 357 | page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT)); |
290 | if (!bio) { | 358 | if (!page) |
291 | if (!(gfp_mask & __GFP_NOWARN)) | 359 | goto fail; |
292 | dev_err(DEV, "alloc_ee: Allocation of a bio failed\n"); | ||
293 | goto fail1; | ||
294 | } | ||
295 | |||
296 | bio->bi_bdev = mdev->ldev->backing_bdev; | ||
297 | bio->bi_sector = sector; | ||
298 | |||
299 | ds = data_size; | ||
300 | while (ds) { | ||
301 | page = drbd_pp_alloc(mdev, (gfp_mask & __GFP_WAIT)); | ||
302 | if (!page) { | ||
303 | if (!(gfp_mask & __GFP_NOWARN)) | ||
304 | dev_err(DEV, "alloc_ee: Allocation of a page failed\n"); | ||
305 | goto fail2; | ||
306 | } | ||
307 | if (!bio_add_page(bio, page, min_t(int, ds, PAGE_SIZE), 0)) { | ||
308 | drbd_pp_free(mdev, page); | ||
309 | dev_err(DEV, "alloc_ee: bio_add_page(s=%llu," | ||
310 | "data_size=%u,ds=%u) failed\n", | ||
311 | (unsigned long long)sector, data_size, ds); | ||
312 | |||
313 | q = bdev_get_queue(bio->bi_bdev); | ||
314 | if (q->merge_bvec_fn) { | ||
315 | struct bvec_merge_data bvm = { | ||
316 | .bi_bdev = bio->bi_bdev, | ||
317 | .bi_sector = bio->bi_sector, | ||
318 | .bi_size = bio->bi_size, | ||
319 | .bi_rw = bio->bi_rw, | ||
320 | }; | ||
321 | int l = q->merge_bvec_fn(q, &bvm, | ||
322 | &bio->bi_io_vec[bio->bi_vcnt]); | ||
323 | dev_err(DEV, "merge_bvec_fn() = %d\n", l); | ||
324 | } | ||
325 | |||
326 | /* dump more of the bio. */ | ||
327 | dev_err(DEV, "bio->bi_max_vecs = %d\n", bio->bi_max_vecs); | ||
328 | dev_err(DEV, "bio->bi_vcnt = %d\n", bio->bi_vcnt); | ||
329 | dev_err(DEV, "bio->bi_size = %d\n", bio->bi_size); | ||
330 | dev_err(DEV, "bio->bi_phys_segments = %d\n", bio->bi_phys_segments); | ||
331 | |||
332 | goto fail2; | ||
333 | break; | ||
334 | } | ||
335 | ds -= min_t(int, ds, PAGE_SIZE); | ||
336 | } | ||
337 | |||
338 | D_ASSERT(data_size == bio->bi_size); | ||
339 | |||
340 | bio->bi_private = e; | ||
341 | e->mdev = mdev; | ||
342 | e->sector = sector; | ||
343 | e->size = bio->bi_size; | ||
344 | 360 | ||
345 | e->private_bio = bio; | ||
346 | e->block_id = id; | ||
347 | INIT_HLIST_NODE(&e->colision); | 361 | INIT_HLIST_NODE(&e->colision); |
348 | e->epoch = NULL; | 362 | e->epoch = NULL; |
363 | e->mdev = mdev; | ||
364 | e->pages = page; | ||
365 | atomic_set(&e->pending_bios, 0); | ||
366 | e->size = data_size; | ||
349 | e->flags = 0; | 367 | e->flags = 0; |
368 | e->sector = sector; | ||
369 | e->sector = sector; | ||
370 | e->block_id = id; | ||
350 | 371 | ||
351 | return e; | 372 | return e; |
352 | 373 | ||
353 | fail2: | 374 | fail: |
354 | drbd_pp_free_bio_pages(mdev, bio); | ||
355 | bio_put(bio); | ||
356 | fail1: | ||
357 | mempool_free(e, drbd_ee_mempool); | 375 | mempool_free(e, drbd_ee_mempool); |
358 | |||
359 | return NULL; | 376 | return NULL; |
360 | } | 377 | } |
361 | 378 | ||
362 | void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) | 379 | void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) |
363 | { | 380 | { |
364 | struct bio *bio = e->private_bio; | 381 | drbd_pp_free(mdev, e->pages); |
365 | drbd_pp_free_bio_pages(mdev, bio); | 382 | D_ASSERT(atomic_read(&e->pending_bios) == 0); |
366 | bio_put(bio); | ||
367 | D_ASSERT(hlist_unhashed(&e->colision)); | 383 | D_ASSERT(hlist_unhashed(&e->colision)); |
368 | mempool_free(e, drbd_ee_mempool); | 384 | mempool_free(e, drbd_ee_mempool); |
369 | } | 385 | } |
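With this hunk an epoch entry no longer carries a ready-made bio: drbd_alloc_ee() only pulls a page chain from the pool (e->pages) and initialises the pending-bio counter, and the bios are assembled at submit time by drbd_submit_ee() further down. The page count is a plain round-up of the payload size, for example:

    /* nr_pages = (data_size + PAGE_SIZE - 1) >> PAGE_SHIFT, with 4 KiB pages:
     *   data_size = 4096 -> 1 page
     *   data_size = 9216 -> 3 pages (the last one only partially used) */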
@@ -902,7 +918,7 @@ retry: | |||
902 | if (!drbd_send_protocol(mdev)) | 918 | if (!drbd_send_protocol(mdev)) |
903 | return -1; | 919 | return -1; |
904 | drbd_send_sync_param(mdev, &mdev->sync_conf); | 920 | drbd_send_sync_param(mdev, &mdev->sync_conf); |
905 | drbd_send_sizes(mdev, 0); | 921 | drbd_send_sizes(mdev, 0, 0); |
906 | drbd_send_uuids(mdev); | 922 | drbd_send_uuids(mdev); |
907 | drbd_send_state(mdev); | 923 | drbd_send_state(mdev); |
908 | clear_bit(USE_DEGR_WFC_T, &mdev->flags); | 924 | clear_bit(USE_DEGR_WFC_T, &mdev->flags); |
@@ -946,7 +962,8 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d | |||
946 | int rv; | 962 | int rv; |
947 | 963 | ||
948 | if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) { | 964 | if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) { |
949 | rv = blkdev_issue_flush(mdev->ldev->backing_bdev, NULL); | 965 | rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL, |
966 | NULL, BLKDEV_IFL_WAIT); | ||
950 | if (rv) { | 967 | if (rv) { |
951 | dev_err(DEV, "local disk flush failed with status %d\n", rv); | 968 | dev_err(DEV, "local disk flush failed with status %d\n", rv); |
952 | /* would rather check on EOPNOTSUPP, but that is not reliable. | 969 | /* would rather check on EOPNOTSUPP, but that is not reliable. |
@@ -1120,6 +1137,101 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) | |||
1120 | } | 1137 | } |
1121 | 1138 | ||
1122 | /** | 1139 | /** |
1140 | * drbd_submit_ee() | ||
1141 | * @mdev: DRBD device. | ||
1142 | * @e: epoch entry | ||
1143 | * @rw: flag field, see bio->bi_rw | ||
1144 | */ | ||
1145 | /* TODO allocate from our own bio_set. */ | ||
1146 | int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, | ||
1147 | const unsigned rw, const int fault_type) | ||
1148 | { | ||
1149 | struct bio *bios = NULL; | ||
1150 | struct bio *bio; | ||
1151 | struct page *page = e->pages; | ||
1152 | sector_t sector = e->sector; | ||
1153 | unsigned ds = e->size; | ||
1154 | unsigned n_bios = 0; | ||
1155 | unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT; | ||
1156 | |||
1157 | if (atomic_read(&mdev->new_c_uuid)) { | ||
1158 | if (atomic_add_unless(&mdev->new_c_uuid, -1, 1)) { | ||
1159 | drbd_uuid_new_current(mdev); | ||
1160 | drbd_md_sync(mdev); | ||
1161 | |||
1162 | atomic_dec(&mdev->new_c_uuid); | ||
1163 | wake_up(&mdev->misc_wait); | ||
1164 | } | ||
1165 | wait_event(mdev->misc_wait, !atomic_read(&mdev->new_c_uuid)); | ||
1166 | } | ||
1167 | |||
1168 | /* In most cases, we will only need one bio. But in case the lower | ||
1169 | * level restrictions happen to be different at this offset on this | ||
1170 | * side than those of the sending peer, we may need to submit the | ||
1171 | * request in more than one bio. */ | ||
1172 | next_bio: | ||
1173 | bio = bio_alloc(GFP_NOIO, nr_pages); | ||
1174 | if (!bio) { | ||
1175 | dev_err(DEV, "submit_ee: Allocation of a bio failed\n"); | ||
1176 | goto fail; | ||
1177 | } | ||
1178 | /* > e->sector, unless this is the first bio */ | ||
1179 | bio->bi_sector = sector; | ||
1180 | bio->bi_bdev = mdev->ldev->backing_bdev; | ||
1181 | /* we special case some flags in the multi-bio case, see below | ||
1182 | * (BIO_RW_UNPLUG, BIO_RW_BARRIER) */ | ||
1183 | bio->bi_rw = rw; | ||
1184 | bio->bi_private = e; | ||
1185 | bio->bi_end_io = drbd_endio_sec; | ||
1186 | |||
1187 | bio->bi_next = bios; | ||
1188 | bios = bio; | ||
1189 | ++n_bios; | ||
1190 | |||
1191 | page_chain_for_each(page) { | ||
1192 | unsigned len = min_t(unsigned, ds, PAGE_SIZE); | ||
1193 | if (!bio_add_page(bio, page, len, 0)) { | ||
1194 | /* a single page must always be possible! */ | ||
1195 | BUG_ON(bio->bi_vcnt == 0); | ||
1196 | goto next_bio; | ||
1197 | } | ||
1198 | ds -= len; | ||
1199 | sector += len >> 9; | ||
1200 | --nr_pages; | ||
1201 | } | ||
1202 | D_ASSERT(page == NULL); | ||
1203 | D_ASSERT(ds == 0); | ||
1204 | |||
1205 | atomic_set(&e->pending_bios, n_bios); | ||
1206 | do { | ||
1207 | bio = bios; | ||
1208 | bios = bios->bi_next; | ||
1209 | bio->bi_next = NULL; | ||
1210 | |||
1211 | /* strip off BIO_RW_UNPLUG unless it is the last bio */ | ||
1212 | if (bios) | ||
1213 | bio->bi_rw &= ~(1<<BIO_RW_UNPLUG); | ||
1214 | |||
1215 | drbd_generic_make_request(mdev, fault_type, bio); | ||
1216 | |||
1217 | /* strip off BIO_RW_BARRIER, | ||
1218 | * unless it is the first or last bio */ | ||
1219 | if (bios && bios->bi_next) | ||
1220 | bios->bi_rw &= ~(1<<BIO_RW_BARRIER); | ||
1221 | } while (bios); | ||
1222 | maybe_kick_lo(mdev); | ||
1223 | return 0; | ||
1224 | |||
1225 | fail: | ||
1226 | while (bios) { | ||
1227 | bio = bios; | ||
1228 | bios = bios->bi_next; | ||
1229 | bio_put(bio); | ||
1230 | } | ||
1231 | return -ENOMEM; | ||
1232 | } | ||
1233 | |||
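drbd_submit_ee() has a single failure mode, bio_alloc() returning NULL, and in that case it unwinds completely before anything has been handed to the block layer. A caller therefore only has to choose between retrying later and giving up, and both patterns appear in the hunks below (w_e_reissue requeues itself, recv_resync_read frees the entry and fails). A caller-side sketch of the retry variant:

    /* sketch, mirroring w_e_reissue below: -ENOMEM is the only error and
     * nothing was submitted, so requeueing the work item is always safe */
    if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_DT_WR) != 0) {
            e->w.cb = w_e_reissue;
            drbd_queue_work(&mdev->data.work, &e->w);
    }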
1234 | /** | ||
1123 | * w_e_reissue() - Worker callback; Resubmit a bio, without BIO_RW_BARRIER set | 1235 | * w_e_reissue() - Worker callback; Resubmit a bio, without BIO_RW_BARRIER set |
1124 | * @mdev: DRBD device. | 1236 | * @mdev: DRBD device. |
1125 | * @w: work object. | 1237 | * @w: work object. |
@@ -1128,8 +1240,6 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) | |||
1128 | int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local) | 1240 | int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local) |
1129 | { | 1241 | { |
1130 | struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; | 1242 | struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; |
1131 | struct bio *bio = e->private_bio; | ||
1132 | |||
1133 | /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place, | 1243 | /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place, |
1134 | (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch) | 1244 | (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch) |
1135 | so that we can finish that epoch in drbd_may_finish_epoch(). | 1245 | so that we can finish that epoch in drbd_may_finish_epoch(). |
@@ -1143,33 +1253,17 @@ int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __relea | |||
1143 | if (previous_epoch(mdev, e->epoch)) | 1253 | if (previous_epoch(mdev, e->epoch)) |
1144 | dev_warn(DEV, "Write ordering was not enforced (one time event)\n"); | 1254 | dev_warn(DEV, "Write ordering was not enforced (one time event)\n"); |
1145 | 1255 | ||
1146 | /* prepare bio for re-submit, | ||
1147 | * re-init volatile members */ | ||
1148 | /* we still have a local reference, | 1256 | /* we still have a local reference, |
1149 | * get_ldev was done in receive_Data. */ | 1257 | * get_ldev was done in receive_Data. */ |
1150 | bio->bi_bdev = mdev->ldev->backing_bdev; | ||
1151 | bio->bi_sector = e->sector; | ||
1152 | bio->bi_size = e->size; | ||
1153 | bio->bi_idx = 0; | ||
1154 | |||
1155 | bio->bi_flags &= ~(BIO_POOL_MASK - 1); | ||
1156 | bio->bi_flags |= 1 << BIO_UPTODATE; | ||
1157 | |||
1158 | /* don't know whether this is necessary: */ | ||
1159 | bio->bi_phys_segments = 0; | ||
1160 | bio->bi_next = NULL; | ||
1161 | |||
1162 | /* these should be unchanged: */ | ||
1163 | /* bio->bi_end_io = drbd_endio_write_sec; */ | ||
1164 | /* bio->bi_vcnt = whatever; */ | ||
1165 | 1258 | ||
1166 | e->w.cb = e_end_block; | 1259 | e->w.cb = e_end_block; |
1167 | 1260 | if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_DT_WR) != 0) { | |
1168 | /* This is no longer a barrier request. */ | 1261 | /* drbd_submit_ee fails for one reason only: |
1169 | bio->bi_rw &= ~(1UL << BIO_RW_BARRIER); | 1262 | * it was not able to allocate sufficient bios. |
1170 | 1263 | * requeue, try again later. */ | |
1171 | drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, bio); | 1264 | e->w.cb = w_e_reissue; |
1172 | 1265 | drbd_queue_work(&mdev->data.work, &e->w); | |
1266 | } | ||
1173 | return 1; | 1267 | return 1; |
1174 | } | 1268 | } |
1175 | 1269 | ||
@@ -1261,13 +1355,13 @@ static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h) | |||
1261 | static struct drbd_epoch_entry * | 1355 | static struct drbd_epoch_entry * |
1262 | read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local) | 1356 | read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local) |
1263 | { | 1357 | { |
1358 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); | ||
1264 | struct drbd_epoch_entry *e; | 1359 | struct drbd_epoch_entry *e; |
1265 | struct bio_vec *bvec; | ||
1266 | struct page *page; | 1360 | struct page *page; |
1267 | struct bio *bio; | 1361 | int dgs, ds, rr; |
1268 | int dgs, ds, i, rr; | ||
1269 | void *dig_in = mdev->int_dig_in; | 1362 | void *dig_in = mdev->int_dig_in; |
1270 | void *dig_vv = mdev->int_dig_vv; | 1363 | void *dig_vv = mdev->int_dig_vv; |
1364 | unsigned long *data; | ||
1271 | 1365 | ||
1272 | dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? | 1366 | dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? |
1273 | crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; | 1367 | crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; |
@@ -1286,29 +1380,44 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __ | |||
1286 | ERR_IF(data_size & 0x1ff) return NULL; | 1380 | ERR_IF(data_size & 0x1ff) return NULL; |
1287 | ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return NULL; | 1381 | ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return NULL; |
1288 | 1382 | ||
1383 | /* even though we trust our peer, ||
1384 | * we sometimes have to double check. */ | ||
1385 | if (sector + (data_size>>9) > capacity) { | ||
1386 | dev_err(DEV, "capacity: %llus < sector: %llus + size: %u\n", | ||
1387 | (unsigned long long)capacity, | ||
1388 | (unsigned long long)sector, data_size); | ||
1389 | return NULL; | ||
1390 | } | ||
1391 | |||
1289 | /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD | 1392 | /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD |
1290 | * "criss-cross" setup, that might cause write-out on some other DRBD, | 1393 | * "criss-cross" setup, that might cause write-out on some other DRBD, |
1291 | * which in turn might block on the other node at this very place. */ | 1394 | * which in turn might block on the other node at this very place. */ |
1292 | e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO); | 1395 | e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO); |
1293 | if (!e) | 1396 | if (!e) |
1294 | return NULL; | 1397 | return NULL; |
1295 | bio = e->private_bio; | 1398 | |
1296 | ds = data_size; | 1399 | ds = data_size; |
1297 | bio_for_each_segment(bvec, bio, i) { | 1400 | page = e->pages; |
1298 | page = bvec->bv_page; | 1401 | page_chain_for_each(page) { |
1299 | rr = drbd_recv(mdev, kmap(page), min_t(int, ds, PAGE_SIZE)); | 1402 | unsigned len = min_t(int, ds, PAGE_SIZE); |
1403 | data = kmap(page); | ||
1404 | rr = drbd_recv(mdev, data, len); | ||
1405 | if (FAULT_ACTIVE(mdev, DRBD_FAULT_RECEIVE)) { | ||
1406 | dev_err(DEV, "Fault injection: Corrupting data on receive\n"); | ||
1407 | data[0] = data[0] ^ (unsigned long)-1; | ||
1408 | } | ||
1300 | kunmap(page); | 1409 | kunmap(page); |
1301 | if (rr != min_t(int, ds, PAGE_SIZE)) { | 1410 | if (rr != len) { |
1302 | drbd_free_ee(mdev, e); | 1411 | drbd_free_ee(mdev, e); |
1303 | dev_warn(DEV, "short read receiving data: read %d expected %d\n", | 1412 | dev_warn(DEV, "short read receiving data: read %d expected %d\n", |
1304 | rr, min_t(int, ds, PAGE_SIZE)); | 1413 | rr, len); |
1305 | return NULL; | 1414 | return NULL; |
1306 | } | 1415 | } |
1307 | ds -= rr; | 1416 | ds -= rr; |
1308 | } | 1417 | } |
1309 | 1418 | ||
1310 | if (dgs) { | 1419 | if (dgs) { |
1311 | drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv); | 1420 | drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv); |
1312 | if (memcmp(dig_in, dig_vv, dgs)) { | 1421 | if (memcmp(dig_in, dig_vv, dgs)) { |
1313 | dev_err(DEV, "Digest integrity check FAILED.\n"); | 1422 | dev_err(DEV, "Digest integrity check FAILED.\n"); |
1314 | drbd_bcast_ee(mdev, "digest failed", | 1423 | drbd_bcast_ee(mdev, "digest failed", |
@@ -1330,7 +1439,10 @@ static int drbd_drain_block(struct drbd_conf *mdev, int data_size) | |||
1330 | int rr, rv = 1; | 1439 | int rr, rv = 1; |
1331 | void *data; | 1440 | void *data; |
1332 | 1441 | ||
1333 | page = drbd_pp_alloc(mdev, 1); | 1442 | if (!data_size) |
1443 | return TRUE; | ||
1444 | |||
1445 | page = drbd_pp_alloc(mdev, 1, 1); | ||
1334 | 1446 | ||
1335 | data = kmap(page); | 1447 | data = kmap(page); |
1336 | while (data_size) { | 1448 | while (data_size) { |
@@ -1394,7 +1506,7 @@ static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, | |||
1394 | } | 1506 | } |
1395 | 1507 | ||
1396 | if (dgs) { | 1508 | if (dgs) { |
1397 | drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv); | 1509 | drbd_csum_bio(mdev, mdev->integrity_r_tfm, bio, dig_vv); |
1398 | if (memcmp(dig_in, dig_vv, dgs)) { | 1510 | if (memcmp(dig_in, dig_vv, dgs)) { |
1399 | dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n"); | 1511 | dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n"); |
1400 | return 0; | 1512 | return 0; |
@@ -1415,7 +1527,7 @@ static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int u | |||
1415 | 1527 | ||
1416 | D_ASSERT(hlist_unhashed(&e->colision)); | 1528 | D_ASSERT(hlist_unhashed(&e->colision)); |
1417 | 1529 | ||
1418 | if (likely(drbd_bio_uptodate(e->private_bio))) { | 1530 | if (likely((e->flags & EE_WAS_ERROR) == 0)) { |
1419 | drbd_set_in_sync(mdev, sector, e->size); | 1531 | drbd_set_in_sync(mdev, sector, e->size); |
1420 | ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e); | 1532 | ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e); |
1421 | } else { | 1533 | } else { |
@@ -1434,30 +1546,28 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si | |||
1434 | struct drbd_epoch_entry *e; | 1546 | struct drbd_epoch_entry *e; |
1435 | 1547 | ||
1436 | e = read_in_block(mdev, ID_SYNCER, sector, data_size); | 1548 | e = read_in_block(mdev, ID_SYNCER, sector, data_size); |
1437 | if (!e) { | 1549 | if (!e) |
1438 | put_ldev(mdev); | 1550 | goto fail; |
1439 | return FALSE; | ||
1440 | } | ||
1441 | 1551 | ||
1442 | dec_rs_pending(mdev); | 1552 | dec_rs_pending(mdev); |
1443 | 1553 | ||
1444 | e->private_bio->bi_end_io = drbd_endio_write_sec; | ||
1445 | e->private_bio->bi_rw = WRITE; | ||
1446 | e->w.cb = e_end_resync_block; | ||
1447 | |||
1448 | inc_unacked(mdev); | 1554 | inc_unacked(mdev); |
1449 | /* corresponding dec_unacked() in e_end_resync_block() | 1555 | /* corresponding dec_unacked() in e_end_resync_block() |
1450 | * respective _drbd_clear_done_ee */ | 1556 | * respective _drbd_clear_done_ee */ |
1451 | 1557 | ||
1558 | e->w.cb = e_end_resync_block; | ||
1559 | |||
1452 | spin_lock_irq(&mdev->req_lock); | 1560 | spin_lock_irq(&mdev->req_lock); |
1453 | list_add(&e->w.list, &mdev->sync_ee); | 1561 | list_add(&e->w.list, &mdev->sync_ee); |
1454 | spin_unlock_irq(&mdev->req_lock); | 1562 | spin_unlock_irq(&mdev->req_lock); |
1455 | 1563 | ||
1456 | drbd_generic_make_request(mdev, DRBD_FAULT_RS_WR, e->private_bio); | 1564 | if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0) |
1457 | /* accounting done in endio */ | 1565 | return TRUE; |
1458 | 1566 | ||
1459 | maybe_kick_lo(mdev); | 1567 | drbd_free_ee(mdev, e); |
1460 | return TRUE; | 1568 | fail: |
1569 | put_ldev(mdev); | ||
1570 | return FALSE; | ||
1461 | } | 1571 | } |
1462 | 1572 | ||
1463 | static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h) | 1573 | static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h) |
@@ -1552,7 +1662,7 @@ static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | |||
1552 | } | 1662 | } |
1553 | 1663 | ||
1554 | if (mdev->net_conf->wire_protocol == DRBD_PROT_C) { | 1664 | if (mdev->net_conf->wire_protocol == DRBD_PROT_C) { |
1555 | if (likely(drbd_bio_uptodate(e->private_bio))) { | 1665 | if (likely((e->flags & EE_WAS_ERROR) == 0)) { |
1556 | pcmd = (mdev->state.conn >= C_SYNC_SOURCE && | 1666 | pcmd = (mdev->state.conn >= C_SYNC_SOURCE && |
1557 | mdev->state.conn <= C_PAUSED_SYNC_T && | 1667 | mdev->state.conn <= C_PAUSED_SYNC_T && |
1558 | e->flags & EE_MAY_SET_IN_SYNC) ? | 1668 | e->flags & EE_MAY_SET_IN_SYNC) ? |
@@ -1698,7 +1808,6 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h) | |||
1698 | return FALSE; | 1808 | return FALSE; |
1699 | } | 1809 | } |
1700 | 1810 | ||
1701 | e->private_bio->bi_end_io = drbd_endio_write_sec; | ||
1702 | e->w.cb = e_end_block; | 1811 | e->w.cb = e_end_block; |
1703 | 1812 | ||
1704 | spin_lock(&mdev->epoch_lock); | 1813 | spin_lock(&mdev->epoch_lock); |
@@ -1894,12 +2003,8 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h) | |||
1894 | drbd_al_begin_io(mdev, e->sector); | 2003 | drbd_al_begin_io(mdev, e->sector); |
1895 | } | 2004 | } |
1896 | 2005 | ||
1897 | e->private_bio->bi_rw = rw; | 2006 | if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0) |
1898 | drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, e->private_bio); | 2007 | return TRUE; |
1899 | /* accounting done in endio */ | ||
1900 | |||
1901 | maybe_kick_lo(mdev); | ||
1902 | return TRUE; | ||
1903 | 2008 | ||
1904 | out_interrupted: | 2009 | out_interrupted: |
1905 | /* yes, the epoch_size now is imbalanced. | 2010 | /* yes, the epoch_size now is imbalanced. |
@@ -1945,7 +2050,7 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) | |||
1945 | "no local data.\n"); | 2050 | "no local data.\n"); |
1946 | drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY : | 2051 | drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY : |
1947 | P_NEG_RS_DREPLY , p); | 2052 | P_NEG_RS_DREPLY , p); |
1948 | return TRUE; | 2053 | return drbd_drain_block(mdev, h->length - brps); |
1949 | } | 2054 | } |
1950 | 2055 | ||
1951 | /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD | 2056 | /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD |
@@ -1957,9 +2062,6 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) | |||
1957 | return FALSE; | 2062 | return FALSE; |
1958 | } | 2063 | } |
1959 | 2064 | ||
1960 | e->private_bio->bi_rw = READ; | ||
1961 | e->private_bio->bi_end_io = drbd_endio_read_sec; | ||
1962 | |||
1963 | switch (h->command) { | 2065 | switch (h->command) { |
1964 | case P_DATA_REQUEST: | 2066 | case P_DATA_REQUEST: |
1965 | e->w.cb = w_e_end_data_req; | 2067 | e->w.cb = w_e_end_data_req; |
@@ -2053,10 +2155,8 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) | |||
2053 | 2155 | ||
2054 | inc_unacked(mdev); | 2156 | inc_unacked(mdev); |
2055 | 2157 | ||
2056 | drbd_generic_make_request(mdev, fault_type, e->private_bio); | 2158 | if (drbd_submit_ee(mdev, e, READ, fault_type) == 0) |
2057 | maybe_kick_lo(mdev); | 2159 | return TRUE; |
2058 | |||
2059 | return TRUE; | ||
2060 | 2160 | ||
2061 | out_free_e: | 2161 | out_free_e: |
2062 | kfree(di); | 2162 | kfree(di); |
@@ -2473,6 +2573,9 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol | |||
2473 | hg > 0 ? "source" : "target"); | 2573 | hg > 0 ? "source" : "target"); |
2474 | } | 2574 | } |
2475 | 2575 | ||
2576 | if (abs(hg) == 100) | ||
2577 | drbd_khelper(mdev, "initial-split-brain"); | ||
2578 | |||
2476 | if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) { | 2579 | if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) { |
2477 | int pcount = (mdev->state.role == R_PRIMARY) | 2580 | int pcount = (mdev->state.role == R_PRIMARY) |
2478 | + (peer_role == R_PRIMARY); | 2581 | + (peer_role == R_PRIMARY); |
@@ -2518,7 +2621,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol | |||
2518 | * after an attempted attach on a diskless node. | 2621 | * after an attempted attach on a diskless node. |
2519 | * We just refuse to attach -- well, we drop the "connection" | 2622 | * We just refuse to attach -- well, we drop the "connection" |
2520 | * to that disk, in a way... */ | 2623 | * to that disk, in a way... */ |
2521 | dev_alert(DEV, "Split-Brain detected, dropping connection!\n"); | 2624 | dev_alert(DEV, "Split-Brain detected but unresolved, dropping connection!\n"); |
2522 | drbd_khelper(mdev, "split-brain"); | 2625 | drbd_khelper(mdev, "split-brain"); |
2523 | return C_MASK; | 2626 | return C_MASK; |
2524 | } | 2627 | } |
@@ -2849,7 +2952,7 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) | |||
2849 | unsigned int max_seg_s; | 2952 | unsigned int max_seg_s; |
2850 | sector_t p_size, p_usize, my_usize; | 2953 | sector_t p_size, p_usize, my_usize; |
2851 | int ldsc = 0; /* local disk size changed */ | 2954 | int ldsc = 0; /* local disk size changed */ |
2852 | enum drbd_conns nconn; | 2955 | enum dds_flags ddsf; |
2853 | 2956 | ||
2854 | ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; | 2957 | ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; |
2855 | if (drbd_recv(mdev, h->payload, h->length) != h->length) | 2958 | if (drbd_recv(mdev, h->payload, h->length) != h->length) |
@@ -2905,8 +3008,9 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) | |||
2905 | } | 3008 | } |
2906 | #undef min_not_zero | 3009 | #undef min_not_zero |
2907 | 3010 | ||
3011 | ddsf = be16_to_cpu(p->dds_flags); | ||
2908 | if (get_ldev(mdev)) { | 3012 | if (get_ldev(mdev)) { |
2909 | dd = drbd_determin_dev_size(mdev, 0); | 3013 | dd = drbd_determin_dev_size(mdev, ddsf); |
2910 | put_ldev(mdev); | 3014 | put_ldev(mdev); |
2911 | if (dd == dev_size_error) | 3015 | if (dd == dev_size_error) |
2912 | return FALSE; | 3016 | return FALSE; |
@@ -2916,33 +3020,21 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) | |||
2916 | drbd_set_my_capacity(mdev, p_size); | 3020 | drbd_set_my_capacity(mdev, p_size); |
2917 | } | 3021 | } |
2918 | 3022 | ||
2919 | if (mdev->p_uuid && mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) { | ||
2920 | nconn = drbd_sync_handshake(mdev, | ||
2921 | mdev->state.peer, mdev->state.pdsk); | ||
2922 | put_ldev(mdev); | ||
2923 | |||
2924 | if (nconn == C_MASK) { | ||
2925 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
2926 | return FALSE; | ||
2927 | } | ||
2928 | |||
2929 | if (drbd_request_state(mdev, NS(conn, nconn)) < SS_SUCCESS) { | ||
2930 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
2931 | return FALSE; | ||
2932 | } | ||
2933 | } | ||
2934 | |||
2935 | if (get_ldev(mdev)) { | 3023 | if (get_ldev(mdev)) { |
2936 | if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { | 3024 | if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { |
2937 | mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); | 3025 | mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); |
2938 | ldsc = 1; | 3026 | ldsc = 1; |
2939 | } | 3027 | } |
2940 | 3028 | ||
2941 | max_seg_s = be32_to_cpu(p->max_segment_size); | 3029 | if (mdev->agreed_pro_version < 94) |
3030 | max_seg_s = be32_to_cpu(p->max_segment_size); | ||
3031 | else /* drbd 8.3.8 onwards */ | ||
3032 | max_seg_s = DRBD_MAX_SEGMENT_SIZE; | ||
3033 | |||
2942 | if (max_seg_s != queue_max_segment_size(mdev->rq_queue)) | 3034 | if (max_seg_s != queue_max_segment_size(mdev->rq_queue)) |
2943 | drbd_setup_queue_param(mdev, max_seg_s); | 3035 | drbd_setup_queue_param(mdev, max_seg_s); |
2944 | 3036 | ||
2945 | drbd_setup_order_type(mdev, be32_to_cpu(p->queue_order_type)); | 3037 | drbd_setup_order_type(mdev, be16_to_cpu(p->queue_order_type)); |
2946 | put_ldev(mdev); | 3038 | put_ldev(mdev); |
2947 | } | 3039 | } |
2948 | 3040 | ||
@@ -2951,14 +3043,17 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) | |||
2951 | drbd_get_capacity(mdev->this_bdev) || ldsc) { | 3043 | drbd_get_capacity(mdev->this_bdev) || ldsc) { |
2952 | /* we have different sizes, probably peer | 3044 | /* we have different sizes, probably peer |
2953 | * needs to know my new size... */ | 3045 | * needs to know my new size... */ |
2954 | drbd_send_sizes(mdev, 0); | 3046 | drbd_send_sizes(mdev, 0, ddsf); |
2955 | } | 3047 | } |
2956 | if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) || | 3048 | if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) || |
2957 | (dd == grew && mdev->state.conn == C_CONNECTED)) { | 3049 | (dd == grew && mdev->state.conn == C_CONNECTED)) { |
2958 | if (mdev->state.pdsk >= D_INCONSISTENT && | 3050 | if (mdev->state.pdsk >= D_INCONSISTENT && |
2959 | mdev->state.disk >= D_INCONSISTENT) | 3051 | mdev->state.disk >= D_INCONSISTENT) { |
2960 | resync_after_online_grow(mdev); | 3052 | if (ddsf & DDSF_NO_RESYNC) |
2961 | else | 3053 | dev_info(DEV, "Resync of new storage suppressed with --assume-clean\n"); |
3054 | else | ||
3055 | resync_after_online_grow(mdev); | ||
3056 | } else | ||
2962 | set_bit(RESYNC_AFTER_NEG, &mdev->flags); | 3057 | set_bit(RESYNC_AFTER_NEG, &mdev->flags); |
2963 | } | 3058 | } |
2964 | } | 3059 | } |
@@ -3490,6 +3585,92 @@ static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h) | |||
3490 | return TRUE; | 3585 | return TRUE; |
3491 | } | 3586 | } |
3492 | 3587 | ||
3588 | static void timeval_sub_us(struct timeval* tv, unsigned int us) | ||
3589 | { | ||
3590 | tv->tv_sec -= us / 1000000; | ||
3591 | us = us % 1000000; | ||
3592 | if (tv->tv_usec > us) { | ||
3593 | tv->tv_usec += 1000000; | ||
3594 | tv->tv_sec--; | ||
3595 | } | ||
3596 | tv->tv_usec -= us; | ||
3597 | } | ||
3598 | |||
3599 | static void got_delay_probe(struct drbd_conf *mdev, int from, struct p_delay_probe *p) | ||
3600 | { | ||
3601 | struct delay_probe *dp; | ||
3602 | struct list_head *le; | ||
3603 | struct timeval now; | ||
3604 | int seq_num; | ||
3605 | int offset; | ||
3606 | int data_delay; | ||
3607 | |||
3608 | seq_num = be32_to_cpu(p->seq_num); | ||
3609 | offset = be32_to_cpu(p->offset); | ||
3610 | |||
3611 | spin_lock(&mdev->peer_seq_lock); | ||
3612 | if (!list_empty(&mdev->delay_probes)) { | ||
3613 | if (from == USE_DATA_SOCKET) | ||
3614 | le = mdev->delay_probes.next; | ||
3615 | else | ||
3616 | le = mdev->delay_probes.prev; | ||
3617 | |||
3618 | dp = list_entry(le, struct delay_probe, list); | ||
3619 | |||
3620 | if (dp->seq_num == seq_num) { | ||
3621 | list_del(le); | ||
3622 | spin_unlock(&mdev->peer_seq_lock); | ||
3623 | do_gettimeofday(&now); | ||
3624 | timeval_sub_us(&now, offset); | ||
3625 | data_delay = | ||
3626 | now.tv_usec - dp->time.tv_usec + | ||
3627 | (now.tv_sec - dp->time.tv_sec) * 1000000; | ||
3628 | |||
3629 | if (data_delay > 0) | ||
3630 | mdev->data_delay = data_delay; | ||
3631 | |||
3632 | kfree(dp); | ||
3633 | return; | ||
3634 | } | ||
3635 | |||
3636 | if (dp->seq_num > seq_num) { | ||
3637 | spin_unlock(&mdev->peer_seq_lock); | ||
3638 | dev_warn(DEV, "Previous allocation failure of struct delay_probe?\n"); | ||
3639 | return; /* Do not allocate a struct delay_probe.... */ ||
3640 | } | ||
3641 | } | ||
3642 | spin_unlock(&mdev->peer_seq_lock); | ||
3643 | |||
3644 | dp = kmalloc(sizeof(struct delay_probe), GFP_NOIO); | ||
3645 | if (!dp) { | ||
3646 | dev_warn(DEV, "Failed to allocate a struct delay_probe, do not worry.\n"); | ||
3647 | return; | ||
3648 | } | ||
3649 | |||
3650 | dp->seq_num = seq_num; | ||
3651 | do_gettimeofday(&dp->time); | ||
3652 | timeval_sub_us(&dp->time, offset); | ||
3653 | |||
3654 | spin_lock(&mdev->peer_seq_lock); | ||
3655 | if (from == USE_DATA_SOCKET) | ||
3656 | list_add(&dp->list, &mdev->delay_probes); | ||
3657 | else | ||
3658 | list_add_tail(&dp->list, &mdev->delay_probes); | ||
3659 | spin_unlock(&mdev->peer_seq_lock); | ||
3660 | } | ||
3661 | |||
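got_delay_probe() pairs probes by sequence number: the first probe of a pair to arrive, on either socket, is stamped (already corrected by its offset) and parked on the delay_probes list; when its partner shows up, the difference of the two corrected arrival times becomes mdev->data_delay (if positive). Presumably the probe sent on the data socket has to queue behind bulk resync and application data while its twin on the meta socket does not, so the difference approximates the data-socket backlog that the resync-rate throttle further down in drbd_worker.c reacts to. A worked example under that reading, with made-up times:

    /* meta-socket probe, seq 7: arrives at t = 100 000 us, offset = 0
     * data-socket probe, seq 7: arrives at t = 130 000 us, offset = 5 000 us
     * data_delay = (130 000 - 5 000) - 100 000 = 25 000 us, i.e. ~25 ms of queueing */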
3662 | static int receive_delay_probe(struct drbd_conf *mdev, struct p_header *h) | ||
3663 | { | ||
3664 | struct p_delay_probe *p = (struct p_delay_probe *)h; | ||
3665 | |||
3666 | ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; | ||
3667 | if (drbd_recv(mdev, h->payload, h->length) != h->length) | ||
3668 | return FALSE; | ||
3669 | |||
3670 | got_delay_probe(mdev, USE_DATA_SOCKET, p); | ||
3671 | return TRUE; | ||
3672 | } | ||
3673 | |||
3493 | typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *); | 3674 | typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *); |
3494 | 3675 | ||
3495 | static drbd_cmd_handler_f drbd_default_handler[] = { | 3676 | static drbd_cmd_handler_f drbd_default_handler[] = { |
@@ -3513,6 +3694,7 @@ static drbd_cmd_handler_f drbd_default_handler[] = { | |||
3513 | [P_OV_REQUEST] = receive_DataRequest, | 3694 | [P_OV_REQUEST] = receive_DataRequest, |
3514 | [P_OV_REPLY] = receive_DataRequest, | 3695 | [P_OV_REPLY] = receive_DataRequest, |
3515 | [P_CSUM_RS_REQUEST] = receive_DataRequest, | 3696 | [P_CSUM_RS_REQUEST] = receive_DataRequest, |
3697 | [P_DELAY_PROBE] = receive_delay_probe, | ||
3516 | /* anything missing from this table is in | 3698 | /* anything missing from this table is in |
3517 | * the asender_tbl, see get_asender_cmd */ | 3699 | * the asender_tbl, see get_asender_cmd */ |
3518 | [P_MAX_CMD] = NULL, | 3700 | [P_MAX_CMD] = NULL, |
@@ -3739,7 +3921,7 @@ static void drbd_disconnect(struct drbd_conf *mdev) | |||
3739 | dev_info(DEV, "net_ee not empty, killed %u entries\n", i); | 3921 | dev_info(DEV, "net_ee not empty, killed %u entries\n", i); |
3740 | i = atomic_read(&mdev->pp_in_use); | 3922 | i = atomic_read(&mdev->pp_in_use); |
3741 | if (i) | 3923 | if (i) |
3742 | dev_info(DEV, "pp_in_use = %u, expected 0\n", i); | 3924 | dev_info(DEV, "pp_in_use = %d, expected 0\n", i); |
3743 | 3925 | ||
3744 | D_ASSERT(list_empty(&mdev->read_ee)); | 3926 | D_ASSERT(list_empty(&mdev->read_ee)); |
3745 | D_ASSERT(list_empty(&mdev->active_ee)); | 3927 | D_ASSERT(list_empty(&mdev->active_ee)); |
@@ -4232,7 +4414,6 @@ static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h) | |||
4232 | 4414 | ||
4233 | sector = be64_to_cpu(p->sector); | 4415 | sector = be64_to_cpu(p->sector); |
4234 | size = be32_to_cpu(p->blksize); | 4416 | size = be32_to_cpu(p->blksize); |
4235 | D_ASSERT(p->block_id == ID_SYNCER); | ||
4236 | 4417 | ||
4237 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | 4418 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); |
4238 | 4419 | ||
@@ -4290,6 +4471,14 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header *h) | |||
4290 | return TRUE; | 4471 | return TRUE; |
4291 | } | 4472 | } |
4292 | 4473 | ||
4474 | static int got_delay_probe_m(struct drbd_conf *mdev, struct p_header *h) | ||
4475 | { | ||
4476 | struct p_delay_probe *p = (struct p_delay_probe *)h; | ||
4477 | |||
4478 | got_delay_probe(mdev, USE_META_SOCKET, p); | ||
4479 | return TRUE; | ||
4480 | } | ||
4481 | |||
4293 | struct asender_cmd { | 4482 | struct asender_cmd { |
4294 | size_t pkt_size; | 4483 | size_t pkt_size; |
4295 | int (*process)(struct drbd_conf *mdev, struct p_header *h); | 4484 | int (*process)(struct drbd_conf *mdev, struct p_header *h); |
@@ -4314,6 +4503,7 @@ static struct asender_cmd *get_asender_cmd(int cmd) | |||
4314 | [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, | 4503 | [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, |
4315 | [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, | 4504 | [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, |
4316 | [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, | 4505 | [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, |
4506 | [P_DELAY_PROBE] = { sizeof(struct p_delay_probe), got_delay_probe_m }, | ||
4317 | [P_MAX_CMD] = { 0, NULL }, | 4507 | [P_MAX_CMD] = { 0, NULL }, |
4318 | }; | 4508 | }; |
4319 | if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL) | 4509 | if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL) |
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index de81ab7b4627..3397f11d0ba9 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c | |||
@@ -722,6 +722,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio) | |||
722 | struct drbd_request *req; | 722 | struct drbd_request *req; |
723 | int local, remote; | 723 | int local, remote; |
724 | int err = -EIO; | 724 | int err = -EIO; |
725 | int ret = 0; | ||
725 | 726 | ||
726 | /* allocate outside of all locks; */ | 727 | /* allocate outside of all locks; */ |
727 | req = drbd_req_new(mdev, bio); | 728 | req = drbd_req_new(mdev, bio); |
@@ -784,7 +785,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio) | |||
784 | (mdev->state.pdsk == D_INCONSISTENT && | 785 | (mdev->state.pdsk == D_INCONSISTENT && |
785 | mdev->state.conn >= C_CONNECTED)); | 786 | mdev->state.conn >= C_CONNECTED)); |
786 | 787 | ||
787 | if (!(local || remote)) { | 788 | if (!(local || remote) && !mdev->state.susp) { |
788 | dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); | 789 | dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); |
789 | goto fail_free_complete; | 790 | goto fail_free_complete; |
790 | } | 791 | } |
@@ -810,6 +811,16 @@ allocate_barrier: | |||
810 | /* GOOD, everything prepared, grab the spin_lock */ | 811 | /* GOOD, everything prepared, grab the spin_lock */ |
811 | spin_lock_irq(&mdev->req_lock); | 812 | spin_lock_irq(&mdev->req_lock); |
812 | 813 | ||
814 | if (mdev->state.susp) { | ||
815 | /* If we got suspended, use the retry mechanism of | ||
816 | generic_make_request() to restart processing of this | ||
817 | bio. In the next call to drbd_make_request_26 | ||
818 | we sleep in inc_ap_bio() */ | ||
819 | ret = 1; | ||
820 | spin_unlock_irq(&mdev->req_lock); | ||
821 | goto fail_free_complete; | ||
822 | } | ||
823 | |||
813 | if (remote) { | 824 | if (remote) { |
814 | remote = (mdev->state.pdsk == D_UP_TO_DATE || | 825 | remote = (mdev->state.pdsk == D_UP_TO_DATE || |
815 | (mdev->state.pdsk == D_INCONSISTENT && | 826 | (mdev->state.pdsk == D_INCONSISTENT && |
@@ -947,12 +958,14 @@ fail_and_free_req: | |||
947 | req->private_bio = NULL; | 958 | req->private_bio = NULL; |
948 | put_ldev(mdev); | 959 | put_ldev(mdev); |
949 | } | 960 | } |
950 | bio_endio(bio, err); | 961 | if (!ret) |
962 | bio_endio(bio, err); | ||
963 | |||
951 | drbd_req_free(req); | 964 | drbd_req_free(req); |
952 | dec_ap_bio(mdev); | 965 | dec_ap_bio(mdev); |
953 | kfree(b); | 966 | kfree(b); |
954 | 967 | ||
955 | return 0; | 968 | return ret; |
956 | } | 969 | } |
957 | 970 | ||
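The new ret path changes the contract of drbd_make_request_common(): returning 1 now means the device is suspended and the bio was neither submitted nor completed, so the caller must push it back through generic_make_request()'s retry mechanism, which is exactly what the split-bio loop further down does while taking an extra ap_bio reference per retry. Returning 0 keeps the old meaning of "handled, one way or another".

    /* caller-side meaning of the return value (sketch):
     *   0 -> bio consumed: submitted locally/remotely, or completed with an error
     *   1 -> suspended: leave the bio alone; it will be resubmitted to
     *        drbd_make_request_26(), which then sleeps in inc_ap_bio() */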
958 | /* helper function for drbd_make_request | 971 | /* helper function for drbd_make_request |
@@ -962,11 +975,6 @@ fail_and_free_req: | |||
962 | */ | 975 | */ |
963 | static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write) | 976 | static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write) |
964 | { | 977 | { |
965 | /* Unconfigured */ | ||
966 | if (mdev->state.conn == C_DISCONNECTING && | ||
967 | mdev->state.disk == D_DISKLESS) | ||
968 | return 1; | ||
969 | |||
970 | if (mdev->state.role != R_PRIMARY && | 978 | if (mdev->state.role != R_PRIMARY && |
971 | (!allow_oos || is_write)) { | 979 | (!allow_oos || is_write)) { |
972 | if (__ratelimit(&drbd_ratelimit_state)) { | 980 | if (__ratelimit(&drbd_ratelimit_state)) { |
@@ -1070,15 +1078,21 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio) | |||
1070 | 1078 | ||
1071 | /* we need to get a "reference count" (ap_bio_cnt) | 1079 | /* we need to get a "reference count" (ap_bio_cnt) |
1072 | * to avoid races with the disconnect/reconnect/suspend code. | 1080 | * to avoid races with the disconnect/reconnect/suspend code. |
1073 | * In case we need to split the bio here, we need to get two references | 1081 | * In case we need to split the bio here, we need to get three references |
1074 | * atomically, otherwise we might deadlock when trying to submit the | 1082 | * atomically, otherwise we might deadlock when trying to submit the |
1075 | * second one! */ | 1083 | * second one! */ |
1076 | inc_ap_bio(mdev, 2); | 1084 | inc_ap_bio(mdev, 3); |
1077 | 1085 | ||
1078 | D_ASSERT(e_enr == s_enr + 1); | 1086 | D_ASSERT(e_enr == s_enr + 1); |
1079 | 1087 | ||
1080 | drbd_make_request_common(mdev, &bp->bio1); | 1088 | while (drbd_make_request_common(mdev, &bp->bio1)) |
1081 | drbd_make_request_common(mdev, &bp->bio2); | 1089 | inc_ap_bio(mdev, 1); |
1090 | |||
1091 | while (drbd_make_request_common(mdev, &bp->bio2)) | ||
1092 | inc_ap_bio(mdev, 1); | ||
1093 | |||
1094 | dec_ap_bio(mdev); | ||
1095 | |||
1082 | bio_pair_release(bp); | 1096 | bio_pair_release(bp); |
1083 | } | 1097 | } |
1084 | return 0; | 1098 | return 0; |
@@ -1115,7 +1129,7 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct | |||
1115 | } else if (limit && get_ldev(mdev)) { | 1129 | } else if (limit && get_ldev(mdev)) { |
1116 | struct request_queue * const b = | 1130 | struct request_queue * const b = |
1117 | mdev->ldev->backing_bdev->bd_disk->queue; | 1131 | mdev->ldev->backing_bdev->bd_disk->queue; |
1118 | if (b->merge_bvec_fn && mdev->ldev->dc.use_bmbv) { | 1132 | if (b->merge_bvec_fn) { |
1119 | backing_limit = b->merge_bvec_fn(b, bvm, bvec); | 1133 | backing_limit = b->merge_bvec_fn(b, bvm, bvec); |
1120 | limit = min(limit, backing_limit); | 1134 | limit = min(limit, backing_limit); |
1121 | } | 1135 | } |
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c index 76863e3f05be..85179e1fb50a 100644 --- a/drivers/block/drbd/drbd_strings.c +++ b/drivers/block/drbd/drbd_strings.c | |||
@@ -70,7 +70,7 @@ static const char *drbd_disk_s_names[] = { | |||
70 | 70 | ||
71 | static const char *drbd_state_sw_errors[] = { | 71 | static const char *drbd_state_sw_errors[] = { |
72 | [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config", | 72 | [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config", |
73 | [-SS_NO_UP_TO_DATE_DISK] = "Refusing to be Primary without at least one UpToDate disk", | 73 | [-SS_NO_UP_TO_DATE_DISK] = "Need access to UpToDate data", |
74 | [-SS_NO_LOCAL_DISK] = "Can not resync without local disk", | 74 | [-SS_NO_LOCAL_DISK] = "Can not resync without local disk", |
75 | [-SS_NO_REMOTE_DISK] = "Can not resync without remote disk", | 75 | [-SS_NO_REMOTE_DISK] = "Can not resync without remote disk", |
76 | [-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected", | 76 | [-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected", |
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index d48a1dfd7b24..727ff6339754 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c | |||
@@ -47,8 +47,7 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca | |||
47 | 47 | ||
48 | /* defined here: | 48 | /* defined here: |
49 | drbd_md_io_complete | 49 | drbd_md_io_complete |
50 | drbd_endio_write_sec | 50 | drbd_endio_sec |
51 | drbd_endio_read_sec | ||
52 | drbd_endio_pri | 51 | drbd_endio_pri |
53 | 52 | ||
54 | * more endio handlers: | 53 | * more endio handlers: |
@@ -85,27 +84,10 @@ void drbd_md_io_complete(struct bio *bio, int error) | |||
85 | /* reads on behalf of the partner, | 84 | /* reads on behalf of the partner, |
86 | * "submitted" by the receiver | 85 | * "submitted" by the receiver |
87 | */ | 86 | */ |
88 | void drbd_endio_read_sec(struct bio *bio, int error) __releases(local) | 87 | void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local) |
89 | { | 88 | { |
90 | unsigned long flags = 0; | 89 | unsigned long flags = 0; |
91 | struct drbd_epoch_entry *e = NULL; | 90 | struct drbd_conf *mdev = e->mdev; |
92 | struct drbd_conf *mdev; | ||
93 | int uptodate = bio_flagged(bio, BIO_UPTODATE); | ||
94 | |||
95 | e = bio->bi_private; | ||
96 | mdev = e->mdev; | ||
97 | |||
98 | if (error) | ||
99 | dev_warn(DEV, "read: error=%d s=%llus\n", error, | ||
100 | (unsigned long long)e->sector); | ||
101 | if (!error && !uptodate) { | ||
102 | dev_warn(DEV, "read: setting error to -EIO s=%llus\n", | ||
103 | (unsigned long long)e->sector); | ||
104 | /* strange behavior of some lower level drivers... | ||
105 | * fail the request by clearing the uptodate flag, | ||
106 | * but do not return any error?! */ | ||
107 | error = -EIO; | ||
108 | } | ||
109 | 91 | ||
110 | D_ASSERT(e->block_id != ID_VACANT); | 92 | D_ASSERT(e->block_id != ID_VACANT); |
111 | 93 | ||
@@ -114,49 +96,38 @@ void drbd_endio_read_sec(struct bio *bio, int error) __releases(local) | |||
114 | list_del(&e->w.list); | 96 | list_del(&e->w.list); |
115 | if (list_empty(&mdev->read_ee)) | 97 | if (list_empty(&mdev->read_ee)) |
116 | wake_up(&mdev->ee_wait); | 98 | wake_up(&mdev->ee_wait); |
99 | if (test_bit(__EE_WAS_ERROR, &e->flags)) | ||
100 | __drbd_chk_io_error(mdev, FALSE); | ||
117 | spin_unlock_irqrestore(&mdev->req_lock, flags); | 101 | spin_unlock_irqrestore(&mdev->req_lock, flags); |
118 | 102 | ||
119 | drbd_chk_io_error(mdev, error, FALSE); | ||
120 | drbd_queue_work(&mdev->data.work, &e->w); | 103 | drbd_queue_work(&mdev->data.work, &e->w); |
121 | put_ldev(mdev); | 104 | put_ldev(mdev); |
122 | } | 105 | } |
123 | 106 | ||
107 | static int is_failed_barrier(int ee_flags) | ||
108 | { | ||
109 | return (ee_flags & (EE_IS_BARRIER|EE_WAS_ERROR|EE_RESUBMITTED)) | ||
110 | == (EE_IS_BARRIER|EE_WAS_ERROR); | ||
111 | } | ||
112 | |||
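The masked comparison in is_failed_barrier() fires only on the first failure of a barrier request: EE_RESUBMITTED must still be clear, so an entry that has already been reissued without the barrier flag takes the ordinary error path the second time around. Spelled out for a few flag combinations:

    /* is_failed_barrier() results (sketch):
     *   EE_IS_BARRIER | EE_WAS_ERROR                  -> 1  (fall back from barriers, reissue)
     *   EE_IS_BARRIER | EE_WAS_ERROR | EE_RESUBMITTED -> 0  (already reissued once)
     *   EE_WAS_ERROR                                  -> 0  (plain I/O error path) */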
124 | /* writes on behalf of the partner, or resync writes, | 113 | /* writes on behalf of the partner, or resync writes, |
125 | * "submitted" by the receiver. | 114 | * "submitted" by the receiver, final stage. */ |
126 | */ | 115 | static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local) |
127 | void drbd_endio_write_sec(struct bio *bio, int error) __releases(local) | ||
128 | { | 116 | { |
129 | unsigned long flags = 0; | 117 | unsigned long flags = 0; |
130 | struct drbd_epoch_entry *e = NULL; | 118 | struct drbd_conf *mdev = e->mdev; |
131 | struct drbd_conf *mdev; | ||
132 | sector_t e_sector; | 119 | sector_t e_sector; |
133 | int do_wake; | 120 | int do_wake; |
134 | int is_syncer_req; | 121 | int is_syncer_req; |
135 | int do_al_complete_io; | 122 | int do_al_complete_io; |
136 | int uptodate = bio_flagged(bio, BIO_UPTODATE); | ||
137 | int is_barrier = bio_rw_flagged(bio, BIO_RW_BARRIER); | ||
138 | |||
139 | e = bio->bi_private; | ||
140 | mdev = e->mdev; | ||
141 | 123 | ||
142 | if (error) | 124 | /* if this is a failed barrier request, disable use of barriers, |
143 | dev_warn(DEV, "write: error=%d s=%llus\n", error, | 125 | * and schedule for resubmission */ |
144 | (unsigned long long)e->sector); | 126 | if (is_failed_barrier(e->flags)) { |
145 | if (!error && !uptodate) { | ||
146 | dev_warn(DEV, "write: setting error to -EIO s=%llus\n", | ||
147 | (unsigned long long)e->sector); | ||
148 | /* strange behavior of some lower level drivers... | ||
149 | * fail the request by clearing the uptodate flag, | ||
150 | * but do not return any error?! */ | ||
151 | error = -EIO; | ||
152 | } | ||
153 | |||
154 | /* error == -ENOTSUPP would be a better test, | ||
155 | * alas it is not reliable */ | ||
156 | if (error && is_barrier && e->flags & EE_IS_BARRIER) { | ||
157 | drbd_bump_write_ordering(mdev, WO_bdev_flush); | 127 | drbd_bump_write_ordering(mdev, WO_bdev_flush); |
158 | spin_lock_irqsave(&mdev->req_lock, flags); | 128 | spin_lock_irqsave(&mdev->req_lock, flags); |
159 | list_del(&e->w.list); | 129 | list_del(&e->w.list); |
130 | e->flags = (e->flags & ~EE_WAS_ERROR) | EE_RESUBMITTED; | ||
160 | e->w.cb = w_e_reissue; | 131 | e->w.cb = w_e_reissue; |
161 | /* put_ldev actually happens below, once we come here again. */ | 132 | /* put_ldev actually happens below, once we come here again. */ |
162 | __release(local); | 133 | __release(local); |
@@ -167,17 +138,16 @@ void drbd_endio_write_sec(struct bio *bio, int error) __releases(local) | |||
167 | 138 | ||
168 | D_ASSERT(e->block_id != ID_VACANT); | 139 | D_ASSERT(e->block_id != ID_VACANT); |
169 | 140 | ||
170 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
171 | mdev->writ_cnt += e->size >> 9; | ||
172 | is_syncer_req = is_syncer_block_id(e->block_id); | ||
173 | |||
174 | /* after we moved e to done_ee, | 141 | /* after we moved e to done_ee, |
175 | * we may no longer access it, | 142 | * we may no longer access it, |
176 | * it may be freed/reused already! | 143 | * it may be freed/reused already! |
177 | * (as soon as we release the req_lock) */ | 144 | * (as soon as we release the req_lock) */ |
178 | e_sector = e->sector; | 145 | e_sector = e->sector; |
179 | do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO; | 146 | do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO; |
147 | is_syncer_req = is_syncer_block_id(e->block_id); | ||
180 | 148 | ||
149 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
150 | mdev->writ_cnt += e->size >> 9; | ||
181 | list_del(&e->w.list); /* has been on active_ee or sync_ee */ | 151 | list_del(&e->w.list); /* has been on active_ee or sync_ee */ |
182 | list_add_tail(&e->w.list, &mdev->done_ee); | 152 | list_add_tail(&e->w.list, &mdev->done_ee); |
183 | 153 | ||
@@ -190,7 +160,7 @@ void drbd_endio_write_sec(struct bio *bio, int error) __releases(local) | |||
190 | ? list_empty(&mdev->sync_ee) | 160 | ? list_empty(&mdev->sync_ee) |
191 | : list_empty(&mdev->active_ee); | 161 | : list_empty(&mdev->active_ee); |
192 | 162 | ||
193 | if (error) | 163 | if (test_bit(__EE_WAS_ERROR, &e->flags)) |
194 | __drbd_chk_io_error(mdev, FALSE); | 164 | __drbd_chk_io_error(mdev, FALSE); |
195 | spin_unlock_irqrestore(&mdev->req_lock, flags); | 165 | spin_unlock_irqrestore(&mdev->req_lock, flags); |
196 | 166 | ||
@@ -205,7 +175,42 @@ void drbd_endio_write_sec(struct bio *bio, int error) __releases(local) | |||
205 | 175 | ||
206 | wake_asender(mdev); | 176 | wake_asender(mdev); |
207 | put_ldev(mdev); | 177 | put_ldev(mdev); |
178 | } | ||
179 | |||
180 | /* writes on behalf of the partner, or resync writes, | ||
181 | * "submitted" by the receiver. | ||
182 | */ | ||
183 | void drbd_endio_sec(struct bio *bio, int error) | ||
184 | { | ||
185 | struct drbd_epoch_entry *e = bio->bi_private; | ||
186 | struct drbd_conf *mdev = e->mdev; | ||
187 | int uptodate = bio_flagged(bio, BIO_UPTODATE); | ||
188 | int is_write = bio_data_dir(bio) == WRITE; | ||
189 | |||
190 | if (error) | ||
191 | dev_warn(DEV, "%s: error=%d s=%llus\n", | ||
192 | is_write ? "write" : "read", error, | ||
193 | (unsigned long long)e->sector); | ||
194 | if (!error && !uptodate) { | ||
195 | dev_warn(DEV, "%s: setting error to -EIO s=%llus\n", | ||
196 | is_write ? "write" : "read", | ||
197 | (unsigned long long)e->sector); | ||
198 | /* strange behavior of some lower level drivers... | ||
199 | * fail the request by clearing the uptodate flag, | ||
200 | * but do not return any error?! */ | ||
201 | error = -EIO; | ||
202 | } | ||
203 | |||
204 | if (error) | ||
205 | set_bit(__EE_WAS_ERROR, &e->flags); | ||
208 | 206 | ||
207 | bio_put(bio); /* no need for the bio anymore */ | ||
208 | if (atomic_dec_and_test(&e->pending_bios)) { | ||
209 | if (is_write) | ||
210 | drbd_endio_write_sec_final(e); | ||
211 | else | ||
212 | drbd_endio_read_sec_final(e); | ||
213 | } | ||
209 | } | 214 | } |
210 | 215 | ||
211 | /* read, readA or write requests on R_PRIMARY coming from drbd_make_request | 216 | /* read, readA or write requests on R_PRIMARY coming from drbd_make_request |
@@ -295,7 +300,34 @@ int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | |||
295 | return 1; /* Simply ignore this! */ | 300 | return 1; /* Simply ignore this! */ |
296 | } | 301 | } |
297 | 302 | ||
298 | void drbd_csum(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest) | 303 | void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest) |
304 | { | ||
305 | struct hash_desc desc; | ||
306 | struct scatterlist sg; | ||
307 | struct page *page = e->pages; | ||
308 | struct page *tmp; | ||
309 | unsigned len; | ||
310 | |||
311 | desc.tfm = tfm; | ||
312 | desc.flags = 0; | ||
313 | |||
314 | sg_init_table(&sg, 1); | ||
315 | crypto_hash_init(&desc); | ||
316 | |||
317 | while ((tmp = page_chain_next(page))) { | ||
318 | /* all but the last page will be fully used */ | ||
319 | sg_set_page(&sg, page, PAGE_SIZE, 0); | ||
320 | crypto_hash_update(&desc, &sg, sg.length); | ||
321 | page = tmp; | ||
322 | } | ||
323 | /* and now the last, possibly only partially used page */ | ||
324 | len = e->size & (PAGE_SIZE - 1); | ||
325 | sg_set_page(&sg, page, len ?: PAGE_SIZE, 0); | ||
326 | crypto_hash_update(&desc, &sg, sg.length); | ||
327 | crypto_hash_final(&desc, digest); | ||
328 | } | ||
329 | |||
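drbd_csum_ee() above hashes every page of the chain in full except the last one, whose useful length is taken from e->size; the "len ?: PAGE_SIZE" fallback covers payloads that are an exact multiple of the page size. With 4 KiB pages, for example:

    /* e->size = 10240 -> 10240 & 4095 == 2048 -> hash 2048 bytes of the last page
     * e->size = 12288 -> 12288 & 4095 ==    0 -> len ?: PAGE_SIZE hashes the whole page */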
330 | void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest) | ||
299 | { | 331 | { |
300 | struct hash_desc desc; | 332 | struct hash_desc desc; |
301 | struct scatterlist sg; | 333 | struct scatterlist sg; |
@@ -329,11 +361,11 @@ static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel | |||
329 | return 1; | 361 | return 1; |
330 | } | 362 | } |
331 | 363 | ||
332 | if (likely(drbd_bio_uptodate(e->private_bio))) { | 364 | if (likely((e->flags & EE_WAS_ERROR) == 0)) { |
333 | digest_size = crypto_hash_digestsize(mdev->csums_tfm); | 365 | digest_size = crypto_hash_digestsize(mdev->csums_tfm); |
334 | digest = kmalloc(digest_size, GFP_NOIO); | 366 | digest = kmalloc(digest_size, GFP_NOIO); |
335 | if (digest) { | 367 | if (digest) { |
336 | drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest); | 368 | drbd_csum_ee(mdev, mdev->csums_tfm, e, digest); |
337 | 369 | ||
338 | inc_rs_pending(mdev); | 370 | inc_rs_pending(mdev); |
339 | ok = drbd_send_drequest_csum(mdev, | 371 | ok = drbd_send_drequest_csum(mdev, |
@@ -369,23 +401,21 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) | |||
369 | /* GFP_TRY, because if there is no memory available right now, this may | 401 | /* GFP_TRY, because if there is no memory available right now, this may |
370 | * be rescheduled for later. It is "only" background resync, after all. */ | 402 | * be rescheduled for later. It is "only" background resync, after all. */ |
371 | e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY); | 403 | e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY); |
372 | if (!e) { | 404 | if (!e) |
373 | put_ldev(mdev); | 405 | goto fail; |
374 | return 2; | ||
375 | } | ||
376 | 406 | ||
377 | spin_lock_irq(&mdev->req_lock); | 407 | spin_lock_irq(&mdev->req_lock); |
378 | list_add(&e->w.list, &mdev->read_ee); | 408 | list_add(&e->w.list, &mdev->read_ee); |
379 | spin_unlock_irq(&mdev->req_lock); | 409 | spin_unlock_irq(&mdev->req_lock); |
380 | 410 | ||
381 | e->private_bio->bi_end_io = drbd_endio_read_sec; | ||
382 | e->private_bio->bi_rw = READ; | ||
383 | e->w.cb = w_e_send_csum; | 411 | e->w.cb = w_e_send_csum; |
412 | if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0) | ||
413 | return 1; | ||
384 | 414 | ||
385 | mdev->read_cnt += size >> 9; | 415 | drbd_free_ee(mdev, e); |
386 | drbd_generic_make_request(mdev, DRBD_FAULT_RS_RD, e->private_bio); | 416 | fail: |
387 | 417 | put_ldev(mdev); | |
388 | return 1; | 418 | return 2; |
389 | } | 419 | } |
390 | 420 | ||
391 | void resync_timer_fn(unsigned long data) | 421 | void resync_timer_fn(unsigned long data) |
@@ -414,13 +444,25 @@ void resync_timer_fn(unsigned long data) | |||
414 | drbd_queue_work(&mdev->data.work, &mdev->resync_work); | 444 | drbd_queue_work(&mdev->data.work, &mdev->resync_work); |
415 | } | 445 | } |
416 | 446 | ||
447 | static int calc_resync_rate(struct drbd_conf *mdev) | ||
448 | { | ||
449 | int d = mdev->data_delay / 1000; /* us -> ms */ | ||
450 | int td = mdev->sync_conf.throttle_th * 100; /* 0.1s -> ms */ | ||
451 | int hd = mdev->sync_conf.hold_off_th * 100; /* 0.1s -> ms */ | ||
452 | int cr = mdev->sync_conf.rate; | ||
453 | |||
454 | return d <= td ? cr : | ||
455 | d >= hd ? 0 : | ||
456 | cr + (cr * (td - d) / (hd - td)); | ||
457 | } | ||
458 | |||
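calc_resync_rate() interpolates linearly: at or below the throttle threshold the configured sync_conf.rate is used in full, at or above the hold-off threshold the rate drops to zero, and in between it falls off proportionally with the measured data_delay. With purely hypothetical numbers, just to show the shape:

    /* sync_conf.rate = 250, throttle_th = 20 (td = 2000 ms), hold_off_th = 100 (hd = 10000 ms)
     *   d =  1500 ms (d <= td) -> 250
     *   d =  6000 ms           -> 250 + 250 * (2000 - 6000) / (10000 - 2000) = 125
     *   d = 12000 ms (d >= hd) -> 0 */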
417 | int w_make_resync_request(struct drbd_conf *mdev, | 459 | int w_make_resync_request(struct drbd_conf *mdev, |
418 | struct drbd_work *w, int cancel) | 460 | struct drbd_work *w, int cancel) |
419 | { | 461 | { |
420 | unsigned long bit; | 462 | unsigned long bit; |
421 | sector_t sector; | 463 | sector_t sector; |
422 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); | 464 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); |
423 | int max_segment_size = queue_max_segment_size(mdev->rq_queue); | 465 | int max_segment_size; |
424 | int number, i, size, pe, mx; | 466 | int number, i, size, pe, mx; |
425 | int align, queued, sndbuf; | 467 | int align, queued, sndbuf; |
426 | 468 | ||
@@ -446,7 +488,13 @@ int w_make_resync_request(struct drbd_conf *mdev, | |||
446 | return 1; | 488 | return 1; |
447 | } | 489 | } |
448 | 490 | ||
449 | number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ); | 491 | /* starting with drbd 8.3.8, we can handle multi-bio EEs, |
492 | * if it should be necessary */ | ||
493 | max_segment_size = mdev->agreed_pro_version < 94 ? | ||
494 | queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE; | ||
495 | |||
496 | mdev->c_sync_rate = calc_resync_rate(mdev); | ||
497 | number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ); | ||
450 | pe = atomic_read(&mdev->rs_pending_cnt); | 498 | pe = atomic_read(&mdev->rs_pending_cnt); |
451 | 499 | ||
452 | mutex_lock(&mdev->data.mutex); | 500 | mutex_lock(&mdev->data.mutex); |
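The new calc_resync_rate() keeps the configured rate while the measured data_delay stays at or below throttle_th, scales it down linearly between throttle_th and hold_off_th, and shuts resync off entirely past hold_off_th. A stand-alone sketch of the same arithmetic, with illustrative inputs only (rate 10000 KiB/s, throttle_th=2 i.e. 0.2s, hold_off_th=10 i.e. 1.0s; these are example values, not defaults from this patch):

  #include <stdio.h>

  /* Mirrors calc_resync_rate(): delays converted to ms, rate in KiB/s. */
  static int resync_rate(int data_delay_us, int throttle_th, int hold_off_th, int rate)
  {
          int d  = data_delay_us / 1000;   /* us -> ms */
          int td = throttle_th * 100;      /* 0.1s -> ms */
          int hd = hold_off_th * 100;      /* 0.1s -> ms */

          return d <= td ? rate :
                 d >= hd ? 0 :
                 rate + (rate * (td - d) / (hd - td));
  }

  int main(void)
  {
          /* 100 ms delay -> 10000 (full rate), 600 ms -> 5000 (half), 1 s -> 0. */
          printf("%d %d %d\n",
                 resync_rate(100000, 2, 10, 10000),
                 resync_rate(600000, 2, 10, 10000),
                 resync_rate(1000000, 2, 10, 10000));
          return 0;
  }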
@@ -509,12 +557,6 @@ next_sector: | |||
509 | * | 557 | * |
510 | * Additionally always align bigger requests, in order to | 558 | * Additionally always align bigger requests, in order to |
511 | * be prepared for all stripe sizes of software RAIDs. | 559 | * be prepared for all stripe sizes of software RAIDs. |
512 | * | ||
513 | * we _do_ care about the agreed-upon q->max_segment_size | ||
514 | * here, as splitting up the requests on the other side is more | ||
515 | * difficult. the consequence is, that on lvm and md and other | ||
516 | * "indirect" devices, this is dead code, since | ||
517 | * q->max_segment_size will be PAGE_SIZE. | ||
518 | */ | 560 | */ |
519 | align = 1; | 561 | align = 1; |
520 | for (;;) { | 562 | for (;;) { |
@@ -806,7 +848,7 @@ out: | |||
806 | /* helper */ | 848 | /* helper */ |
807 | static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e) | 849 | static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e) |
808 | { | 850 | { |
809 | if (drbd_bio_has_active_page(e->private_bio)) { | 851 | if (drbd_ee_has_active_page(e)) { |
810 | /* This might happen if sendpage() has not finished */ | 852 | /* This might happen if sendpage() has not finished */ |
811 | spin_lock_irq(&mdev->req_lock); | 853 | spin_lock_irq(&mdev->req_lock); |
812 | list_add_tail(&e->w.list, &mdev->net_ee); | 854 | list_add_tail(&e->w.list, &mdev->net_ee); |
@@ -832,7 +874,7 @@ int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | |||
832 | return 1; | 874 | return 1; |
833 | } | 875 | } |
834 | 876 | ||
835 | if (likely(drbd_bio_uptodate(e->private_bio))) { | 877 | if (likely((e->flags & EE_WAS_ERROR) == 0)) { |
836 | ok = drbd_send_block(mdev, P_DATA_REPLY, e); | 878 | ok = drbd_send_block(mdev, P_DATA_REPLY, e); |
837 | } else { | 879 | } else { |
838 | if (__ratelimit(&drbd_ratelimit_state)) | 880 | if (__ratelimit(&drbd_ratelimit_state)) |
@@ -873,7 +915,7 @@ int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | |||
873 | put_ldev(mdev); | 915 | put_ldev(mdev); |
874 | } | 916 | } |
875 | 917 | ||
876 | if (likely(drbd_bio_uptodate(e->private_bio))) { | 918 | if (likely((e->flags & EE_WAS_ERROR) == 0)) { |
877 | if (likely(mdev->state.pdsk >= D_INCONSISTENT)) { | 919 | if (likely(mdev->state.pdsk >= D_INCONSISTENT)) { |
878 | inc_rs_pending(mdev); | 920 | inc_rs_pending(mdev); |
879 | ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); | 921 | ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); |
@@ -921,7 +963,7 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | |||
921 | 963 | ||
922 | di = (struct digest_info *)(unsigned long)e->block_id; | 964 | di = (struct digest_info *)(unsigned long)e->block_id; |
923 | 965 | ||
924 | if (likely(drbd_bio_uptodate(e->private_bio))) { | 966 | if (likely((e->flags & EE_WAS_ERROR) == 0)) { |
925 | /* quick hack to try to avoid a race against reconfiguration. | 967 | /* quick hack to try to avoid a race against reconfiguration. |
926 | * a real fix would be much more involved, | 968 | * a real fix would be much more involved, |
927 | * introducing more locking mechanisms */ | 969 | * introducing more locking mechanisms */ |
@@ -931,7 +973,7 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | |||
931 | digest = kmalloc(digest_size, GFP_NOIO); | 973 | digest = kmalloc(digest_size, GFP_NOIO); |
932 | } | 974 | } |
933 | if (digest) { | 975 | if (digest) { |
934 | drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest); | 976 | drbd_csum_ee(mdev, mdev->csums_tfm, e, digest); |
935 | eq = !memcmp(digest, di->digest, digest_size); | 977 | eq = !memcmp(digest, di->digest, digest_size); |
936 | kfree(digest); | 978 | kfree(digest); |
937 | } | 979 | } |
@@ -973,14 +1015,14 @@ int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | |||
973 | if (unlikely(cancel)) | 1015 | if (unlikely(cancel)) |
974 | goto out; | 1016 | goto out; |
975 | 1017 | ||
976 | if (unlikely(!drbd_bio_uptodate(e->private_bio))) | 1018 | if (unlikely((e->flags & EE_WAS_ERROR) != 0)) |
977 | goto out; | 1019 | goto out; |
978 | 1020 | ||
979 | digest_size = crypto_hash_digestsize(mdev->verify_tfm); | 1021 | digest_size = crypto_hash_digestsize(mdev->verify_tfm); |
980 | /* FIXME if this allocation fails, online verify will not terminate! */ | 1022 | /* FIXME if this allocation fails, online verify will not terminate! */ |
981 | digest = kmalloc(digest_size, GFP_NOIO); | 1023 | digest = kmalloc(digest_size, GFP_NOIO); |
982 | if (digest) { | 1024 | if (digest) { |
983 | drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest); | 1025 | drbd_csum_ee(mdev, mdev->verify_tfm, e, digest); |
984 | inc_rs_pending(mdev); | 1026 | inc_rs_pending(mdev); |
985 | ok = drbd_send_drequest_csum(mdev, e->sector, e->size, | 1027 | ok = drbd_send_drequest_csum(mdev, e->sector, e->size, |
986 | digest, digest_size, P_OV_REPLY); | 1028 | digest, digest_size, P_OV_REPLY); |
@@ -1029,11 +1071,11 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | |||
1029 | 1071 | ||
1030 | di = (struct digest_info *)(unsigned long)e->block_id; | 1072 | di = (struct digest_info *)(unsigned long)e->block_id; |
1031 | 1073 | ||
1032 | if (likely(drbd_bio_uptodate(e->private_bio))) { | 1074 | if (likely((e->flags & EE_WAS_ERROR) == 0)) { |
1033 | digest_size = crypto_hash_digestsize(mdev->verify_tfm); | 1075 | digest_size = crypto_hash_digestsize(mdev->verify_tfm); |
1034 | digest = kmalloc(digest_size, GFP_NOIO); | 1076 | digest = kmalloc(digest_size, GFP_NOIO); |
1035 | if (digest) { | 1077 | if (digest) { |
1036 | drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest); | 1078 | drbd_csum_ee(mdev, mdev->verify_tfm, e, digest); |
1037 | 1079 | ||
1038 | D_ASSERT(digest_size == di->digest_size); | 1080 | D_ASSERT(digest_size == di->digest_size); |
1039 | eq = !memcmp(digest, di->digest, digest_size); | 1081 | eq = !memcmp(digest, di->digest, digest_size); |
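A pattern repeated throughout the drbd_worker.c hunks above: epoch entries no longer carry a private bio, so completion status is read from the EE_WAS_ERROR flag on the entry (set by drbd_endio_sec()) and checksums are computed over the entry's pages with drbd_csum_ee(). Condensed from the code above, not a new function in this patch:

  if (likely((e->flags & EE_WAS_ERROR) == 0)) {
          digest_size = crypto_hash_digestsize(mdev->csums_tfm);
          digest = kmalloc(digest_size, GFP_NOIO);
          if (digest)
                  drbd_csum_ee(mdev, mdev->csums_tfm, e, digest); /* hash the EE pages, not a bio */
  } else {
          /* the sector-level end_io handler recorded the failure in e->flags */
  }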
diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h index f93fa111ce50..defdb5013ea3 100644 --- a/drivers/block/drbd/drbd_wrappers.h +++ b/drivers/block/drbd/drbd_wrappers.h | |||
@@ -18,23 +18,9 @@ static inline void drbd_set_my_capacity(struct drbd_conf *mdev, | |||
18 | 18 | ||
19 | #define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE) | 19 | #define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE) |
20 | 20 | ||
21 | static inline int drbd_bio_has_active_page(struct bio *bio) | ||
22 | { | ||
23 | struct bio_vec *bvec; | ||
24 | int i; | ||
25 | |||
26 | __bio_for_each_segment(bvec, bio, i, 0) { | ||
27 | if (page_count(bvec->bv_page) > 1) | ||
28 | return 1; | ||
29 | } | ||
30 | |||
31 | return 0; | ||
32 | } | ||
33 | |||
34 | /* bi_end_io handlers */ | 21 | /* bi_end_io handlers */ |
35 | extern void drbd_md_io_complete(struct bio *bio, int error); | 22 | extern void drbd_md_io_complete(struct bio *bio, int error); |
36 | extern void drbd_endio_read_sec(struct bio *bio, int error); | 23 | extern void drbd_endio_sec(struct bio *bio, int error); |
37 | extern void drbd_endio_write_sec(struct bio *bio, int error); | ||
38 | extern void drbd_endio_pri(struct bio *bio, int error); | 24 | extern void drbd_endio_pri(struct bio *bio, int error); |
39 | 25 | ||
40 | /* | 26 | /* |
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 3b128dce9c3a..33d65039cce9 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c | |||
@@ -407,32 +407,24 @@ static int ide_disk_get_capacity(ide_drive_t *drive) | |||
407 | return 0; | 407 | return 0; |
408 | } | 408 | } |
409 | 409 | ||
410 | static u64 ide_disk_set_capacity(ide_drive_t *drive, u64 capacity) | 410 | static void ide_disk_unlock_native_capacity(ide_drive_t *drive) |
411 | { | 411 | { |
412 | u64 set = min(capacity, drive->probed_capacity); | ||
413 | u16 *id = drive->id; | 412 | u16 *id = drive->id; |
414 | int lba48 = ata_id_lba48_enabled(id); | 413 | int lba48 = ata_id_lba48_enabled(id); |
415 | 414 | ||
416 | if ((drive->dev_flags & IDE_DFLAG_LBA) == 0 || | 415 | if ((drive->dev_flags & IDE_DFLAG_LBA) == 0 || |
417 | ata_id_hpa_enabled(id) == 0) | 416 | ata_id_hpa_enabled(id) == 0) |
418 | goto out; | 417 | return; |
419 | 418 | ||
420 | /* | 419 | /* |
421 | * according to the spec the SET MAX ADDRESS command shall be | 420 | * according to the spec the SET MAX ADDRESS command shall be |
422 | * immediately preceded by a READ NATIVE MAX ADDRESS command | 421 | * immediately preceded by a READ NATIVE MAX ADDRESS command |
423 | */ | 422 | */ |
424 | capacity = ide_disk_hpa_get_native_capacity(drive, lba48); | 423 | if (!ide_disk_hpa_get_native_capacity(drive, lba48)) |
425 | if (capacity == 0) | 424 | return; |
426 | goto out; | 425 | |
427 | 426 | if (ide_disk_hpa_set_capacity(drive, drive->probed_capacity, lba48)) | |
428 | set = ide_disk_hpa_set_capacity(drive, set, lba48); | 427 | drive->dev_flags |= IDE_DFLAG_NOHPA; /* disable HPA on resume */ |
429 | if (set) { | ||
430 | /* needed for ->resume to disable HPA */ | ||
431 | drive->dev_flags |= IDE_DFLAG_NOHPA; | ||
432 | return set; | ||
433 | } | ||
434 | out: | ||
435 | return drive->capacity64; | ||
436 | } | 428 | } |
437 | 429 | ||
438 | static void idedisk_prepare_flush(struct request_queue *q, struct request *rq) | 430 | static void idedisk_prepare_flush(struct request_queue *q, struct request *rq) |
@@ -783,13 +775,13 @@ static int ide_disk_set_doorlock(ide_drive_t *drive, struct gendisk *disk, | |||
783 | } | 775 | } |
784 | 776 | ||
785 | const struct ide_disk_ops ide_ata_disk_ops = { | 777 | const struct ide_disk_ops ide_ata_disk_ops = { |
786 | .check = ide_disk_check, | 778 | .check = ide_disk_check, |
787 | .set_capacity = ide_disk_set_capacity, | 779 | .unlock_native_capacity = ide_disk_unlock_native_capacity, |
788 | .get_capacity = ide_disk_get_capacity, | 780 | .get_capacity = ide_disk_get_capacity, |
789 | .setup = ide_disk_setup, | 781 | .setup = ide_disk_setup, |
790 | .flush = ide_disk_flush, | 782 | .flush = ide_disk_flush, |
791 | .init_media = ide_disk_init_media, | 783 | .init_media = ide_disk_init_media, |
792 | .set_doorlock = ide_disk_set_doorlock, | 784 | .set_doorlock = ide_disk_set_doorlock, |
793 | .do_request = ide_do_rw_disk, | 785 | .do_request = ide_do_rw_disk, |
794 | .ioctl = ide_disk_ioctl, | 786 | .ioctl = ide_disk_ioctl, |
795 | }; | 787 | }; |
diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c index c32d83996ae1..c102d23d9b38 100644 --- a/drivers/ide/ide-gd.c +++ b/drivers/ide/ide-gd.c | |||
@@ -288,17 +288,14 @@ static int ide_gd_media_changed(struct gendisk *disk) | |||
288 | return ret; | 288 | return ret; |
289 | } | 289 | } |
290 | 290 | ||
291 | static unsigned long long ide_gd_set_capacity(struct gendisk *disk, | 291 | static void ide_gd_unlock_native_capacity(struct gendisk *disk) |
292 | unsigned long long capacity) | ||
293 | { | 292 | { |
294 | struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj); | 293 | struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj); |
295 | ide_drive_t *drive = idkp->drive; | 294 | ide_drive_t *drive = idkp->drive; |
296 | const struct ide_disk_ops *disk_ops = drive->disk_ops; | 295 | const struct ide_disk_ops *disk_ops = drive->disk_ops; |
297 | 296 | ||
298 | if (disk_ops->set_capacity) | 297 | if (disk_ops->unlock_native_capacity) |
299 | return disk_ops->set_capacity(drive, capacity); | 298 | disk_ops->unlock_native_capacity(drive); |
300 | |||
301 | return drive->capacity64; | ||
302 | } | 299 | } |
303 | 300 | ||
304 | static int ide_gd_revalidate_disk(struct gendisk *disk) | 301 | static int ide_gd_revalidate_disk(struct gendisk *disk) |
@@ -329,7 +326,7 @@ static const struct block_device_operations ide_gd_ops = { | |||
329 | .locked_ioctl = ide_gd_ioctl, | 326 | .locked_ioctl = ide_gd_ioctl, |
330 | .getgeo = ide_gd_getgeo, | 327 | .getgeo = ide_gd_getgeo, |
331 | .media_changed = ide_gd_media_changed, | 328 | .media_changed = ide_gd_media_changed, |
332 | .set_capacity = ide_gd_set_capacity, | 329 | .unlock_native_capacity = ide_gd_unlock_native_capacity, |
333 | .revalidate_disk = ide_gd_revalidate_disk | 330 | .revalidate_disk = ide_gd_revalidate_disk |
334 | }; | 331 | }; |
335 | 332 | ||
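The old ->set_capacity() method had to return a capacity; the new ->unlock_native_capacity() hook is void and best-effort: the driver lifts the host-protected area and flags the drive so resume keeps it disabled, and the block layer re-reads the capacity afterwards. A sketch of what an implementation is expected to do, using hypothetical helpers (only the IDE_DFLAG_NOHPA flag below comes from the patch):

  static void mydrv_unlock_native_capacity(ide_drive_t *drive)
  {
          if (!mydrv_hpa_enabled(drive))           /* hypothetical helper */
                  return;
          if (mydrv_set_max_to_native(drive))      /* hypothetical helper */
                  drive->dev_flags |= IDE_DFLAG_NOHPA; /* keep HPA off across resume */
  }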
diff --git a/fs/block_dev.c b/fs/block_dev.c index 6dcee88c2e5d..55dcb7884f4d 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c | |||
@@ -417,7 +417,7 @@ int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync) | |||
417 | */ | 417 | */ |
418 | mutex_unlock(&bd_inode->i_mutex); | 418 | mutex_unlock(&bd_inode->i_mutex); |
419 | 419 | ||
420 | error = blkdev_issue_flush(bdev, NULL); | 420 | error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT); |
421 | if (error == -EOPNOTSUPP) | 421 | if (error == -EOPNOTSUPP) |
422 | error = 0; | 422 | error = 0; |
423 | 423 | ||
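blkdev_issue_flush() grows a gfp mask and a BLKDEV_IFL_* flags argument in this series; callers that relied on the old implicit wait now pass BLKDEV_IFL_WAIT explicitly. The prototype implied by the call sites in this patch (the authoritative definition lives in block/blk-lib.c, which is not part of this excerpt):

  int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
                         sector_t *error_sector, unsigned long flags);

  /* fsync-style usage: allocate with GFP_KERNEL and wait for completion */
  error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
  if (error == -EOPNOTSUPP)
          error = 0;      /* device cannot flush; treat as success */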
@@ -668,41 +668,209 @@ void bd_forget(struct inode *inode) | |||
668 | iput(bdev->bd_inode); | 668 | iput(bdev->bd_inode); |
669 | } | 669 | } |
670 | 670 | ||
671 | int bd_claim(struct block_device *bdev, void *holder) | 671 | /** |
672 | * bd_may_claim - test whether a block device can be claimed | ||
673 | * @bdev: block device of interest | ||
674 | * @whole: whole block device containing @bdev, may equal @bdev | ||
675 | * @holder: holder trying to claim @bdev | ||
676 | * | ||
677 | * Test whether @bdev can be claimed by @holder. | ||
678 | * | ||
679 | * CONTEXT: | ||
680 | * spin_lock(&bdev_lock). | ||
681 | * | ||
682 | * RETURNS: | ||
683 | * %true if @bdev can be claimed, %false otherwise. | ||
684 | */ | ||
685 | static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, | ||
686 | void *holder) | ||
672 | { | 687 | { |
673 | int res; | ||
674 | spin_lock(&bdev_lock); | ||
675 | |||
676 | /* first decide result */ | ||
677 | if (bdev->bd_holder == holder) | 688 | if (bdev->bd_holder == holder) |
678 | res = 0; /* already a holder */ | 689 | return true; /* already a holder */ |
679 | else if (bdev->bd_holder != NULL) | 690 | else if (bdev->bd_holder != NULL) |
680 | res = -EBUSY; /* held by someone else */ | 691 | return false; /* held by someone else */ |
681 | else if (bdev->bd_contains == bdev) | 692 | else if (bdev->bd_contains == bdev) |
682 | res = 0; /* is a whole device which isn't held */ | 693 | return true; /* is a whole device which isn't held */ |
683 | 694 | ||
684 | else if (bdev->bd_contains->bd_holder == bd_claim) | 695 | else if (whole->bd_holder == bd_claim) |
685 | res = 0; /* is a partition of a device that is being partitioned */ | 696 | return true; /* is a partition of a device that is being partitioned */ |
686 | else if (bdev->bd_contains->bd_holder != NULL) | 697 | else if (whole->bd_holder != NULL) |
687 | res = -EBUSY; /* is a partition of a held device */ | 698 | return false; /* is a partition of a held device */ |
688 | else | 699 | else |
689 | res = 0; /* is a partition of an un-held device */ | 700 | return true; /* is a partition of an un-held device */ |
701 | } | ||
702 | |||
703 | /** | ||
704 | * bd_prepare_to_claim - prepare to claim a block device | ||
705 | * @bdev: block device of interest | ||
706 | * @whole: the whole device containing @bdev, may equal @bdev | ||
707 | * @holder: holder trying to claim @bdev | ||
708 | * | ||
709 | * Prepare to claim @bdev. This function fails if @bdev is already | ||
710 | * claimed by another holder and waits if another claiming is in | ||
711 | * progress. This function doesn't actually claim. On successful | ||
712 | * return, the caller has ownership of bd_claiming and bd_holder[s]. | ||
713 | * | ||
714 | * CONTEXT: | ||
715 | * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab | ||
716 | * it multiple times. | ||
717 | * | ||
718 | * RETURNS: | ||
719 | * 0 if @bdev can be claimed, -EBUSY otherwise. | ||
720 | */ | ||
721 | static int bd_prepare_to_claim(struct block_device *bdev, | ||
722 | struct block_device *whole, void *holder) | ||
723 | { | ||
724 | retry: | ||
725 | /* if someone else claimed, fail */ | ||
726 | if (!bd_may_claim(bdev, whole, holder)) | ||
727 | return -EBUSY; | ||
728 | |||
729 | /* if someone else is claiming, wait for it to finish */ | ||
730 | if (whole->bd_claiming && whole->bd_claiming != holder) { | ||
731 | wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0); | ||
732 | DEFINE_WAIT(wait); | ||
733 | |||
734 | prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); | ||
735 | spin_unlock(&bdev_lock); | ||
736 | schedule(); | ||
737 | finish_wait(wq, &wait); | ||
738 | spin_lock(&bdev_lock); | ||
739 | goto retry; | ||
740 | } | ||
741 | |||
742 | /* yay, all mine */ | ||
743 | return 0; | ||
744 | } | ||
745 | |||
746 | /** | ||
747 | * bd_start_claiming - start claiming a block device | ||
748 | * @bdev: block device of interest | ||
749 | * @holder: holder trying to claim @bdev | ||
750 | * | ||
751 | * @bdev is about to be opened exclusively. Check @bdev can be opened | ||
752 | * exclusively and mark that an exclusive open is in progress. Each | ||
753 | * successful call to this function must be matched with a call to | ||
754 | * either bd_claim() or bd_abort_claiming(). If this function | ||
755 | * succeeds, the matching bd_claim() is guaranteed to succeed. | ||
756 | * | ||
757 | * CONTEXT: | ||
758 | * Might sleep. | ||
759 | * | ||
760 | * RETURNS: | ||
761 | * Pointer to the block device containing @bdev on success, ERR_PTR() | ||
762 | * value on failure. | ||
763 | */ | ||
764 | static struct block_device *bd_start_claiming(struct block_device *bdev, | ||
765 | void *holder) | ||
766 | { | ||
767 | struct gendisk *disk; | ||
768 | struct block_device *whole; | ||
769 | int partno, err; | ||
770 | |||
771 | might_sleep(); | ||
772 | |||
773 | /* | ||
774 | * @bdev might not have been initialized properly yet, look up | ||
775 | * and grab the outer block device the hard way. | ||
776 | */ | ||
777 | disk = get_gendisk(bdev->bd_dev, &partno); | ||
778 | if (!disk) | ||
779 | return ERR_PTR(-ENXIO); | ||
780 | |||
781 | whole = bdget_disk(disk, 0); | ||
782 | put_disk(disk); | ||
783 | if (!whole) | ||
784 | return ERR_PTR(-ENOMEM); | ||
785 | |||
786 | /* prepare to claim, if successful, mark claiming in progress */ | ||
787 | spin_lock(&bdev_lock); | ||
788 | |||
789 | err = bd_prepare_to_claim(bdev, whole, holder); | ||
790 | if (err == 0) { | ||
791 | whole->bd_claiming = holder; | ||
792 | spin_unlock(&bdev_lock); | ||
793 | return whole; | ||
794 | } else { | ||
795 | spin_unlock(&bdev_lock); | ||
796 | bdput(whole); | ||
797 | return ERR_PTR(err); | ||
798 | } | ||
799 | } | ||
690 | 800 | ||
691 | /* now impose change */ | 801 | /* releases bdev_lock */ |
692 | if (res==0) { | 802 | static void __bd_abort_claiming(struct block_device *whole, void *holder) |
803 | { | ||
804 | BUG_ON(whole->bd_claiming != holder); | ||
805 | whole->bd_claiming = NULL; | ||
806 | wake_up_bit(&whole->bd_claiming, 0); | ||
807 | |||
808 | spin_unlock(&bdev_lock); | ||
809 | bdput(whole); | ||
810 | } | ||
811 | |||
812 | /** | ||
813 | * bd_abort_claiming - abort claiming a block device | ||
814 | * @whole: whole block device returned by bd_start_claiming() | ||
815 | * @holder: holder trying to claim @bdev | ||
816 | * | ||
817 | * Abort claiming of a block device started by bd_start_claiming(). Note that | ||
818 | * @whole is not the block device to be claimed but the whole device | ||
819 | * returned by bd_start_claiming(). | ||
820 | * | ||
821 | * CONTEXT: | ||
822 | * Grabs and releases bdev_lock. | ||
823 | */ | ||
824 | static void bd_abort_claiming(struct block_device *whole, void *holder) | ||
825 | { | ||
826 | spin_lock(&bdev_lock); | ||
827 | __bd_abort_claiming(whole, holder); /* releases bdev_lock */ | ||
828 | } | ||
829 | |||
830 | /** | ||
831 | * bd_claim - claim a block device | ||
832 | * @bdev: block device to claim | ||
833 | * @holder: holder trying to claim @bdev | ||
834 | * | ||
835 | * Try to claim @bdev which must have been opened successfully. This | ||
836 | * function may be called with or without preceding | ||
837 | * bd_start_claiming(). In the former case, this function is always | ||
838 | * successful and terminates the claiming block. | ||
839 | * | ||
840 | * CONTEXT: | ||
841 | * Might sleep. | ||
842 | * | ||
843 | * RETURNS: | ||
844 | * 0 if successful, -EBUSY if @bdev is already claimed. | ||
845 | */ | ||
846 | int bd_claim(struct block_device *bdev, void *holder) | ||
847 | { | ||
848 | struct block_device *whole = bdev->bd_contains; | ||
849 | int res; | ||
850 | |||
851 | might_sleep(); | ||
852 | |||
853 | spin_lock(&bdev_lock); | ||
854 | |||
855 | res = bd_prepare_to_claim(bdev, whole, holder); | ||
856 | if (res == 0) { | ||
693 | /* note that for a whole device bd_holders | 857 | /* note that for a whole device bd_holders |
694 | * will be incremented twice, and bd_holder will | 858 | * will be incremented twice, and bd_holder will |
695 | * be set to bd_claim before being set to holder | 859 | * be set to bd_claim before being set to holder |
696 | */ | 860 | */ |
697 | bdev->bd_contains->bd_holders ++; | 861 | whole->bd_holders++; |
698 | bdev->bd_contains->bd_holder = bd_claim; | 862 | whole->bd_holder = bd_claim; |
699 | bdev->bd_holders++; | 863 | bdev->bd_holders++; |
700 | bdev->bd_holder = holder; | 864 | bdev->bd_holder = holder; |
701 | } | 865 | } |
702 | spin_unlock(&bdev_lock); | 866 | |
867 | if (whole->bd_claiming) | ||
868 | __bd_abort_claiming(whole, holder); /* releases bdev_lock */ | ||
869 | else | ||
870 | spin_unlock(&bdev_lock); | ||
871 | |||
703 | return res; | 872 | return res; |
704 | } | 873 | } |
705 | |||
706 | EXPORT_SYMBOL(bd_claim); | 874 | EXPORT_SYMBOL(bd_claim); |
707 | 875 | ||
708 | void bd_release(struct block_device *bdev) | 876 | void bd_release(struct block_device *bdev) |
@@ -1316,6 +1484,7 @@ EXPORT_SYMBOL(blkdev_get); | |||
1316 | 1484 | ||
1317 | static int blkdev_open(struct inode * inode, struct file * filp) | 1485 | static int blkdev_open(struct inode * inode, struct file * filp) |
1318 | { | 1486 | { |
1487 | struct block_device *whole = NULL; | ||
1319 | struct block_device *bdev; | 1488 | struct block_device *bdev; |
1320 | int res; | 1489 | int res; |
1321 | 1490 | ||
@@ -1338,22 +1507,25 @@ static int blkdev_open(struct inode * inode, struct file * filp) | |||
1338 | if (bdev == NULL) | 1507 | if (bdev == NULL) |
1339 | return -ENOMEM; | 1508 | return -ENOMEM; |
1340 | 1509 | ||
1510 | if (filp->f_mode & FMODE_EXCL) { | ||
1511 | whole = bd_start_claiming(bdev, filp); | ||
1512 | if (IS_ERR(whole)) { | ||
1513 | bdput(bdev); | ||
1514 | return PTR_ERR(whole); | ||
1515 | } | ||
1516 | } | ||
1517 | |||
1341 | filp->f_mapping = bdev->bd_inode->i_mapping; | 1518 | filp->f_mapping = bdev->bd_inode->i_mapping; |
1342 | 1519 | ||
1343 | res = blkdev_get(bdev, filp->f_mode); | 1520 | res = blkdev_get(bdev, filp->f_mode); |
1344 | if (res) | ||
1345 | return res; | ||
1346 | 1521 | ||
1347 | if (filp->f_mode & FMODE_EXCL) { | 1522 | if (whole) { |
1348 | res = bd_claim(bdev, filp); | 1523 | if (res == 0) |
1349 | if (res) | 1524 | BUG_ON(bd_claim(bdev, filp) != 0); |
1350 | goto out_blkdev_put; | 1525 | else |
1526 | bd_abort_claiming(whole, filp); | ||
1351 | } | 1527 | } |
1352 | 1528 | ||
1353 | return 0; | ||
1354 | |||
1355 | out_blkdev_put: | ||
1356 | blkdev_put(bdev, filp->f_mode); | ||
1357 | return res; | 1529 | return res; |
1358 | } | 1530 | } |
1359 | 1531 | ||
@@ -1564,27 +1736,34 @@ EXPORT_SYMBOL(lookup_bdev); | |||
1564 | */ | 1736 | */ |
1565 | struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) | 1737 | struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) |
1566 | { | 1738 | { |
1567 | struct block_device *bdev; | 1739 | struct block_device *bdev, *whole; |
1568 | int error = 0; | 1740 | int error; |
1569 | 1741 | ||
1570 | bdev = lookup_bdev(path); | 1742 | bdev = lookup_bdev(path); |
1571 | if (IS_ERR(bdev)) | 1743 | if (IS_ERR(bdev)) |
1572 | return bdev; | 1744 | return bdev; |
1573 | 1745 | ||
1746 | whole = bd_start_claiming(bdev, holder); | ||
1747 | if (IS_ERR(whole)) { | ||
1748 | bdput(bdev); | ||
1749 | return whole; | ||
1750 | } | ||
1751 | |||
1574 | error = blkdev_get(bdev, mode); | 1752 | error = blkdev_get(bdev, mode); |
1575 | if (error) | 1753 | if (error) |
1576 | return ERR_PTR(error); | 1754 | goto out_abort_claiming; |
1755 | |||
1577 | error = -EACCES; | 1756 | error = -EACCES; |
1578 | if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) | 1757 | if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) |
1579 | goto blkdev_put; | 1758 | goto out_blkdev_put; |
1580 | error = bd_claim(bdev, holder); | ||
1581 | if (error) | ||
1582 | goto blkdev_put; | ||
1583 | 1759 | ||
1760 | BUG_ON(bd_claim(bdev, holder) != 0); | ||
1584 | return bdev; | 1761 | return bdev; |
1585 | 1762 | ||
1586 | blkdev_put: | 1763 | out_blkdev_put: |
1587 | blkdev_put(bdev, mode); | 1764 | blkdev_put(bdev, mode); |
1765 | out_abort_claiming: | ||
1766 | bd_abort_claiming(whole, holder); | ||
1588 | return ERR_PTR(error); | 1767 | return ERR_PTR(error); |
1589 | } | 1768 | } |
1590 | 1769 | ||
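Taken together, the rework above splits exclusive open into two phases: bd_start_claiming() records a pending claim on the whole device so no other holder can sneak in while the open runs, and bd_claim() then completes the claim (it cannot fail once the start succeeded), with bd_abort_claiming() rolling back on error. A condensed sketch of the pattern both call sites above follow; error handling is trimmed and this is an illustration, not a new helper in the patch:

  static struct block_device *open_claimed_sketch(struct block_device *bdev,
                                                  fmode_t mode, void *holder)
  {
          struct block_device *whole;
          int err;

          whole = bd_start_claiming(bdev, holder);   /* phase 1: mark claim pending */
          if (IS_ERR(whole))
                  return whole;

          err = blkdev_get(bdev, mode);              /* open while the claim is pending */
          if (err) {
                  bd_abort_claiming(whole, holder);  /* drop the pending claim */
                  return ERR_PTR(err);
          }

          BUG_ON(bd_claim(bdev, holder) != 0);       /* phase 2: guaranteed to succeed */
          return bdev;
  }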
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b34d32fdaaec..c6a4f459ad76 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c | |||
@@ -1589,7 +1589,7 @@ static void btrfs_issue_discard(struct block_device *bdev, | |||
1589 | u64 start, u64 len) | 1589 | u64 start, u64 len) |
1590 | { | 1590 | { |
1591 | blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, | 1591 | blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, |
1592 | DISCARD_FL_BARRIER); | 1592 | BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); |
1593 | } | 1593 | } |
1594 | 1594 | ||
1595 | static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, | 1595 | static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, |
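blkdev_issue_discard() switches to the same BLKDEV_IFL_* flags: former DISCARD_FL_BARRIER callers pass BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER where they must wait for the discard, while BLKDEV_IFL_BARRIER alone (as nilfs2 does further down) issues it without waiting. A sketch of a converted synchronous caller; the byte-based start/len are illustrative and the >> 9 converts them to 512-byte sectors:

  static int discard_byte_range(struct block_device *bdev, u64 start, u64 len)
  {
          return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
                                      BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
  }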
diff --git a/fs/buffer.c b/fs/buffer.c index c9c266db0624..08e422d56996 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
@@ -275,6 +275,7 @@ void invalidate_bdev(struct block_device *bdev) | |||
275 | return; | 275 | return; |
276 | 276 | ||
277 | invalidate_bh_lrus(); | 277 | invalidate_bh_lrus(); |
278 | lru_add_drain_all(); /* make sure all lru add caches are flushed */ | ||
278 | invalidate_mapping_pages(mapping, 0, -1); | 279 | invalidate_mapping_pages(mapping, 0, -1); |
279 | } | 280 | } |
280 | EXPORT_SYMBOL(invalidate_bdev); | 281 | EXPORT_SYMBOL(invalidate_bdev); |
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index 26289e8f4163..fcf7487734b6 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c | |||
@@ -90,6 +90,7 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) | |||
90 | * storage | 90 | * storage |
91 | */ | 91 | */ |
92 | if (needs_barrier) | 92 | if (needs_barrier) |
93 | blkdev_issue_flush(inode->i_sb->s_bdev, NULL); | 93 | blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, |
94 | BLKDEV_IFL_WAIT); | ||
94 | return ret; | 95 | return ret; |
95 | } | 96 | } |
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 0d0c3239c1cd..ef3d980e67cb 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c | |||
@@ -100,9 +100,11 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) | |||
100 | if (ext4_should_writeback_data(inode) && | 100 | if (ext4_should_writeback_data(inode) && |
101 | (journal->j_fs_dev != journal->j_dev) && | 101 | (journal->j_fs_dev != journal->j_dev) && |
102 | (journal->j_flags & JBD2_BARRIER)) | 102 | (journal->j_flags & JBD2_BARRIER)) |
103 | blkdev_issue_flush(inode->i_sb->s_bdev, NULL); | 103 | blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, |
104 | NULL, BLKDEV_IFL_WAIT); | ||
104 | jbd2_log_wait_commit(journal, commit_tid); | 105 | jbd2_log_wait_commit(journal, commit_tid); |
105 | } else if (journal->j_flags & JBD2_BARRIER) | 106 | } else if (journal->j_flags & JBD2_BARRIER) |
106 | blkdev_issue_flush(inode->i_sb->s_bdev, NULL); | 107 | blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, |
108 | BLKDEV_IFL_WAIT); | ||
107 | return ret; | 109 | return ret; |
108 | } | 110 | } |
diff --git a/fs/fcntl.c b/fs/fcntl.c index 0a140741b39e..f74d270ba155 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/dnotify.h> | 14 | #include <linux/dnotify.h> |
15 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
17 | #include <linux/pipe_fs_i.h> | ||
17 | #include <linux/security.h> | 18 | #include <linux/security.h> |
18 | #include <linux/ptrace.h> | 19 | #include <linux/ptrace.h> |
19 | #include <linux/signal.h> | 20 | #include <linux/signal.h> |
@@ -412,6 +413,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, | |||
412 | case F_NOTIFY: | 413 | case F_NOTIFY: |
413 | err = fcntl_dirnotify(fd, filp, arg); | 414 | err = fcntl_dirnotify(fd, filp, arg); |
414 | break; | 415 | break; |
416 | case F_SETPIPE_SZ: | ||
417 | case F_GETPIPE_SZ: | ||
418 | err = pipe_fcntl(filp, cmd, arg); | ||
419 | break; | ||
415 | default: | 420 | default: |
416 | break; | 421 | break; |
417 | } | 422 | } |
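The new F_SETPIPE_SZ and F_GETPIPE_SZ commands are forwarded to pipe_fcntl(), so a pipe's buffer can be resized and queried from user space. A minimal user-space illustration; it assumes the fcntl.h definitions added elsewhere in this series, and the kernel may round the requested size up:

  #define _GNU_SOURCE     /* for F_SETPIPE_SZ/F_GETPIPE_SZ with glibc */
  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
          int fds[2];

          if (pipe(fds))
                  return 1;

          if (fcntl(fds[1], F_SETPIPE_SZ, 1 << 20) < 0)   /* ask for a 1 MiB buffer */
                  perror("F_SETPIPE_SZ");

          printf("pipe buffer is now %d bytes\n", fcntl(fds[1], F_GETPIPE_SZ));
          return 0;
  }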
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 4b37f7cea4dd..437a7431b4ea 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
@@ -45,6 +45,7 @@ struct wb_writeback_args { | |||
45 | int for_kupdate:1; | 45 | int for_kupdate:1; |
46 | int range_cyclic:1; | 46 | int range_cyclic:1; |
47 | int for_background:1; | 47 | int for_background:1; |
48 | int sb_pinned:1; | ||
48 | }; | 49 | }; |
49 | 50 | ||
50 | /* | 51 | /* |
@@ -192,7 +193,8 @@ static void bdi_wait_on_work_clear(struct bdi_work *work) | |||
192 | } | 193 | } |
193 | 194 | ||
194 | static void bdi_alloc_queue_work(struct backing_dev_info *bdi, | 195 | static void bdi_alloc_queue_work(struct backing_dev_info *bdi, |
195 | struct wb_writeback_args *args) | 196 | struct wb_writeback_args *args, |
197 | int wait) | ||
196 | { | 198 | { |
197 | struct bdi_work *work; | 199 | struct bdi_work *work; |
198 | 200 | ||
@@ -204,6 +206,8 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi, | |||
204 | if (work) { | 206 | if (work) { |
205 | bdi_work_init(work, args); | 207 | bdi_work_init(work, args); |
206 | bdi_queue_work(bdi, work); | 208 | bdi_queue_work(bdi, work); |
209 | if (wait) | ||
210 | bdi_wait_on_work_clear(work); | ||
207 | } else { | 211 | } else { |
208 | struct bdi_writeback *wb = &bdi->wb; | 212 | struct bdi_writeback *wb = &bdi->wb; |
209 | 213 | ||
@@ -230,6 +234,11 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi, | |||
230 | .sync_mode = WB_SYNC_ALL, | 234 | .sync_mode = WB_SYNC_ALL, |
231 | .nr_pages = LONG_MAX, | 235 | .nr_pages = LONG_MAX, |
232 | .range_cyclic = 0, | 236 | .range_cyclic = 0, |
237 | /* | ||
238 | * Setting sb_pinned is not necessary for WB_SYNC_ALL, but | ||
239 | * let's make it explicitly clear. | ||
240 | */ | ||
241 | .sb_pinned = 1, | ||
233 | }; | 242 | }; |
234 | struct bdi_work work; | 243 | struct bdi_work work; |
235 | 244 | ||
@@ -245,21 +254,23 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi, | |||
245 | * @bdi: the backing device to write from | 254 | * @bdi: the backing device to write from |
246 | * @sb: write inodes from this super_block | 255 | * @sb: write inodes from this super_block |
247 | * @nr_pages: the number of pages to write | 256 | * @nr_pages: the number of pages to write |
257 | * @sb_locked: caller already holds sb umount sem. | ||
248 | * | 258 | * |
249 | * Description: | 259 | * Description: |
250 | * This does WB_SYNC_NONE opportunistic writeback. The IO is only | 260 | * This does WB_SYNC_NONE opportunistic writeback. The IO is only |
251 | * started when this function returns, we make no guarentees on | 261 | * started when this function returns, we make no guarentees on |
252 | * completion. Caller need not hold sb s_umount semaphore. | 262 | * completion. Caller specifies whether sb umount sem is held already or not. |
253 | * | 263 | * |
254 | */ | 264 | */ |
255 | void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, | 265 | void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, |
256 | long nr_pages) | 266 | long nr_pages, int sb_locked) |
257 | { | 267 | { |
258 | struct wb_writeback_args args = { | 268 | struct wb_writeback_args args = { |
259 | .sb = sb, | 269 | .sb = sb, |
260 | .sync_mode = WB_SYNC_NONE, | 270 | .sync_mode = WB_SYNC_NONE, |
261 | .nr_pages = nr_pages, | 271 | .nr_pages = nr_pages, |
262 | .range_cyclic = 1, | 272 | .range_cyclic = 1, |
273 | .sb_pinned = sb_locked, | ||
263 | }; | 274 | }; |
264 | 275 | ||
265 | /* | 276 | /* |
@@ -271,7 +282,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, | |||
271 | args.for_background = 1; | 282 | args.for_background = 1; |
272 | } | 283 | } |
273 | 284 | ||
274 | bdi_alloc_queue_work(bdi, &args); | 285 | bdi_alloc_queue_work(bdi, &args, sb_locked); |
275 | } | 286 | } |
276 | 287 | ||
277 | /* | 288 | /* |
@@ -452,11 +463,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) | |||
452 | 463 | ||
453 | BUG_ON(inode->i_state & I_SYNC); | 464 | BUG_ON(inode->i_state & I_SYNC); |
454 | 465 | ||
455 | /* Set I_SYNC, reset I_DIRTY */ | 466 | /* Set I_SYNC, reset I_DIRTY_PAGES */ |
456 | dirty = inode->i_state & I_DIRTY; | ||
457 | inode->i_state |= I_SYNC; | 467 | inode->i_state |= I_SYNC; |
458 | inode->i_state &= ~I_DIRTY; | 468 | inode->i_state &= ~I_DIRTY_PAGES; |
459 | |||
460 | spin_unlock(&inode_lock); | 469 | spin_unlock(&inode_lock); |
461 | 470 | ||
462 | ret = do_writepages(mapping, wbc); | 471 | ret = do_writepages(mapping, wbc); |
@@ -472,6 +481,15 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) | |||
472 | ret = err; | 481 | ret = err; |
473 | } | 482 | } |
474 | 483 | ||
484 | /* | ||
485 | * Some filesystems may redirty the inode during the writeback | ||
486 | * due to delalloc, clear dirty metadata flags right before | ||
487 | * write_inode() | ||
488 | */ | ||
489 | spin_lock(&inode_lock); | ||
490 | dirty = inode->i_state & I_DIRTY; | ||
491 | inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); | ||
492 | spin_unlock(&inode_lock); | ||
475 | /* Don't write the inode if only I_DIRTY_PAGES was set */ | 493 | /* Don't write the inode if only I_DIRTY_PAGES was set */ |
476 | if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { | 494 | if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { |
477 | int err = write_inode(inode, wbc); | 495 | int err = write_inode(inode, wbc); |
@@ -577,7 +595,7 @@ static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc, | |||
577 | /* | 595 | /* |
578 | * Caller must already hold the ref for this | 596 | * Caller must already hold the ref for this |
579 | */ | 597 | */ |
580 | if (wbc->sync_mode == WB_SYNC_ALL) { | 598 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) { |
581 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); | 599 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); |
582 | return SB_NOT_PINNED; | 600 | return SB_NOT_PINNED; |
583 | } | 601 | } |
@@ -751,6 +769,7 @@ static long wb_writeback(struct bdi_writeback *wb, | |||
751 | .for_kupdate = args->for_kupdate, | 769 | .for_kupdate = args->for_kupdate, |
752 | .for_background = args->for_background, | 770 | .for_background = args->for_background, |
753 | .range_cyclic = args->range_cyclic, | 771 | .range_cyclic = args->range_cyclic, |
772 | .sb_pinned = args->sb_pinned, | ||
754 | }; | 773 | }; |
755 | unsigned long oldest_jif; | 774 | unsigned long oldest_jif; |
756 | long wrote = 0; | 775 | long wrote = 0; |
@@ -852,6 +871,12 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) | |||
852 | unsigned long expired; | 871 | unsigned long expired; |
853 | long nr_pages; | 872 | long nr_pages; |
854 | 873 | ||
874 | /* | ||
875 | * When set to zero, disable periodic writeback | ||
876 | */ | ||
877 | if (!dirty_writeback_interval) | ||
878 | return 0; | ||
879 | |||
855 | expired = wb->last_old_flush + | 880 | expired = wb->last_old_flush + |
856 | msecs_to_jiffies(dirty_writeback_interval * 10); | 881 | msecs_to_jiffies(dirty_writeback_interval * 10); |
857 | if (time_before(jiffies, expired)) | 882 | if (time_before(jiffies, expired)) |
@@ -887,6 +912,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) | |||
887 | 912 | ||
888 | while ((work = get_next_work_item(bdi, wb)) != NULL) { | 913 | while ((work = get_next_work_item(bdi, wb)) != NULL) { |
889 | struct wb_writeback_args args = work->args; | 914 | struct wb_writeback_args args = work->args; |
915 | int post_clear; | ||
890 | 916 | ||
891 | /* | 917 | /* |
892 | * Override sync mode, in case we must wait for completion | 918 | * Override sync mode, in case we must wait for completion |
@@ -894,11 +920,13 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) | |||
894 | if (force_wait) | 920 | if (force_wait) |
895 | work->args.sync_mode = args.sync_mode = WB_SYNC_ALL; | 921 | work->args.sync_mode = args.sync_mode = WB_SYNC_ALL; |
896 | 922 | ||
923 | post_clear = args.sync_mode == WB_SYNC_ALL || args.sb_pinned; | ||
924 | |||
897 | /* | 925 | /* |
898 | * If this isn't a data integrity operation, just notify | 926 | * If this isn't a data integrity operation, just notify |
899 | * that we have seen this work and we are now starting it. | 927 | * that we have seen this work and we are now starting it. |
900 | */ | 928 | */ |
901 | if (args.sync_mode == WB_SYNC_NONE) | 929 | if (!post_clear) |
902 | wb_clear_pending(wb, work); | 930 | wb_clear_pending(wb, work); |
903 | 931 | ||
904 | wrote += wb_writeback(wb, &args); | 932 | wrote += wb_writeback(wb, &args); |
@@ -907,7 +935,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) | |||
907 | * This is a data integrity writeback, so only do the | 935 | * This is a data integrity writeback, so only do the |
908 | * notification when we have completed the work. | 936 | * notification when we have completed the work. |
909 | */ | 937 | */ |
910 | if (args.sync_mode == WB_SYNC_ALL) | 938 | if (post_clear) |
911 | wb_clear_pending(wb, work); | 939 | wb_clear_pending(wb, work); |
912 | } | 940 | } |
913 | 941 | ||
@@ -947,8 +975,17 @@ int bdi_writeback_task(struct bdi_writeback *wb) | |||
947 | break; | 975 | break; |
948 | } | 976 | } |
949 | 977 | ||
950 | wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); | 978 | if (dirty_writeback_interval) { |
951 | schedule_timeout_interruptible(wait_jiffies); | 979 | wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); |
980 | schedule_timeout_interruptible(wait_jiffies); | ||
981 | } else { | ||
982 | set_current_state(TASK_INTERRUPTIBLE); | ||
983 | if (list_empty_careful(&wb->bdi->work_list) && | ||
984 | !kthread_should_stop()) | ||
985 | schedule(); | ||
986 | __set_current_state(TASK_RUNNING); | ||
987 | } | ||
988 | |||
952 | try_to_freeze(); | 989 | try_to_freeze(); |
953 | } | 990 | } |
954 | 991 | ||
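With the two hunks above, setting the writeback interval to zero genuinely disables periodic writeback: wb_check_old_data_flush() returns early and the flusher thread sleeps until explicit work is queued instead of waking on a timeout. From user space the knob is the dirty_writeback_centisecs sysctl; an illustrative way to clear it (requires root):

  #include <stdio.h>

  /* Turn off periodic writeback; explicitly requested writeback still runs. */
  int main(void)
  {
          FILE *f = fopen("/proc/sys/vm/dirty_writeback_centisecs", "w");

          if (!f)
                  return 1;
          fputs("0\n", f);
          return fclose(f) != 0;
  }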
@@ -974,7 +1011,7 @@ static void bdi_writeback_all(struct super_block *sb, long nr_pages) | |||
974 | if (!bdi_has_dirty_io(bdi)) | 1011 | if (!bdi_has_dirty_io(bdi)) |
975 | continue; | 1012 | continue; |
976 | 1013 | ||
977 | bdi_alloc_queue_work(bdi, &args); | 1014 | bdi_alloc_queue_work(bdi, &args, 0); |
978 | } | 1015 | } |
979 | 1016 | ||
980 | rcu_read_unlock(); | 1017 | rcu_read_unlock(); |
@@ -1183,6 +1220,18 @@ static void wait_sb_inodes(struct super_block *sb) | |||
1183 | iput(old_inode); | 1220 | iput(old_inode); |
1184 | } | 1221 | } |
1185 | 1222 | ||
1223 | static void __writeback_inodes_sb(struct super_block *sb, int sb_locked) | ||
1224 | { | ||
1225 | unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); | ||
1226 | unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); | ||
1227 | long nr_to_write; | ||
1228 | |||
1229 | nr_to_write = nr_dirty + nr_unstable + | ||
1230 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); | ||
1231 | |||
1232 | bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked); | ||
1233 | } | ||
1234 | |||
1186 | /** | 1235 | /** |
1187 | * writeback_inodes_sb - writeback dirty inodes from given super_block | 1236 | * writeback_inodes_sb - writeback dirty inodes from given super_block |
1188 | * @sb: the superblock | 1237 | * @sb: the superblock |
@@ -1194,18 +1243,23 @@ static void wait_sb_inodes(struct super_block *sb) | |||
1194 | */ | 1243 | */ |
1195 | void writeback_inodes_sb(struct super_block *sb) | 1244 | void writeback_inodes_sb(struct super_block *sb) |
1196 | { | 1245 | { |
1197 | unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); | 1246 | __writeback_inodes_sb(sb, 0); |
1198 | unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); | ||
1199 | long nr_to_write; | ||
1200 | |||
1201 | nr_to_write = nr_dirty + nr_unstable + | ||
1202 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); | ||
1203 | |||
1204 | bdi_start_writeback(sb->s_bdi, sb, nr_to_write); | ||
1205 | } | 1247 | } |
1206 | EXPORT_SYMBOL(writeback_inodes_sb); | 1248 | EXPORT_SYMBOL(writeback_inodes_sb); |
1207 | 1249 | ||
1208 | /** | 1250 | /** |
1251 | * writeback_inodes_sb_locked - writeback dirty inodes from given super_block | ||
1252 | * @sb: the superblock | ||
1253 | * | ||
1254 | * Like writeback_inodes_sb(), except the caller already holds the | ||
1255 | * sb umount sem. | ||
1256 | */ | ||
1257 | void writeback_inodes_sb_locked(struct super_block *sb) | ||
1258 | { | ||
1259 | __writeback_inodes_sb(sb, 1); | ||
1260 | } | ||
1261 | |||
1262 | /** | ||
1209 | * writeback_inodes_sb_if_idle - start writeback if none underway | 1263 | * writeback_inodes_sb_if_idle - start writeback if none underway |
1210 | * @sb: the superblock | 1264 | * @sb: the superblock |
1211 | * | 1265 | * |
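The practical upshot of the sb_pinned plumbing: a caller that already holds sb->s_umount should use the new writeback_inodes_sb_locked(), which queues the work with sb_pinned set and waits for it to be handled instead of trying to pin the superblock again. A sketch of a hypothetical call site, not taken from this patch:

  /* Hypothetical filesystem path that already holds sb->s_umount. */
  static void flush_sb_data(struct super_block *sb)
  {
          WARN_ON(!rwsem_is_locked(&sb->s_umount));
          writeback_inodes_sb_locked(sb); /* sb_pinned=1: queue the work and wait */
  }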
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 8bce73ed4d8e..117fa4171f62 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c | |||
@@ -854,7 +854,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, | |||
854 | if ((start + nr_sects) != blk) { | 854 | if ((start + nr_sects) != blk) { |
855 | rv = blkdev_issue_discard(bdev, start, | 855 | rv = blkdev_issue_discard(bdev, start, |
856 | nr_sects, GFP_NOFS, | 856 | nr_sects, GFP_NOFS, |
857 | DISCARD_FL_BARRIER); | 857 | BLKDEV_IFL_WAIT | |
858 | BLKDEV_IFL_BARRIER); | ||
858 | if (rv) | 859 | if (rv) |
859 | goto fail; | 860 | goto fail; |
860 | nr_sects = 0; | 861 | nr_sects = 0; |
@@ -869,7 +870,7 @@ start_new_extent: | |||
869 | } | 870 | } |
870 | if (nr_sects) { | 871 | if (nr_sects) { |
871 | rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, | 872 | rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, |
872 | DISCARD_FL_BARRIER); | 873 | BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); |
873 | if (rv) | 874 | if (rv) |
874 | goto fail; | 875 | goto fail; |
875 | } | 876 | } |
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 30beb11ef928..076d1cc44f95 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c | |||
@@ -530,7 +530,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal) | |||
530 | */ | 530 | */ |
531 | if ((journal->j_fs_dev != journal->j_dev) && | 531 | if ((journal->j_fs_dev != journal->j_dev) && |
532 | (journal->j_flags & JBD2_BARRIER)) | 532 | (journal->j_flags & JBD2_BARRIER)) |
533 | blkdev_issue_flush(journal->j_fs_dev, NULL); | 533 | blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, |
534 | BLKDEV_IFL_WAIT); | ||
534 | if (!(journal->j_flags & JBD2_ABORT)) | 535 | if (!(journal->j_flags & JBD2_ABORT)) |
535 | jbd2_journal_update_superblock(journal, 1); | 536 | jbd2_journal_update_superblock(journal, 1); |
536 | return 0; | 537 | return 0; |
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 671da7fb7ffd..75716d3d2be0 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c | |||
@@ -717,7 +717,8 @@ start_journal_io: | |||
717 | if (commit_transaction->t_flushed_data_blocks && | 717 | if (commit_transaction->t_flushed_data_blocks && |
718 | (journal->j_fs_dev != journal->j_dev) && | 718 | (journal->j_fs_dev != journal->j_dev) && |
719 | (journal->j_flags & JBD2_BARRIER)) | 719 | (journal->j_flags & JBD2_BARRIER)) |
720 | blkdev_issue_flush(journal->j_fs_dev, NULL); | 720 | blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, |
721 | BLKDEV_IFL_WAIT); | ||
721 | 722 | ||
722 | /* Done it all: now write the commit record asynchronously. */ | 723 | /* Done it all: now write the commit record asynchronously. */ |
723 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, | 724 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, |
@@ -727,7 +728,8 @@ start_journal_io: | |||
727 | if (err) | 728 | if (err) |
728 | __jbd2_journal_abort_hard(journal); | 729 | __jbd2_journal_abort_hard(journal); |
729 | if (journal->j_flags & JBD2_BARRIER) | 730 | if (journal->j_flags & JBD2_BARRIER) |
730 | blkdev_issue_flush(journal->j_dev, NULL); | 731 | blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL, |
732 | BLKDEV_IFL_WAIT); | ||
731 | } | 733 | } |
732 | 734 | ||
733 | err = journal_finish_inode_data_buffers(journal, commit_transaction); | 735 | err = journal_finish_inode_data_buffers(journal, commit_transaction); |
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index a756168a21c2..8c1097327abc 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c | |||
@@ -674,7 +674,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, | |||
674 | start * sects_per_block, | 674 | start * sects_per_block, |
675 | nblocks * sects_per_block, | 675 | nblocks * sects_per_block, |
676 | GFP_NOFS, | 676 | GFP_NOFS, |
677 | DISCARD_FL_BARRIER); | 677 | BLKDEV_IFL_BARRIER); |
678 | if (ret < 0) | 678 | if (ret < 0) |
679 | return ret; | 679 | return ret; |
680 | nblocks = 0; | 680 | nblocks = 0; |
@@ -684,7 +684,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, | |||
684 | ret = blkdev_issue_discard(nilfs->ns_bdev, | 684 | ret = blkdev_issue_discard(nilfs->ns_bdev, |
685 | start * sects_per_block, | 685 | start * sects_per_block, |
686 | nblocks * sects_per_block, | 686 | nblocks * sects_per_block, |
687 | GFP_NOFS, DISCARD_FL_BARRIER); | 687 | GFP_NOFS, BLKDEV_IFL_BARRIER); |
688 | return ret; | 688 | return ret; |
689 | } | 689 | } |
690 | 690 | ||
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c index a97b477ac0fc..6921e7890be6 100644 --- a/fs/partitions/acorn.c +++ b/fs/partitions/acorn.c | |||
@@ -70,14 +70,14 @@ struct riscix_record { | |||
70 | 70 | ||
71 | #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ | 71 | #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ |
72 | defined(CONFIG_ACORN_PARTITION_ADFS) | 72 | defined(CONFIG_ACORN_PARTITION_ADFS) |
73 | static int | 73 | static int riscix_partition(struct parsed_partitions *state, |
74 | riscix_partition(struct parsed_partitions *state, struct block_device *bdev, | 74 | unsigned long first_sect, int slot, |
75 | unsigned long first_sect, int slot, unsigned long nr_sects) | 75 | unsigned long nr_sects) |
76 | { | 76 | { |
77 | Sector sect; | 77 | Sector sect; |
78 | struct riscix_record *rr; | 78 | struct riscix_record *rr; |
79 | 79 | ||
80 | rr = (struct riscix_record *)read_dev_sector(bdev, first_sect, §); | 80 | rr = read_part_sector(state, first_sect, §); |
81 | if (!rr) | 81 | if (!rr) |
82 | return -1; | 82 | return -1; |
83 | 83 | ||
@@ -123,9 +123,9 @@ struct linux_part { | |||
123 | 123 | ||
124 | #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ | 124 | #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ |
125 | defined(CONFIG_ACORN_PARTITION_ADFS) | 125 | defined(CONFIG_ACORN_PARTITION_ADFS) |
126 | static int | 126 | static int linux_partition(struct parsed_partitions *state, |
127 | linux_partition(struct parsed_partitions *state, struct block_device *bdev, | 127 | unsigned long first_sect, int slot, |
128 | unsigned long first_sect, int slot, unsigned long nr_sects) | 128 | unsigned long nr_sects) |
129 | { | 129 | { |
130 | Sector sect; | 130 | Sector sect; |
131 | struct linux_part *linuxp; | 131 | struct linux_part *linuxp; |
@@ -135,7 +135,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev, | |||
135 | 135 | ||
136 | put_partition(state, slot++, first_sect, size); | 136 | put_partition(state, slot++, first_sect, size); |
137 | 137 | ||
138 | linuxp = (struct linux_part *)read_dev_sector(bdev, first_sect, §); | 138 | linuxp = read_part_sector(state, first_sect, §); |
139 | if (!linuxp) | 139 | if (!linuxp) |
140 | return -1; | 140 | return -1; |
141 | 141 | ||
@@ -157,8 +157,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev, | |||
157 | #endif | 157 | #endif |
158 | 158 | ||
159 | #ifdef CONFIG_ACORN_PARTITION_CUMANA | 159 | #ifdef CONFIG_ACORN_PARTITION_CUMANA |
160 | int | 160 | int adfspart_check_CUMANA(struct parsed_partitions *state) |
161 | adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev) | ||
162 | { | 161 | { |
163 | unsigned long first_sector = 0; | 162 | unsigned long first_sector = 0; |
164 | unsigned int start_blk = 0; | 163 | unsigned int start_blk = 0; |
@@ -185,7 +184,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev | |||
185 | struct adfs_discrecord *dr; | 184 | struct adfs_discrecord *dr; |
186 | unsigned int nr_sects; | 185 | unsigned int nr_sects; |
187 | 186 | ||
188 | data = read_dev_sector(bdev, start_blk * 2 + 6, §); | 187 | data = read_part_sector(state, start_blk * 2 + 6, §); |
189 | if (!data) | 188 | if (!data) |
190 | return -1; | 189 | return -1; |
191 | 190 | ||
@@ -217,14 +216,14 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev | |||
217 | #ifdef CONFIG_ACORN_PARTITION_RISCIX | 216 | #ifdef CONFIG_ACORN_PARTITION_RISCIX |
218 | case PARTITION_RISCIX_SCSI: | 217 | case PARTITION_RISCIX_SCSI: |
219 | /* RISCiX - we don't know how to find the next one. */ | 218 | /* RISCiX - we don't know how to find the next one. */ |
220 | slot = riscix_partition(state, bdev, first_sector, | 219 | slot = riscix_partition(state, first_sector, slot, |
221 | slot, nr_sects); | 220 | nr_sects); |
222 | break; | 221 | break; |
223 | #endif | 222 | #endif |
224 | 223 | ||
225 | case PARTITION_LINUX: | 224 | case PARTITION_LINUX: |
226 | slot = linux_partition(state, bdev, first_sector, | 225 | slot = linux_partition(state, first_sector, slot, |
227 | slot, nr_sects); | 226 | nr_sects); |
228 | break; | 227 | break; |
229 | } | 228 | } |
230 | put_dev_sector(sect); | 229 | put_dev_sector(sect); |
@@ -249,8 +248,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev | |||
249 | * hda1 = ADFS partition on first drive. | 248 | * hda1 = ADFS partition on first drive. |
250 | * hda2 = non-ADFS partition. | 249 | * hda2 = non-ADFS partition. |
251 | */ | 250 | */ |
252 | int | 251 | int adfspart_check_ADFS(struct parsed_partitions *state) |
253 | adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev) | ||
254 | { | 252 | { |
255 | unsigned long start_sect, nr_sects, sectscyl, heads; | 253 | unsigned long start_sect, nr_sects, sectscyl, heads; |
256 | Sector sect; | 254 | Sector sect; |
@@ -259,7 +257,7 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev) | |||
259 | unsigned char id; | 257 | unsigned char id; |
260 | int slot = 1; | 258 | int slot = 1; |
261 | 259 | ||
262 | data = read_dev_sector(bdev, 6, §); | 260 | data = read_part_sector(state, 6, §); |
263 | if (!data) | 261 | if (!data) |
264 | return -1; | 262 | return -1; |
265 | 263 | ||
@@ -278,21 +276,21 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev) | |||
278 | /* | 276 | /* |
279 | * Work out start of non-adfs partition. | 277 | * Work out start of non-adfs partition. |
280 | */ | 278 | */ |
281 | nr_sects = (bdev->bd_inode->i_size >> 9) - start_sect; | 279 | nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect; |
282 | 280 | ||
283 | if (start_sect) { | 281 | if (start_sect) { |
284 | switch (id) { | 282 | switch (id) { |
285 | #ifdef CONFIG_ACORN_PARTITION_RISCIX | 283 | #ifdef CONFIG_ACORN_PARTITION_RISCIX |
286 | case PARTITION_RISCIX_SCSI: | 284 | case PARTITION_RISCIX_SCSI: |
287 | case PARTITION_RISCIX_MFM: | 285 | case PARTITION_RISCIX_MFM: |
288 | slot = riscix_partition(state, bdev, start_sect, | 286 | slot = riscix_partition(state, start_sect, slot, |
289 | slot, nr_sects); | 287 | nr_sects); |
290 | break; | 288 | break; |
291 | #endif | 289 | #endif |
292 | 290 | ||
293 | case PARTITION_LINUX: | 291 | case PARTITION_LINUX: |
294 | slot = linux_partition(state, bdev, start_sect, | 292 | slot = linux_partition(state, start_sect, slot, |
295 | slot, nr_sects); | 293 | nr_sects); |
296 | break; | 294 | break; |
297 | } | 295 | } |
298 | } | 296 | } |
@@ -308,10 +306,11 @@ struct ics_part { | |||
308 | __le32 size; | 306 | __le32 size; |
309 | }; | 307 | }; |
310 | 308 | ||
311 | static int adfspart_check_ICSLinux(struct block_device *bdev, unsigned long block) | 309 | static int adfspart_check_ICSLinux(struct parsed_partitions *state, |
310 | unsigned long block) | ||
312 | { | 311 | { |
313 | Sector sect; | 312 | Sector sect; |
314 | unsigned char *data = read_dev_sector(bdev, block, §); | 313 | unsigned char *data = read_part_sector(state, block, §); |
315 | int result = 0; | 314 | int result = 0; |
316 | 315 | ||
317 | if (data) { | 316 | if (data) { |
@@ -349,8 +348,7 @@ static inline int valid_ics_sector(const unsigned char *data) | |||
349 | * hda2 = ADFS partition 1 on first drive. | 348 | * hda2 = ADFS partition 1 on first drive. |
350 | * ..etc.. | 349 | * ..etc.. |
351 | */ | 350 | */ |
352 | int | 351 | int adfspart_check_ICS(struct parsed_partitions *state) |
353 | adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev) | ||
354 | { | 352 | { |
355 | const unsigned char *data; | 353 | const unsigned char *data; |
356 | const struct ics_part *p; | 354 | const struct ics_part *p; |
@@ -360,7 +358,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev) | |||
360 | /* | 358 | /* |
361 | * Try ICS style partitions - sector 0 contains partition info. | 359 | * Try ICS style partitions - sector 0 contains partition info. |
362 | */ | 360 | */ |
363 | data = read_dev_sector(bdev, 0, §); | 361 | data = read_part_sector(state, 0, §); |
364 | if (!data) | 362 | if (!data) |
365 | return -1; | 363 | return -1; |
366 | 364 | ||
@@ -392,7 +390,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev) | |||
392 | * partition is. We must not make this visible | 390 | * partition is. We must not make this visible |
393 | * to the filesystem. | 391 | * to the filesystem. |
394 | */ | 392 | */ |
395 | if (size > 1 && adfspart_check_ICSLinux(bdev, start)) { | 393 | if (size > 1 && adfspart_check_ICSLinux(state, start)) { |
396 | start += 1; | 394 | start += 1; |
397 | size -= 1; | 395 | size -= 1; |
398 | } | 396 | } |
@@ -446,8 +444,7 @@ static inline int valid_ptec_sector(const unsigned char *data) | |||
446 | * hda2 = ADFS partition 1 on first drive. | 444 | * hda2 = ADFS partition 1 on first drive. |
447 | * ..etc.. | 445 | * ..etc.. |
448 | */ | 446 | */ |
449 | int | 447 | int adfspart_check_POWERTEC(struct parsed_partitions *state) |
450 | adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev) | ||
451 | { | 448 | { |
452 | Sector sect; | 449 | Sector sect; |
453 | const unsigned char *data; | 450 | const unsigned char *data; |
@@ -455,7 +452,7 @@ adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bd | |||
455 | int slot = 1; | 452 | int slot = 1; |
456 | int i; | 453 | int i; |
457 | 454 | ||
458 | data = read_dev_sector(bdev, 0, &sect); | 455 | data = read_part_sector(state, 0, &sect); |
459 | if (!data) | 456 | if (!data) |
460 | return -1; | 457 | return -1; |
461 | 458 | ||
@@ -508,8 +505,7 @@ static const char eesox_name[] = { | |||
508 | * 1. The individual ADFS boot block entries that are placed on the disk. | 505 | * 1. The individual ADFS boot block entries that are placed on the disk. |
509 | * 2. The start address of the next entry. | 506 | * 2. The start address of the next entry. |
510 | */ | 507 | */ |
511 | int | 508 | int adfspart_check_EESOX(struct parsed_partitions *state) |
512 | adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev) | ||
513 | { | 509 | { |
514 | Sector sect; | 510 | Sector sect; |
515 | const unsigned char *data; | 511 | const unsigned char *data; |
@@ -518,7 +514,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev) | |||
518 | sector_t start = 0; | 514 | sector_t start = 0; |
519 | int i, slot = 1; | 515 | int i, slot = 1; |
520 | 516 | ||
521 | data = read_dev_sector(bdev, 7, &sect); | 517 | data = read_part_sector(state, 7, &sect); |
522 | if (!data) | 518 | if (!data) |
523 | return -1; | 519 | return -1; |
524 | 520 | ||
@@ -545,7 +541,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev) | |||
545 | if (i != 0) { | 541 | if (i != 0) { |
546 | sector_t size; | 542 | sector_t size; |
547 | 543 | ||
548 | size = get_capacity(bdev->bd_disk); | 544 | size = get_capacity(state->bdev->bd_disk); |
549 | put_partition(state, slot++, start, size - start); | 545 | put_partition(state, slot++, start, size - start); |
550 | printk("\n"); | 546 | printk("\n"); |
551 | } | 547 | } |
diff --git a/fs/partitions/acorn.h b/fs/partitions/acorn.h index 81fd50ecc080..ede828529692 100644 --- a/fs/partitions/acorn.h +++ b/fs/partitions/acorn.h | |||
@@ -7,8 +7,8 @@ | |||
7 | * format, and everyone stick to it? | 7 | * format, and everyone stick to it? |
8 | */ | 8 | */ |
9 | 9 | ||
10 | int adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev); | 10 | int adfspart_check_CUMANA(struct parsed_partitions *state); |
11 | int adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev); | 11 | int adfspart_check_ADFS(struct parsed_partitions *state); |
12 | int adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev); | 12 | int adfspart_check_ICS(struct parsed_partitions *state); |
13 | int adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev); | 13 | int adfspart_check_POWERTEC(struct parsed_partitions *state); |
14 | int adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev); | 14 | int adfspart_check_EESOX(struct parsed_partitions *state); |
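The acorn.h prototypes above show the conversion pattern repeated by every detector in this series: the explicit block_device argument disappears and the device is reached through the parsed_partitions state instead. A minimal sketch of the resulting shape of such a detector, using only the helpers visible in these hunks; the function name, the 0xAA magic byte and the slot/start/size values are illustrative placeholders, not taken from the patch:

static int example_partition(struct parsed_partitions *state)
{
        Sector sect;
        unsigned char *data;
        int slot = 1;

        /* read_part_sector() replaces read_dev_sector(bdev, ...); the
         * device now travels inside state->bdev. */
        data = read_part_sector(state, 0, &sect);
        if (!data)
                return -1;                      /* I/O error */

        if (data[0] != 0xAA) {                  /* stand-in label check */
                put_dev_sector(sect);
                return 0;                       /* not this format */
        }

        /* slot, start and length would come from the on-disk label */
        put_partition(state, slot++, 1, 2);
        put_dev_sector(sect);
        return 1;                               /* table recognised */
}

The return convention (-1 I/O error, 0 not recognised, 1 recognised) is the one the check_part[] loop in check.c relies on further down.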
diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c index 9917a8c360f2..ba443d4229f8 100644 --- a/fs/partitions/amiga.c +++ b/fs/partitions/amiga.c | |||
@@ -23,8 +23,7 @@ checksum_block(__be32 *m, int size) | |||
23 | return sum; | 23 | return sum; |
24 | } | 24 | } |
25 | 25 | ||
26 | int | 26 | int amiga_partition(struct parsed_partitions *state) |
27 | amiga_partition(struct parsed_partitions *state, struct block_device *bdev) | ||
28 | { | 27 | { |
29 | Sector sect; | 28 | Sector sect; |
30 | unsigned char *data; | 29 | unsigned char *data; |
@@ -38,11 +37,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
38 | for (blk = 0; ; blk++, put_dev_sector(sect)) { | 37 | for (blk = 0; ; blk++, put_dev_sector(sect)) { |
39 | if (blk == RDB_ALLOCATION_LIMIT) | 38 | if (blk == RDB_ALLOCATION_LIMIT) |
40 | goto rdb_done; | 39 | goto rdb_done; |
41 | data = read_dev_sector(bdev, blk, &sect); | 40 | data = read_part_sector(state, blk, &sect); |
42 | if (!data) { | 41 | if (!data) { |
43 | if (warn_no_part) | 42 | if (warn_no_part) |
44 | printk("Dev %s: unable to read RDB block %d\n", | 43 | printk("Dev %s: unable to read RDB block %d\n", |
45 | bdevname(bdev, b), blk); | 44 | bdevname(state->bdev, b), blk); |
46 | res = -1; | 45 | res = -1; |
47 | goto rdb_done; | 46 | goto rdb_done; |
48 | } | 47 | } |
@@ -64,7 +63,7 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
64 | } | 63 | } |
65 | 64 | ||
66 | printk("Dev %s: RDB in block %d has bad checksum\n", | 65 | printk("Dev %s: RDB in block %d has bad checksum\n", |
67 | bdevname(bdev, b), blk); | 66 | bdevname(state->bdev, b), blk); |
68 | } | 67 | } |
69 | 68 | ||
70 | /* blksize is blocks per 512 byte standard block */ | 69 | /* blksize is blocks per 512 byte standard block */ |
@@ -75,11 +74,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
75 | put_dev_sector(sect); | 74 | put_dev_sector(sect); |
76 | for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) { | 75 | for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) { |
77 | blk *= blksize; /* Read in terms partition table understands */ | 76 | blk *= blksize; /* Read in terms partition table understands */ |
78 | data = read_dev_sector(bdev, blk, &sect); | 77 | data = read_part_sector(state, blk, &sect); |
79 | if (!data) { | 78 | if (!data) { |
80 | if (warn_no_part) | 79 | if (warn_no_part) |
81 | printk("Dev %s: unable to read partition block %d\n", | 80 | printk("Dev %s: unable to read partition block %d\n", |
82 | bdevname(bdev, b), blk); | 81 | bdevname(state->bdev, b), blk); |
83 | res = -1; | 82 | res = -1; |
84 | goto rdb_done; | 83 | goto rdb_done; |
85 | } | 84 | } |
diff --git a/fs/partitions/amiga.h b/fs/partitions/amiga.h index 2f3e9ce22d53..d094585cadaa 100644 --- a/fs/partitions/amiga.h +++ b/fs/partitions/amiga.h | |||
@@ -2,5 +2,5 @@ | |||
2 | * fs/partitions/amiga.h | 2 | * fs/partitions/amiga.h |
3 | */ | 3 | */ |
4 | 4 | ||
5 | int amiga_partition(struct parsed_partitions *state, struct block_device *bdev); | 5 | int amiga_partition(struct parsed_partitions *state); |
6 | 6 | ||
diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c index 1f3572d5b755..4439ff1b6cec 100644 --- a/fs/partitions/atari.c +++ b/fs/partitions/atari.c | |||
@@ -30,7 +30,7 @@ static inline int OK_id(char *s) | |||
30 | memcmp (s, "RAW", 3) == 0 ; | 30 | memcmp (s, "RAW", 3) == 0 ; |
31 | } | 31 | } |
32 | 32 | ||
33 | int atari_partition(struct parsed_partitions *state, struct block_device *bdev) | 33 | int atari_partition(struct parsed_partitions *state) |
34 | { | 34 | { |
35 | Sector sect; | 35 | Sector sect; |
36 | struct rootsector *rs; | 36 | struct rootsector *rs; |
@@ -42,12 +42,12 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
42 | int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */ | 42 | int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */ |
43 | #endif | 43 | #endif |
44 | 44 | ||
45 | rs = (struct rootsector *) read_dev_sector(bdev, 0, &sect); | 45 | rs = read_part_sector(state, 0, &sect); |
46 | if (!rs) | 46 | if (!rs) |
47 | return -1; | 47 | return -1; |
48 | 48 | ||
49 | /* Verify this is an Atari rootsector: */ | 49 | /* Verify this is an Atari rootsector: */ |
50 | hd_size = bdev->bd_inode->i_size >> 9; | 50 | hd_size = state->bdev->bd_inode->i_size >> 9; |
51 | if (!VALID_PARTITION(&rs->part[0], hd_size) && | 51 | if (!VALID_PARTITION(&rs->part[0], hd_size) && |
52 | !VALID_PARTITION(&rs->part[1], hd_size) && | 52 | !VALID_PARTITION(&rs->part[1], hd_size) && |
53 | !VALID_PARTITION(&rs->part[2], hd_size) && | 53 | !VALID_PARTITION(&rs->part[2], hd_size) && |
@@ -84,7 +84,7 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
84 | printk(" XGM<"); | 84 | printk(" XGM<"); |
85 | partsect = extensect = be32_to_cpu(pi->st); | 85 | partsect = extensect = be32_to_cpu(pi->st); |
86 | while (1) { | 86 | while (1) { |
87 | xrs = (struct rootsector *)read_dev_sector(bdev, partsect, &sect2); | 87 | xrs = read_part_sector(state, partsect, &sect2); |
88 | if (!xrs) { | 88 | if (!xrs) { |
89 | printk (" block %ld read failed\n", partsect); | 89 | printk (" block %ld read failed\n", partsect); |
90 | put_dev_sector(sect); | 90 | put_dev_sector(sect); |
diff --git a/fs/partitions/atari.h b/fs/partitions/atari.h index 63186b00e135..fe2d32a89f36 100644 --- a/fs/partitions/atari.h +++ b/fs/partitions/atari.h | |||
@@ -31,4 +31,4 @@ struct rootsector | |||
31 | u16 checksum; /* checksum for bootable disks */ | 31 | u16 checksum; /* checksum for bootable disks */ |
32 | } __attribute__((__packed__)); | 32 | } __attribute__((__packed__)); |
33 | 33 | ||
34 | int atari_partition(struct parsed_partitions *state, struct block_device *bdev); | 34 | int atari_partition(struct parsed_partitions *state); |
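In the atari.c hunk the sanity check on the rootsector now derives the device size from state->bdev rather than from the dropped bdev argument. A quick worked example of that conversion, written as a sketch (the unsigned long declaration and the 40 MiB figure are illustrative):

        /* i_size is the device size in bytes; >> 9 converts it to 512-byte sectors */
        unsigned long hd_size = state->bdev->bd_inode->i_size >> 9;

        /* e.g. a 41943040-byte (40 MiB) device gives hd_size = 81920 sectors,
         * and VALID_PARTITION() rejects rootsector entries lying beyond that. */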
diff --git a/fs/partitions/check.c b/fs/partitions/check.c index e238ab23a9e7..5dcd4b0c5533 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c | |||
@@ -45,7 +45,7 @@ extern void md_autodetect_dev(dev_t dev); | |||
45 | 45 | ||
46 | int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/ | 46 | int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/ |
47 | 47 | ||
48 | static int (*check_part[])(struct parsed_partitions *, struct block_device *) = { | 48 | static int (*check_part[])(struct parsed_partitions *) = { |
49 | /* | 49 | /* |
50 | * Probe partition formats with tables at disk address 0 | 50 | * Probe partition formats with tables at disk address 0 |
51 | * that also have an ADFS boot block at 0xdc0. | 51 | * that also have an ADFS boot block at 0xdc0. |
@@ -161,10 +161,11 @@ check_partition(struct gendisk *hd, struct block_device *bdev) | |||
161 | struct parsed_partitions *state; | 161 | struct parsed_partitions *state; |
162 | int i, res, err; | 162 | int i, res, err; |
163 | 163 | ||
164 | state = kmalloc(sizeof(struct parsed_partitions), GFP_KERNEL); | 164 | state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL); |
165 | if (!state) | 165 | if (!state) |
166 | return NULL; | 166 | return NULL; |
167 | 167 | ||
168 | state->bdev = bdev; | ||
168 | disk_name(hd, 0, state->name); | 169 | disk_name(hd, 0, state->name); |
169 | printk(KERN_INFO " %s:", state->name); | 170 | printk(KERN_INFO " %s:", state->name); |
170 | if (isdigit(state->name[strlen(state->name)-1])) | 171 | if (isdigit(state->name[strlen(state->name)-1])) |
@@ -174,7 +175,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev) | |||
174 | i = res = err = 0; | 175 | i = res = err = 0; |
175 | while (!res && check_part[i]) { | 176 | while (!res && check_part[i]) { |
176 | memset(&state->parts, 0, sizeof(state->parts)); | 177 | memset(&state->parts, 0, sizeof(state->parts)); |
177 | res = check_part[i++](state, bdev); | 178 | res = check_part[i++](state); |
178 | if (res < 0) { | 179 | if (res < 0) { |
179 | /* We have hit an I/O error which we don't report now. | 180 | /* We have hit an I/O error which we don't report now. |
180 | * But record it, and let the others do their job. | 181 | * But record it, and let the others do their job. |
@@ -186,6 +187,8 @@ check_partition(struct gendisk *hd, struct block_device *bdev) | |||
186 | } | 187 | } |
187 | if (res > 0) | 188 | if (res > 0) |
188 | return state; | 189 | return state; |
190 | if (state->access_beyond_eod) | ||
191 | err = -ENOSPC; | ||
189 | if (err) | 192 | if (err) |
190 | /* The partition is unrecognized. So report I/O errors if there were any */ | 193 | /* The partition is unrecognized. So report I/O errors if there were any */ |
191 | res = err; | 194 | res = err; |
@@ -538,12 +541,33 @@ exit: | |||
538 | disk_part_iter_exit(&piter); | 541 | disk_part_iter_exit(&piter); |
539 | } | 542 | } |
540 | 543 | ||
544 | static bool disk_unlock_native_capacity(struct gendisk *disk) | ||
545 | { | ||
546 | const struct block_device_operations *bdops = disk->fops; | ||
547 | |||
548 | if (bdops->unlock_native_capacity && | ||
549 | !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) { | ||
550 | printk(KERN_CONT "enabling native capacity\n"); | ||
551 | bdops->unlock_native_capacity(disk); | ||
552 | disk->flags |= GENHD_FL_NATIVE_CAPACITY; | ||
553 | return true; | ||
554 | } else { | ||
555 | printk(KERN_CONT "truncated\n"); | ||
556 | return false; | ||
557 | } | ||
558 | } | ||
559 | |||
541 | int rescan_partitions(struct gendisk *disk, struct block_device *bdev) | 560 | int rescan_partitions(struct gendisk *disk, struct block_device *bdev) |
542 | { | 561 | { |
562 | struct parsed_partitions *state = NULL; | ||
543 | struct disk_part_iter piter; | 563 | struct disk_part_iter piter; |
544 | struct hd_struct *part; | 564 | struct hd_struct *part; |
545 | struct parsed_partitions *state; | ||
546 | int p, highest, res; | 565 | int p, highest, res; |
566 | rescan: | ||
567 | if (state && !IS_ERR(state)) { | ||
568 | kfree(state); | ||
569 | state = NULL; | ||
570 | } | ||
547 | 571 | ||
548 | if (bdev->bd_part_count) | 572 | if (bdev->bd_part_count) |
549 | return -EBUSY; | 573 | return -EBUSY; |
@@ -562,8 +586,32 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) | |||
562 | bdev->bd_invalidated = 0; | 586 | bdev->bd_invalidated = 0; |
563 | if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) | 587 | if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) |
564 | return 0; | 588 | return 0; |
565 | if (IS_ERR(state)) /* I/O error reading the partition table */ | 589 | if (IS_ERR(state)) { |
590 | /* | ||
591 | * I/O error reading the partition table. If any | ||
592 | * partition code tried to read beyond EOD, retry | ||
593 | * after unlocking native capacity. | ||
594 | */ | ||
595 | if (PTR_ERR(state) == -ENOSPC) { | ||
596 | printk(KERN_WARNING "%s: partition table beyond EOD, ", | ||
597 | disk->disk_name); | ||
598 | if (disk_unlock_native_capacity(disk)) | ||
599 | goto rescan; | ||
600 | } | ||
566 | return -EIO; | 601 | return -EIO; |
602 | } | ||
603 | /* | ||
604 | * If any partition code tried to read beyond EOD, try | ||
605 | * unlocking native capacity even if partition table is | ||
606 | * successfully read as we could be missing some partitions. | ||
607 | */ | ||
608 | if (state->access_beyond_eod) { | ||
609 | printk(KERN_WARNING | ||
610 | "%s: partition table partially beyond EOD, ", | ||
611 | disk->disk_name); | ||
612 | if (disk_unlock_native_capacity(disk)) | ||
613 | goto rescan; | ||
614 | } | ||
567 | 615 | ||
568 | /* tell userspace that the media / partition table may have changed */ | 616 | /* tell userspace that the media / partition table may have changed */ |
569 | kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); | 617 | kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); |
@@ -581,7 +629,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) | |||
581 | /* add partitions */ | 629 | /* add partitions */ |
582 | for (p = 1; p < state->limit; p++) { | 630 | for (p = 1; p < state->limit; p++) { |
583 | sector_t size, from; | 631 | sector_t size, from; |
584 | try_scan: | 632 | |
585 | size = state->parts[p].size; | 633 | size = state->parts[p].size; |
586 | if (!size) | 634 | if (!size) |
587 | continue; | 635 | continue; |
@@ -589,30 +637,21 @@ try_scan: | |||
589 | from = state->parts[p].from; | 637 | from = state->parts[p].from; |
590 | if (from >= get_capacity(disk)) { | 638 | if (from >= get_capacity(disk)) { |
591 | printk(KERN_WARNING | 639 | printk(KERN_WARNING |
592 | "%s: p%d ignored, start %llu is behind the end of the disk\n", | 640 | "%s: p%d start %llu is beyond EOD, ", |
593 | disk->disk_name, p, (unsigned long long) from); | 641 | disk->disk_name, p, (unsigned long long) from); |
642 | if (disk_unlock_native_capacity(disk)) | ||
643 | goto rescan; | ||
594 | continue; | 644 | continue; |
595 | } | 645 | } |
596 | 646 | ||
597 | if (from + size > get_capacity(disk)) { | 647 | if (from + size > get_capacity(disk)) { |
598 | const struct block_device_operations *bdops = disk->fops; | ||
599 | unsigned long long capacity; | ||
600 | |||
601 | printk(KERN_WARNING | 648 | printk(KERN_WARNING |
602 | "%s: p%d size %llu exceeds device capacity, ", | 649 | "%s: p%d size %llu extends beyond EOD, ", |
603 | disk->disk_name, p, (unsigned long long) size); | 650 | disk->disk_name, p, (unsigned long long) size); |
604 | 651 | ||
605 | if (bdops->set_capacity && | 652 | if (disk_unlock_native_capacity(disk)) { |
606 | (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) { | 653 | /* free state and restart */ |
607 | printk(KERN_CONT "enabling native capacity\n"); | 654 | goto rescan; |
608 | capacity = bdops->set_capacity(disk, ~0ULL); | ||
609 | disk->flags |= GENHD_FL_NATIVE_CAPACITY; | ||
610 | if (capacity > get_capacity(disk)) { | ||
611 | set_capacity(disk, capacity); | ||
612 | check_disk_size_change(disk, bdev); | ||
613 | bdev->bd_invalidated = 0; | ||
614 | } | ||
615 | goto try_scan; | ||
616 | } else { | 655 | } else { |
617 | /* | 656 | /* |
618 | * we can not ignore partitions of broken tables | 657 | * we can not ignore partitions of broken tables |
@@ -620,7 +659,6 @@ try_scan: | |||
620 | * we limit them to the end of the disk to avoid | 659 | * we limit them to the end of the disk to avoid |
621 | * creating invalid block devices | 660 | * creating invalid block devices |
622 | */ | 661 | */ |
623 | printk(KERN_CONT "limited to end of disk\n"); | ||
624 | size = get_capacity(disk) - from; | 662 | size = get_capacity(disk) - from; |
625 | } | 663 | } |
626 | } | 664 | } |
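Taken together, the check.c changes replace the old per-partition try_scan retry with a whole-table rescan: if the partition table itself or any entry points past the end of the device, rescan_partitions() frees the parsed state, asks the driver for its native capacity through disk_unlock_native_capacity(), and jumps back to the rescan label. The helper sets GENHD_FL_NATIVE_CAPACITY the first time it succeeds and answers "truncated" on every later call, so the retry fires at most once per device and the loop terminates. Assembled from the printk formats in the hunks above, the console output for an oversized first partition would look roughly like this (device name and size are purely illustrative):

        sdb: p1 size 16777216 extends beyond EOD, enabling native capacity
        sdb: p1 size 16777216 extends beyond EOD, truncated

The first line is printed on the initial pass; the second appears only if the partition still does not fit after the driver has exposed its full capacity, in which case the entry is clamped to the end of the disk as the retained comment in the hunk describes.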
diff --git a/fs/partitions/check.h b/fs/partitions/check.h index 98dbe1a84528..52f8bd399396 100644 --- a/fs/partitions/check.h +++ b/fs/partitions/check.h | |||
@@ -6,6 +6,7 @@ | |||
6 | * description. | 6 | * description. |
7 | */ | 7 | */ |
8 | struct parsed_partitions { | 8 | struct parsed_partitions { |
9 | struct block_device *bdev; | ||
9 | char name[BDEVNAME_SIZE]; | 10 | char name[BDEVNAME_SIZE]; |
10 | struct { | 11 | struct { |
11 | sector_t from; | 12 | sector_t from; |
@@ -14,8 +15,19 @@ struct parsed_partitions { | |||
14 | } parts[DISK_MAX_PARTS]; | 15 | } parts[DISK_MAX_PARTS]; |
15 | int next; | 16 | int next; |
16 | int limit; | 17 | int limit; |
18 | bool access_beyond_eod; | ||
17 | }; | 19 | }; |
18 | 20 | ||
21 | static inline void *read_part_sector(struct parsed_partitions *state, | ||
22 | sector_t n, Sector *p) | ||
23 | { | ||
24 | if (n >= get_capacity(state->bdev->bd_disk)) { | ||
25 | state->access_beyond_eod = true; | ||
26 | return NULL; | ||
27 | } | ||
28 | return read_dev_sector(state->bdev, n, p); | ||
29 | } | ||
30 | |||
19 | static inline void | 31 | static inline void |
20 | put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) | 32 | put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) |
21 | { | 33 | { |
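The read_part_sector() wrapper above is also the reason check_partition() now kzalloc()s the state: state->bdev and the new access_beyond_eod flag have to start out zeroed before the detectors run. A small hypothetical fragment showing what the wrapper buys its callers (the function name and the sector number are illustrative, the latter standing in for an untrusted on-disk value):

static int example_check(struct parsed_partitions *state)
{
        Sector sect;
        unsigned char *data;
        sector_t claimed = 1000000;     /* e.g. taken from an on-disk label */

        data = read_part_sector(state, claimed, &sect);
        if (!data) {
                /* Either a genuine I/O error, or "claimed" lay past the end
                 * of the device. In the second case state->access_beyond_eod
                 * is now true; check_partition() reports that as -ENOSPC and
                 * rescan_partitions() retries after unlocking the native
                 * capacity. The detector itself just signals failure. */
                return -1;
        }
        put_dev_sector(sect);
        return 0;                       /* nothing recognised in this sketch */
}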
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c index 91babdae7587..9e346c19bbba 100644 --- a/fs/partitions/efi.c +++ b/fs/partitions/efi.c | |||
@@ -140,8 +140,7 @@ efi_crc32(const void *buf, unsigned long len) | |||
140 | * the part[0] entry for this disk, and is the number of | 140 | * the part[0] entry for this disk, and is the number of |
141 | * physical sectors available on the disk. | 141 | * physical sectors available on the disk. |
142 | */ | 142 | */ |
143 | static u64 | 143 | static u64 last_lba(struct block_device *bdev) |
144 | last_lba(struct block_device *bdev) | ||
145 | { | 144 | { |
146 | if (!bdev || !bdev->bd_inode) | 145 | if (!bdev || !bdev->bd_inode) |
147 | return 0; | 146 | return 0; |
@@ -181,27 +180,28 @@ is_pmbr_valid(legacy_mbr *mbr) | |||
181 | 180 | ||
182 | /** | 181 | /** |
183 | * read_lba(): Read bytes from disk, starting at given LBA | 182 | * read_lba(): Read bytes from disk, starting at given LBA |
184 | * @bdev | 183 | * @state |
185 | * @lba | 184 | * @lba |
186 | * @buffer | 185 | * @buffer |
187 | * @size_t | 186 | * @size_t |
188 | * | 187 | * |
189 | * Description: Reads @count bytes from @bdev into @buffer. | 188 | * Description: Reads @count bytes from @state->bdev into @buffer. |
190 | * Returns number of bytes read on success, 0 on error. | 189 | * Returns number of bytes read on success, 0 on error. |
191 | */ | 190 | */ |
192 | static size_t | 191 | static size_t read_lba(struct parsed_partitions *state, |
193 | read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count) | 192 | u64 lba, u8 *buffer, size_t count) |
194 | { | 193 | { |
195 | size_t totalreadcount = 0; | 194 | size_t totalreadcount = 0; |
195 | struct block_device *bdev = state->bdev; | ||
196 | sector_t n = lba * (bdev_logical_block_size(bdev) / 512); | 196 | sector_t n = lba * (bdev_logical_block_size(bdev) / 512); |
197 | 197 | ||
198 | if (!bdev || !buffer || lba > last_lba(bdev)) | 198 | if (!buffer || lba > last_lba(bdev)) |
199 | return 0; | 199 | return 0; |
200 | 200 | ||
201 | while (count) { | 201 | while (count) { |
202 | int copied = 512; | 202 | int copied = 512; |
203 | Sector sect; | 203 | Sector sect; |
204 | unsigned char *data = read_dev_sector(bdev, n++, &sect); | 204 | unsigned char *data = read_part_sector(state, n++, &sect); |
205 | if (!data) | 205 | if (!data) |
206 | break; | 206 | break; |
207 | if (copied > count) | 207 | if (copied > count) |
@@ -217,19 +217,20 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count) | |||
217 | 217 | ||
218 | /** | 218 | /** |
219 | * alloc_read_gpt_entries(): reads partition entries from disk | 219 | * alloc_read_gpt_entries(): reads partition entries from disk |
220 | * @bdev | 220 | * @state |
221 | * @gpt - GPT header | 221 | * @gpt - GPT header |
222 | * | 222 | * |
223 | * Description: Returns ptes on success, NULL on error. | 223 | * Description: Returns ptes on success, NULL on error. |
224 | * Allocates space for PTEs based on information found in @gpt. | 224 | * Allocates space for PTEs based on information found in @gpt. |
225 | * Notes: remember to free pte when you're done! | 225 | * Notes: remember to free pte when you're done! |
226 | */ | 226 | */ |
227 | static gpt_entry * | 227 | static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, |
228 | alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt) | 228 | gpt_header *gpt) |
229 | { | 229 | { |
230 | size_t count; | 230 | size_t count; |
231 | gpt_entry *pte; | 231 | gpt_entry *pte; |
232 | if (!bdev || !gpt) | 232 | |
233 | if (!gpt) | ||
233 | return NULL; | 234 | return NULL; |
234 | 235 | ||
235 | count = le32_to_cpu(gpt->num_partition_entries) * | 236 | count = le32_to_cpu(gpt->num_partition_entries) * |
@@ -240,7 +241,7 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt) | |||
240 | if (!pte) | 241 | if (!pte) |
241 | return NULL; | 242 | return NULL; |
242 | 243 | ||
243 | if (read_lba(bdev, le64_to_cpu(gpt->partition_entry_lba), | 244 | if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba), |
244 | (u8 *) pte, | 245 | (u8 *) pte, |
245 | count) < count) { | 246 | count) < count) { |
246 | kfree(pte); | 247 | kfree(pte); |
@@ -252,27 +253,24 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt) | |||
252 | 253 | ||
253 | /** | 254 | /** |
254 | * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk | 255 | * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk |
255 | * @bdev | 256 | * @state |
256 | * @lba is the Logical Block Address of the partition table | 257 | * @lba is the Logical Block Address of the partition table |
257 | * | 258 | * |
258 | * Description: returns GPT header on success, NULL on error. Allocates | 259 | * Description: returns GPT header on success, NULL on error. Allocates |
259 | * and fills a GPT header starting at @ from @bdev. | 260 | * and fills a GPT header starting at @ from @state->bdev. |
260 | * Note: remember to free gpt when finished with it. | 261 | * Note: remember to free gpt when finished with it. |
261 | */ | 262 | */ |
262 | static gpt_header * | 263 | static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state, |
263 | alloc_read_gpt_header(struct block_device *bdev, u64 lba) | 264 | u64 lba) |
264 | { | 265 | { |
265 | gpt_header *gpt; | 266 | gpt_header *gpt; |
266 | unsigned ssz = bdev_logical_block_size(bdev); | 267 | unsigned ssz = bdev_logical_block_size(state->bdev); |
267 | |||
268 | if (!bdev) | ||
269 | return NULL; | ||
270 | 268 | ||
271 | gpt = kzalloc(ssz, GFP_KERNEL); | 269 | gpt = kzalloc(ssz, GFP_KERNEL); |
272 | if (!gpt) | 270 | if (!gpt) |
273 | return NULL; | 271 | return NULL; |
274 | 272 | ||
275 | if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) { | 273 | if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) { |
276 | kfree(gpt); | 274 | kfree(gpt); |
277 | gpt=NULL; | 275 | gpt=NULL; |
278 | return NULL; | 276 | return NULL; |
@@ -283,7 +281,7 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba) | |||
283 | 281 | ||
284 | /** | 282 | /** |
285 | * is_gpt_valid() - tests one GPT header and PTEs for validity | 283 | * is_gpt_valid() - tests one GPT header and PTEs for validity |
286 | * @bdev | 284 | * @state |
287 | * @lba is the logical block address of the GPT header to test | 285 | * @lba is the logical block address of the GPT header to test |
288 | * @gpt is a GPT header ptr, filled on return. | 286 | * @gpt is a GPT header ptr, filled on return. |
289 | * @ptes is a PTEs ptr, filled on return. | 287 | * @ptes is a PTEs ptr, filled on return. |
@@ -291,16 +289,15 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba) | |||
291 | * Description: returns 1 if valid, 0 on error. | 289 | * Description: returns 1 if valid, 0 on error. |
292 | * If valid, returns pointers to newly allocated GPT header and PTEs. | 290 | * If valid, returns pointers to newly allocated GPT header and PTEs. |
293 | */ | 291 | */ |
294 | static int | 292 | static int is_gpt_valid(struct parsed_partitions *state, u64 lba, |
295 | is_gpt_valid(struct block_device *bdev, u64 lba, | 293 | gpt_header **gpt, gpt_entry **ptes) |
296 | gpt_header **gpt, gpt_entry **ptes) | ||
297 | { | 294 | { |
298 | u32 crc, origcrc; | 295 | u32 crc, origcrc; |
299 | u64 lastlba; | 296 | u64 lastlba; |
300 | 297 | ||
301 | if (!bdev || !gpt || !ptes) | 298 | if (!ptes) |
302 | return 0; | 299 | return 0; |
303 | if (!(*gpt = alloc_read_gpt_header(bdev, lba))) | 300 | if (!(*gpt = alloc_read_gpt_header(state, lba))) |
304 | return 0; | 301 | return 0; |
305 | 302 | ||
306 | /* Check the GUID Partition Table signature */ | 303 | /* Check the GUID Partition Table signature */ |
@@ -336,7 +333,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba, | |||
336 | /* Check the first_usable_lba and last_usable_lba are | 333 | /* Check the first_usable_lba and last_usable_lba are |
337 | * within the disk. | 334 | * within the disk. |
338 | */ | 335 | */ |
339 | lastlba = last_lba(bdev); | 336 | lastlba = last_lba(state->bdev); |
340 | if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { | 337 | if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { |
341 | pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", | 338 | pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", |
342 | (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), | 339 | (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), |
@@ -350,7 +347,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba, | |||
350 | goto fail; | 347 | goto fail; |
351 | } | 348 | } |
352 | 349 | ||
353 | if (!(*ptes = alloc_read_gpt_entries(bdev, *gpt))) | 350 | if (!(*ptes = alloc_read_gpt_entries(state, *gpt))) |
354 | goto fail; | 351 | goto fail; |
355 | 352 | ||
356 | /* Check the GUID Partition Entry Array CRC */ | 353 | /* Check the GUID Partition Entry Array CRC */ |
@@ -495,7 +492,7 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) | |||
495 | 492 | ||
496 | /** | 493 | /** |
497 | * find_valid_gpt() - Search disk for valid GPT headers and PTEs | 494 | * find_valid_gpt() - Search disk for valid GPT headers and PTEs |
498 | * @bdev | 495 | * @state |
499 | * @gpt is a GPT header ptr, filled on return. | 496 | * @gpt is a GPT header ptr, filled on return. |
500 | * @ptes is a PTEs ptr, filled on return. | 497 | * @ptes is a PTEs ptr, filled on return. |
501 | * Description: Returns 1 if valid, 0 on error. | 498 | * Description: Returns 1 if valid, 0 on error. |
@@ -508,24 +505,25 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) | |||
508 | * This protects against devices which misreport their size, and forces | 505 | * This protects against devices which misreport their size, and forces |
509 | * the user to decide to use the Alternate GPT. | 506 | * the user to decide to use the Alternate GPT. |
510 | */ | 507 | */ |
511 | static int | 508 | static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, |
512 | find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes) | 509 | gpt_entry **ptes) |
513 | { | 510 | { |
514 | int good_pgpt = 0, good_agpt = 0, good_pmbr = 0; | 511 | int good_pgpt = 0, good_agpt = 0, good_pmbr = 0; |
515 | gpt_header *pgpt = NULL, *agpt = NULL; | 512 | gpt_header *pgpt = NULL, *agpt = NULL; |
516 | gpt_entry *pptes = NULL, *aptes = NULL; | 513 | gpt_entry *pptes = NULL, *aptes = NULL; |
517 | legacy_mbr *legacymbr; | 514 | legacy_mbr *legacymbr; |
518 | u64 lastlba; | 515 | u64 lastlba; |
519 | if (!bdev || !gpt || !ptes) | 516 | |
517 | if (!ptes) | ||
520 | return 0; | 518 | return 0; |
521 | 519 | ||
522 | lastlba = last_lba(bdev); | 520 | lastlba = last_lba(state->bdev); |
523 | if (!force_gpt) { | 521 | if (!force_gpt) { |
524 | /* This will be added to the EFI Spec. per Intel after v1.02. */ | 522 | /* This will be added to the EFI Spec. per Intel after v1.02. */ |
525 | legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL); | 523 | legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL); |
526 | if (legacymbr) { | 524 | if (legacymbr) { |
527 | read_lba(bdev, 0, (u8 *) legacymbr, | 525 | read_lba(state, 0, (u8 *) legacymbr, |
528 | sizeof (*legacymbr)); | 526 | sizeof (*legacymbr)); |
529 | good_pmbr = is_pmbr_valid(legacymbr); | 527 | good_pmbr = is_pmbr_valid(legacymbr); |
530 | kfree(legacymbr); | 528 | kfree(legacymbr); |
531 | } | 529 | } |
@@ -533,15 +531,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes) | |||
533 | goto fail; | 531 | goto fail; |
534 | } | 532 | } |
535 | 533 | ||
536 | good_pgpt = is_gpt_valid(bdev, GPT_PRIMARY_PARTITION_TABLE_LBA, | 534 | good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA, |
537 | &pgpt, &pptes); | 535 | &pgpt, &pptes); |
538 | if (good_pgpt) | 536 | if (good_pgpt) |
539 | good_agpt = is_gpt_valid(bdev, | 537 | good_agpt = is_gpt_valid(state, |
540 | le64_to_cpu(pgpt->alternate_lba), | 538 | le64_to_cpu(pgpt->alternate_lba), |
541 | &agpt, &aptes); | 539 | &agpt, &aptes); |
542 | if (!good_agpt && force_gpt) | 540 | if (!good_agpt && force_gpt) |
543 | good_agpt = is_gpt_valid(bdev, lastlba, | 541 | good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes); |
544 | &agpt, &aptes); | ||
545 | 542 | ||
546 | /* The obviously unsuccessful case */ | 543 | /* The obviously unsuccessful case */ |
547 | if (!good_pgpt && !good_agpt) | 544 | if (!good_pgpt && !good_agpt) |
@@ -583,9 +580,8 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes) | |||
583 | } | 580 | } |
584 | 581 | ||
585 | /** | 582 | /** |
586 | * efi_partition(struct parsed_partitions *state, struct block_device *bdev) | 583 | * efi_partition(struct parsed_partitions *state) |
587 | * @state | 584 | * @state |
588 | * @bdev | ||
589 | * | 585 | * |
590 | * Description: called from check.c, if the disk contains GPT | 586 | * Description: called from check.c, if the disk contains GPT |
591 | * partitions, sets up partition entries in the kernel. | 587 | * partitions, sets up partition entries in the kernel. |
@@ -602,15 +598,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes) | |||
602 | * 1 if successful | 598 | * 1 if successful |
603 | * | 599 | * |
604 | */ | 600 | */ |
605 | int | 601 | int efi_partition(struct parsed_partitions *state) |
606 | efi_partition(struct parsed_partitions *state, struct block_device *bdev) | ||
607 | { | 602 | { |
608 | gpt_header *gpt = NULL; | 603 | gpt_header *gpt = NULL; |
609 | gpt_entry *ptes = NULL; | 604 | gpt_entry *ptes = NULL; |
610 | u32 i; | 605 | u32 i; |
611 | unsigned ssz = bdev_logical_block_size(bdev) / 512; | 606 | unsigned ssz = bdev_logical_block_size(state->bdev) / 512; |
612 | 607 | ||
613 | if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) { | 608 | if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { |
614 | kfree(gpt); | 609 | kfree(gpt); |
615 | kfree(ptes); | 610 | kfree(ptes); |
616 | return 0; | 611 | return 0; |
@@ -623,7 +618,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
623 | u64 size = le64_to_cpu(ptes[i].ending_lba) - | 618 | u64 size = le64_to_cpu(ptes[i].ending_lba) - |
624 | le64_to_cpu(ptes[i].starting_lba) + 1ULL; | 619 | le64_to_cpu(ptes[i].starting_lba) + 1ULL; |
625 | 620 | ||
626 | if (!is_pte_valid(&ptes[i], last_lba(bdev))) | 621 | if (!is_pte_valid(&ptes[i], last_lba(state->bdev))) |
627 | continue; | 622 | continue; |
628 | 623 | ||
629 | put_partition(state, i+1, start * ssz, size * ssz); | 624 | put_partition(state, i+1, start * ssz, size * ssz); |
diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h index 6998b589abf9..b69ab729558f 100644 --- a/fs/partitions/efi.h +++ b/fs/partitions/efi.h | |||
@@ -110,7 +110,7 @@ typedef struct _legacy_mbr { | |||
110 | } __attribute__ ((packed)) legacy_mbr; | 110 | } __attribute__ ((packed)) legacy_mbr; |
111 | 111 | ||
112 | /* Functions */ | 112 | /* Functions */ |
113 | extern int efi_partition(struct parsed_partitions *state, struct block_device *bdev); | 113 | extern int efi_partition(struct parsed_partitions *state); |
114 | 114 | ||
115 | #endif | 115 | #endif |
116 | 116 | ||
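Both read_lba() and efi_partition() work in GPT logical blocks but address the media in 512-byte units through read_part_sector(). A short worked example of the scaling, assuming a 4096-byte logical block size (all sizes illustrative):

        unsigned ssz = bdev_logical_block_size(state->bdev) / 512;     /* = 8 */

        /* read_lba(): LBA 2 starts at 512-byte sector 2 * 8 = 16, and the
         * buffer is filled 512 bytes per read_part_sector() call from
         * sectors 16, 17, 18, ... until count bytes have been copied. */

        /* efi_partition(): a GPT entry with starting_lba 2048 and
         * ending_lba 264191 (262144 blocks) becomes a partition starting at
         * 512-byte sector 2048 * 8 = 16384 and spanning 262144 * 8 = 2097152
         * sectors: */
        put_partition(state, i + 1, start * ssz, size * ssz);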
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c index fc71aab08460..3e73de5967ff 100644 --- a/fs/partitions/ibm.c +++ b/fs/partitions/ibm.c | |||
@@ -58,9 +58,9 @@ cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) { | |||
58 | 58 | ||
59 | /* | 59 | /* |
60 | */ | 60 | */ |
61 | int | 61 | int ibm_partition(struct parsed_partitions *state) |
62 | ibm_partition(struct parsed_partitions *state, struct block_device *bdev) | ||
63 | { | 62 | { |
63 | struct block_device *bdev = state->bdev; | ||
64 | int blocksize, res; | 64 | int blocksize, res; |
65 | loff_t i_size, offset, size, fmt_size; | 65 | loff_t i_size, offset, size, fmt_size; |
66 | dasd_information2_t *info; | 66 | dasd_information2_t *info; |
@@ -100,7 +100,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
100 | /* | 100 | /* |
101 | * Get volume label, extract name and type. | 101 | * Get volume label, extract name and type. |
102 | */ | 102 | */ |
103 | data = read_dev_sector(bdev, info->label_block*(blocksize/512), &sect); | 103 | data = read_part_sector(state, info->label_block*(blocksize/512), |
104 | &sect); | ||
104 | if (data == NULL) | 105 | if (data == NULL) |
105 | goto out_readerr; | 106 | goto out_readerr; |
106 | 107 | ||
@@ -193,8 +194,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
193 | */ | 194 | */ |
194 | blk = cchhb2blk(&label->vol.vtoc, geo) + 1; | 195 | blk = cchhb2blk(&label->vol.vtoc, geo) + 1; |
195 | counter = 0; | 196 | counter = 0; |
196 | data = read_dev_sector(bdev, blk * (blocksize/512), | 197 | data = read_part_sector(state, blk * (blocksize/512), |
197 | &sect); | 198 | &sect); |
198 | while (data != NULL) { | 199 | while (data != NULL) { |
199 | struct vtoc_format1_label f1; | 200 | struct vtoc_format1_label f1; |
200 | 201 | ||
@@ -208,9 +209,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
208 | || f1.DS1FMTID == _ascebc['7'] | 209 | || f1.DS1FMTID == _ascebc['7'] |
209 | || f1.DS1FMTID == _ascebc['9']) { | 210 | || f1.DS1FMTID == _ascebc['9']) { |
210 | blk++; | 211 | blk++; |
211 | data = read_dev_sector(bdev, blk * | 212 | data = read_part_sector(state, |
212 | (blocksize/512), | 213 | blk * (blocksize/512), &sect); |
213 | &sect); | |
214 | continue; | 214 | continue; |
215 | } | 215 | } |
216 | 216 | ||
@@ -230,9 +230,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
230 | size * (blocksize >> 9)); | 230 | size * (blocksize >> 9)); |
231 | counter++; | 231 | counter++; |
232 | blk++; | 232 | blk++; |
233 | data = read_dev_sector(bdev, | 233 | data = read_part_sector(state, |
234 | blk * (blocksize/512), | 234 | blk * (blocksize/512), &sect); |
235 | &sect); | |
236 | } | 235 | } |
237 | 236 | ||
238 | if (!data) | 237 | if (!data) |
diff --git a/fs/partitions/ibm.h b/fs/partitions/ibm.h index 31f85a6ac459..08fb0804a812 100644 --- a/fs/partitions/ibm.h +++ b/fs/partitions/ibm.h | |||
@@ -1 +1 @@ | |||
int ibm_partition(struct parsed_partitions *, struct block_device *); | int ibm_partition(struct parsed_partitions *); | ||
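The ibm.c hunks apply the same idea to DASD volumes, whose block size is usually larger than 512 bytes: every block number taken from the volume label or VTOC is rescaled before it reaches read_part_sector(). As a sketch, with an illustrative 4096-byte DASD block size, label block 2 is read as 512-byte sector 2 * (4096/512) = 16:

        /* DASD blocks -> 512-byte sectors before reading VTOC labels */
        data = read_part_sector(state, blk * (blocksize / 512), &sect);

Partition extents are registered with the same scaling, size * (blocksize >> 9), as the last hunk above shows.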
diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c index 176d89bcf123..1cc928bb762f 100644 --- a/fs/partitions/karma.c +++ b/fs/partitions/karma.c | |||
@@ -9,7 +9,7 @@ | |||
9 | #include "check.h" | 9 | #include "check.h" |
10 | #include "karma.h" | 10 | #include "karma.h" |
11 | 11 | ||
12 | int karma_partition(struct parsed_partitions *state, struct block_device *bdev) | 12 | int karma_partition(struct parsed_partitions *state) |
13 | { | 13 | { |
14 | int i; | 14 | int i; |
15 | int slot = 1; | 15 | int slot = 1; |
@@ -29,7 +29,7 @@ int karma_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
29 | } __attribute__((packed)) *label; | 29 | } __attribute__((packed)) *label; |
30 | struct d_partition *p; | 30 | struct d_partition *p; |
31 | 31 | ||
32 | data = read_dev_sector(bdev, 0, &sect); | 32 | data = read_part_sector(state, 0, &sect); |
33 | if (!data) | 33 | if (!data) |
34 | return -1; | 34 | return -1; |
35 | 35 | ||
diff --git a/fs/partitions/karma.h b/fs/partitions/karma.h index ecf7d3f2a3d8..c764b2e9df21 100644 --- a/fs/partitions/karma.h +++ b/fs/partitions/karma.h | |||
@@ -4,5 +4,5 @@ | |||
4 | 4 | ||
5 | #define KARMA_LABEL_MAGIC 0xAB56 | 5 | #define KARMA_LABEL_MAGIC 0xAB56 |
6 | 6 | ||
7 | int karma_partition(struct parsed_partitions *state, struct block_device *bdev); | 7 | int karma_partition(struct parsed_partitions *state); |
8 | 8 | ||
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c index 8652fb99e962..3ceca05b668c 100644 --- a/fs/partitions/ldm.c +++ b/fs/partitions/ldm.c | |||
@@ -309,7 +309,7 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1, | |||
309 | 309 | ||
310 | /** | 310 | /** |
311 | * ldm_validate_privheads - Compare the primary privhead with its backups | 311 | * ldm_validate_privheads - Compare the primary privhead with its backups |
312 | * @bdev: Device holding the LDM Database | 312 | * @state: Partition check state including device holding the LDM Database |
313 | * @ph1: Memory struct to fill with ph contents | 313 | * @ph1: Memory struct to fill with ph contents |
314 | * | 314 | * |
315 | * Read and compare all three privheads from disk. | 315 | * Read and compare all three privheads from disk. |
@@ -321,8 +321,8 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1, | |||
321 | * Return: 'true' Success | 321 | * Return: 'true' Success |
322 | * 'false' Error | 322 | * 'false' Error |
323 | */ | 323 | */ |
324 | static bool ldm_validate_privheads (struct block_device *bdev, | 324 | static bool ldm_validate_privheads(struct parsed_partitions *state, |
325 | struct privhead *ph1) | 325 | struct privhead *ph1) |
326 | { | 326 | { |
327 | static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 }; | 327 | static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 }; |
328 | struct privhead *ph[3] = { ph1 }; | 328 | struct privhead *ph[3] = { ph1 }; |
@@ -332,7 +332,7 @@ static bool ldm_validate_privheads (struct block_device *bdev, | |||
332 | long num_sects; | 332 | long num_sects; |
333 | int i; | 333 | int i; |
334 | 334 | ||
335 | BUG_ON (!bdev || !ph1); | 335 | BUG_ON (!state || !ph1); |
336 | 336 | ||
337 | ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL); | 337 | ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL); |
338 | ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL); | 338 | ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL); |
@@ -346,8 +346,8 @@ static bool ldm_validate_privheads (struct block_device *bdev, | |||
346 | 346 | ||
347 | /* Read and parse privheads */ | 347 | /* Read and parse privheads */ |
348 | for (i = 0; i < 3; i++) { | 348 | for (i = 0; i < 3; i++) { |
349 | data = read_dev_sector (bdev, | 349 | data = read_part_sector(state, ph[0]->config_start + off[i], |
350 | ph[0]->config_start + off[i], &sect); | 350 | &sect); |
351 | if (!data) { | 351 | if (!data) { |
352 | ldm_crit ("Disk read failed."); | 352 | ldm_crit ("Disk read failed."); |
353 | goto out; | 353 | goto out; |
@@ -363,7 +363,7 @@ static bool ldm_validate_privheads (struct block_device *bdev, | |||
363 | } | 363 | } |
364 | } | 364 | } |
365 | 365 | ||
366 | num_sects = bdev->bd_inode->i_size >> 9; | 366 | num_sects = state->bdev->bd_inode->i_size >> 9; |
367 | 367 | ||
368 | if ((ph[0]->config_start > num_sects) || | 368 | if ((ph[0]->config_start > num_sects) || |
369 | ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { | 369 | ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { |
@@ -397,20 +397,20 @@ out: | |||
397 | 397 | ||
398 | /** | 398 | /** |
399 | * ldm_validate_tocblocks - Validate the table of contents and its backups | 399 | * ldm_validate_tocblocks - Validate the table of contents and its backups |
400 | * @bdev: Device holding the LDM Database | 400 | * @state: Partition check state including device holding the LDM Database |
401 | * @base: Offset, into @bdev, of the database | 401 | * @base: Offset, into @state->bdev, of the database |
402 | * @ldb: Cache of the database structures | 402 | * @ldb: Cache of the database structures |
403 | * | 403 | * |
404 | * Find and compare the four tables of contents of the LDM Database stored on | 404 | * Find and compare the four tables of contents of the LDM Database stored on |
405 | * @bdev and return the parsed information into @toc1. | 405 | * @state->bdev and return the parsed information into @toc1. |
406 | * | 406 | * |
407 | * The offsets and sizes of the configs are range-checked against a privhead. | 407 | * The offsets and sizes of the configs are range-checked against a privhead. |
408 | * | 408 | * |
409 | * Return: 'true' @toc1 contains validated TOCBLOCK info | 409 | * Return: 'true' @toc1 contains validated TOCBLOCK info |
410 | * 'false' @toc1 contents are undefined | 410 | * 'false' @toc1 contents are undefined |
411 | */ | 411 | */ |
412 | static bool ldm_validate_tocblocks(struct block_device *bdev, | 412 | static bool ldm_validate_tocblocks(struct parsed_partitions *state, |
413 | unsigned long base, struct ldmdb *ldb) | 413 | unsigned long base, struct ldmdb *ldb) |
414 | { | 414 | { |
415 | static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4}; | 415 | static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4}; |
416 | struct tocblock *tb[4]; | 416 | struct tocblock *tb[4]; |
@@ -420,7 +420,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev, | |||
420 | int i, nr_tbs; | 420 | int i, nr_tbs; |
421 | bool result = false; | 421 | bool result = false; |
422 | 422 | ||
423 | BUG_ON(!bdev || !ldb); | 423 | BUG_ON(!state || !ldb); |
424 | ph = &ldb->ph; | 424 | ph = &ldb->ph; |
425 | tb[0] = &ldb->toc; | 425 | tb[0] = &ldb->toc; |
426 | tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL); | 426 | tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL); |
@@ -437,7 +437,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev, | |||
437 | * skip any that fail as long as we get at least one valid TOCBLOCK. | 437 | * skip any that fail as long as we get at least one valid TOCBLOCK. |
438 | */ | 438 | */ |
439 | for (nr_tbs = i = 0; i < 4; i++) { | 439 | for (nr_tbs = i = 0; i < 4; i++) { |
440 | data = read_dev_sector(bdev, base + off[i], &sect); | 440 | data = read_part_sector(state, base + off[i], &sect); |
441 | if (!data) { | 441 | if (!data) { |
442 | ldm_error("Disk read failed for TOCBLOCK %d.", i); | 442 | ldm_error("Disk read failed for TOCBLOCK %d.", i); |
443 | continue; | 443 | continue; |
@@ -473,7 +473,7 @@ err: | |||
473 | 473 | ||
474 | /** | 474 | /** |
475 | * ldm_validate_vmdb - Read the VMDB and validate it | 475 | * ldm_validate_vmdb - Read the VMDB and validate it |
476 | * @bdev: Device holding the LDM Database | 476 | * @state: Partition check state including device holding the LDM Database |
477 | * @base: Offset, into @bdev, of the database | 477 | * @base: Offset, into @bdev, of the database |
478 | * @ldb: Cache of the database structures | 478 | * @ldb: Cache of the database structures |
479 | * | 479 | * |
@@ -483,8 +483,8 @@ err: | |||
483 | * Return: 'true' @ldb contains validated VBDB info | 483 | * Return: 'true' @ldb contains validated VBDB info |
484 | * 'false' @ldb contents are undefined | 484 | * 'false' @ldb contents are undefined |
485 | */ | 485 | */ |
486 | static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base, | 486 | static bool ldm_validate_vmdb(struct parsed_partitions *state, |
487 | struct ldmdb *ldb) | 487 | unsigned long base, struct ldmdb *ldb) |
488 | { | 488 | { |
489 | Sector sect; | 489 | Sector sect; |
490 | u8 *data; | 490 | u8 *data; |
@@ -492,12 +492,12 @@ static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base, | |||
492 | struct vmdb *vm; | 492 | struct vmdb *vm; |
493 | struct tocblock *toc; | 493 | struct tocblock *toc; |
494 | 494 | ||
495 | BUG_ON (!bdev || !ldb); | 495 | BUG_ON (!state || !ldb); |
496 | 496 | ||
497 | vm = &ldb->vm; | 497 | vm = &ldb->vm; |
498 | toc = &ldb->toc; | 498 | toc = &ldb->toc; |
499 | 499 | ||
500 | data = read_dev_sector (bdev, base + OFF_VMDB, &sect); | 500 | data = read_part_sector(state, base + OFF_VMDB, &sect); |
501 | if (!data) { | 501 | if (!data) { |
502 | ldm_crit ("Disk read failed."); | 502 | ldm_crit ("Disk read failed."); |
503 | return false; | 503 | return false; |
@@ -534,21 +534,21 @@ out: | |||
534 | 534 | ||
535 | /** | 535 | /** |
536 | * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk | 536 | * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk |
537 | * @bdev: Device holding the LDM Database | 537 | * @state: Partition check state including device holding the LDM Database |
538 | * | 538 | * |
539 | * This function provides a weak test to decide whether the device is a dynamic | 539 | * This function provides a weak test to decide whether the device is a dynamic |
540 | * disk or not. It looks for an MS-DOS-style partition table containing at | 540 | * disk or not. It looks for an MS-DOS-style partition table containing at |
541 | * least one partition of type 0x42 (formerly SFS, now used by Windows for | 541 | * least one partition of type 0x42 (formerly SFS, now used by Windows for |
542 | * dynamic disks). | 542 | * dynamic disks). |
543 | * | 543 | * |
544 | * N.B. The only possible error can come from the read_dev_sector and that is | 544 | * N.B. The only possible error can come from the read_part_sector and that is |
545 | * only likely to happen if the underlying device is strange. If that IS | 545 | * only likely to happen if the underlying device is strange. If that IS |
546 | * the case we should return zero to let someone else try. | 546 | * the case we should return zero to let someone else try. |
547 | * | 547 | * |
548 | * Return: 'true' @bdev is a dynamic disk | 548 | * Return: 'true' @state->bdev is a dynamic disk |
549 | * 'false' @bdev is not a dynamic disk, or an error occurred | 549 | * 'false' @state->bdev is not a dynamic disk, or an error occurred |
550 | */ | 550 | */ |
551 | static bool ldm_validate_partition_table (struct block_device *bdev) | 551 | static bool ldm_validate_partition_table(struct parsed_partitions *state) |
552 | { | 552 | { |
553 | Sector sect; | 553 | Sector sect; |
554 | u8 *data; | 554 | u8 *data; |
@@ -556,9 +556,9 @@ static bool ldm_validate_partition_table (struct block_device *bdev) | |||
556 | int i; | 556 | int i; |
557 | bool result = false; | 557 | bool result = false; |
558 | 558 | ||
559 | BUG_ON (!bdev); | 559 | BUG_ON(!state); |
560 | 560 | ||
561 | data = read_dev_sector (bdev, 0, &sect); | 561 | data = read_part_sector(state, 0, &sect); |
562 | if (!data) { | 562 | if (!data) { |
563 | ldm_crit ("Disk read failed."); | 563 | ldm_crit ("Disk read failed."); |
564 | return false; | 564 | return false; |
@@ -1391,8 +1391,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb) | |||
1391 | 1391 | ||
1392 | /** | 1392 | /** |
1393 | * ldm_get_vblks - Read the on-disk database of VBLKs into memory | 1393 | * ldm_get_vblks - Read the on-disk database of VBLKs into memory |
1394 | * @bdev: Device holding the LDM Database | 1394 | * @state: Partition check state including device holding the LDM Database |
1395 | * @base: Offset, into @bdev, of the database | 1395 | * @base: Offset, into @state->bdev, of the database |
1396 | * @ldb: Cache of the database structures | 1396 | * @ldb: Cache of the database structures |
1397 | * | 1397 | * |
1398 | * To use the information from the VBLKs, they need to be read from the disk, | 1398 | * To use the information from the VBLKs, they need to be read from the disk, |
@@ -1401,8 +1401,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb) | |||
1401 | * Return: 'true' All the VBLKs were read successfully | 1401 | * Return: 'true' All the VBLKs were read successfully |
1402 | * 'false' An error occurred | 1402 | * 'false' An error occurred |
1403 | */ | 1403 | */ |
1404 | static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, | 1404 | static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base, |
1405 | struct ldmdb *ldb) | 1405 | struct ldmdb *ldb) |
1406 | { | 1406 | { |
1407 | int size, perbuf, skip, finish, s, v, recs; | 1407 | int size, perbuf, skip, finish, s, v, recs; |
1408 | u8 *data = NULL; | 1408 | u8 *data = NULL; |
@@ -1410,7 +1410,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, | |||
1410 | bool result = false; | 1410 | bool result = false; |
1411 | LIST_HEAD (frags); | 1411 | LIST_HEAD (frags); |
1412 | 1412 | ||
1413 | BUG_ON (!bdev || !ldb); | 1413 | BUG_ON(!state || !ldb); |
1414 | 1414 | ||
1415 | size = ldb->vm.vblk_size; | 1415 | size = ldb->vm.vblk_size; |
1416 | perbuf = 512 / size; | 1416 | perbuf = 512 / size; |
@@ -1418,7 +1418,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, | |||
1418 | finish = (size * ldb->vm.last_vblk_seq) >> 9; | 1418 | finish = (size * ldb->vm.last_vblk_seq) >> 9; |
1419 | 1419 | ||
1420 | for (s = skip; s < finish; s++) { /* For each sector */ | 1420 | for (s = skip; s < finish; s++) { /* For each sector */ |
1421 | data = read_dev_sector (bdev, base + OFF_VMDB + s, &sect); | 1421 | data = read_part_sector(state, base + OFF_VMDB + s, &sect); |
1422 | if (!data) { | 1422 | if (!data) { |
1423 | ldm_crit ("Disk read failed."); | 1423 | ldm_crit ("Disk read failed."); |
1424 | goto out; | 1424 | goto out; |
@@ -1474,8 +1474,7 @@ static void ldm_free_vblks (struct list_head *lh) | |||
1474 | 1474 | ||
1475 | /** | 1475 | /** |
1476 | * ldm_partition - Find out whether a device is a dynamic disk and handle it | 1476 | * ldm_partition - Find out whether a device is a dynamic disk and handle it |
1477 | * @pp: List of the partitions parsed so far | 1477 | * @state: Partition check state including device holding the LDM Database |
1478 | * @bdev: Device holding the LDM Database | ||
1479 | * | 1478 | * |
1480 | * This determines whether the device @bdev is a dynamic disk and if so creates | 1479 | * This determines whether the device @bdev is a dynamic disk and if so creates |
1481 | * the partitions necessary in the gendisk structure pointed to by @hd. | 1480 | * the partitions necessary in the gendisk structure pointed to by @hd. |
@@ -1485,21 +1484,21 @@ static void ldm_free_vblks (struct list_head *lh) | |||
1485 | * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, | 1484 | * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, |
1486 | * and so on: the actual data containing partitions. | 1485 | * and so on: the actual data containing partitions. |
1487 | * | 1486 | * |
1488 | * Return: 1 Success, @bdev is a dynamic disk and we handled it | 1487 | * Return: 1 Success, @state->bdev is a dynamic disk and we handled it |
1489 | * 0 Success, @bdev is not a dynamic disk | 1488 | * 0 Success, @state->bdev is not a dynamic disk |
1490 | * -1 An error occurred before enough information had been read | 1489 | * -1 An error occurred before enough information had been read |
1491 | * Or @bdev is a dynamic disk, but it may be corrupted | 1490 | * Or @state->bdev is a dynamic disk, but it may be corrupted |
1492 | */ | 1491 | */ |
1493 | int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev) | 1492 | int ldm_partition(struct parsed_partitions *state) |
1494 | { | 1493 | { |
1495 | struct ldmdb *ldb; | 1494 | struct ldmdb *ldb; |
1496 | unsigned long base; | 1495 | unsigned long base; |
1497 | int result = -1; | 1496 | int result = -1; |
1498 | 1497 | ||
1499 | BUG_ON (!pp || !bdev); | 1498 | BUG_ON(!state); |
1500 | 1499 | ||
1501 | /* Look for signs of a Dynamic Disk */ | 1500 | /* Look for signs of a Dynamic Disk */ |
1502 | if (!ldm_validate_partition_table (bdev)) | 1501 | if (!ldm_validate_partition_table(state)) |
1503 | return 0; | 1502 | return 0; |
1504 | 1503 | ||
1505 | ldb = kmalloc (sizeof (*ldb), GFP_KERNEL); | 1504 | ldb = kmalloc (sizeof (*ldb), GFP_KERNEL); |
@@ -1509,15 +1508,15 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev) | |||
1509 | } | 1508 | } |
1510 | 1509 | ||
1511 | /* Parse and check privheads. */ | 1510 | /* Parse and check privheads. */ |
1512 | if (!ldm_validate_privheads (bdev, &ldb->ph)) | 1511 | if (!ldm_validate_privheads(state, &ldb->ph)) |
1513 | goto out; /* Already logged */ | 1512 | goto out; /* Already logged */ |
1514 | 1513 | ||
1515 | /* All further references are relative to base (database start). */ | 1514 | /* All further references are relative to base (database start). */ |
1516 | base = ldb->ph.config_start; | 1515 | base = ldb->ph.config_start; |
1517 | 1516 | ||
1518 | /* Parse and check tocs and vmdb. */ | 1517 | /* Parse and check tocs and vmdb. */ |
1519 | if (!ldm_validate_tocblocks (bdev, base, ldb) || | 1518 | if (!ldm_validate_tocblocks(state, base, ldb) || |
1520 | !ldm_validate_vmdb (bdev, base, ldb)) | 1519 | !ldm_validate_vmdb(state, base, ldb)) |
1521 | goto out; /* Already logged */ | 1520 | goto out; /* Already logged */ |
1522 | 1521 | ||
1523 | /* Initialize vblk lists in ldmdb struct */ | 1522 | /* Initialize vblk lists in ldmdb struct */ |
@@ -1527,13 +1526,13 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev) | |||
1527 | INIT_LIST_HEAD (&ldb->v_comp); | 1526 | INIT_LIST_HEAD (&ldb->v_comp); |
1528 | INIT_LIST_HEAD (&ldb->v_part); | 1527 | INIT_LIST_HEAD (&ldb->v_part); |
1529 | 1528 | ||
1530 | if (!ldm_get_vblks (bdev, base, ldb)) { | 1529 | if (!ldm_get_vblks(state, base, ldb)) { |
1531 | ldm_crit ("Failed to read the VBLKs from the database."); | 1530 | ldm_crit ("Failed to read the VBLKs from the database."); |
1532 | goto cleanup; | 1531 | goto cleanup; |
1533 | } | 1532 | } |
1534 | 1533 | ||
1535 | /* Finally, create the data partition devices. */ | 1534 | /* Finally, create the data partition devices. */ |
1536 | if (ldm_create_data_partitions (pp, ldb)) { | 1535 | if (ldm_create_data_partitions(state, ldb)) { |
1537 | ldm_debug ("Parsed LDM database successfully."); | 1536 | ldm_debug ("Parsed LDM database successfully."); |
1538 | result = 1; | 1537 | result = 1; |
1539 | } | 1538 | } |
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h index 30e08e809c1d..d1fb50b28d86 100644 --- a/fs/partitions/ldm.h +++ b/fs/partitions/ldm.h | |||
@@ -209,7 +209,7 @@ struct ldmdb { /* Cache of the database */ | |||
209 | struct list_head v_part; | 209 | struct list_head v_part; |
210 | }; | 210 | }; |
211 | 211 | ||
212 | int ldm_partition (struct parsed_partitions *state, struct block_device *bdev); | 212 | int ldm_partition(struct parsed_partitions *state); |
213 | 213 | ||
214 | #endif /* _FS_PT_LDM_H_ */ | 214 | #endif /* _FS_PT_LDM_H_ */ |
215 | 215 | ||
diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c index d4a0fad3563b..13e27b0082f2 100644 --- a/fs/partitions/mac.c +++ b/fs/partitions/mac.c | |||
@@ -27,7 +27,7 @@ static inline void mac_fix_string(char *stg, int len) | |||
27 | stg[i] = 0; | 27 | stg[i] = 0; |
28 | } | 28 | } |
29 | 29 | ||
30 | int mac_partition(struct parsed_partitions *state, struct block_device *bdev) | 30 | int mac_partition(struct parsed_partitions *state) |
31 | { | 31 | { |
32 | int slot = 1; | 32 | int slot = 1; |
33 | Sector sect; | 33 | Sector sect; |
@@ -42,7 +42,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
42 | struct mac_driver_desc *md; | 42 | struct mac_driver_desc *md; |
43 | 43 | ||
44 | /* Get 0th block and look at the first partition map entry. */ | 44 | /* Get 0th block and look at the first partition map entry. */ |
45 | md = (struct mac_driver_desc *) read_dev_sector(bdev, 0, &sect); | 45 | md = read_part_sector(state, 0, &sect); |
46 | if (!md) | 46 | if (!md) |
47 | return -1; | 47 | return -1; |
48 | if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) { | 48 | if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) { |
@@ -51,7 +51,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
51 | } | 51 | } |
52 | secsize = be16_to_cpu(md->block_size); | 52 | secsize = be16_to_cpu(md->block_size); |
53 | put_dev_sector(sect); | 53 | put_dev_sector(sect); |
54 | data = read_dev_sector(bdev, secsize/512, &sect); | 54 | data = read_part_sector(state, secsize/512, &sect); |
55 | if (!data) | 55 | if (!data) |
56 | return -1; | 56 | return -1; |
57 | part = (struct mac_partition *) (data + secsize%512); | 57 | part = (struct mac_partition *) (data + secsize%512); |
@@ -64,7 +64,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
64 | for (blk = 1; blk <= blocks_in_map; ++blk) { | 64 | for (blk = 1; blk <= blocks_in_map; ++blk) { |
65 | int pos = blk * secsize; | 65 | int pos = blk * secsize; |
66 | put_dev_sector(sect); | 66 | put_dev_sector(sect); |
67 | data = read_dev_sector(bdev, pos/512, &sect); | 67 | data = read_part_sector(state, pos/512, &sect); |
68 | if (!data) | 68 | if (!data) |
69 | return -1; | 69 | return -1; |
70 | part = (struct mac_partition *) (data + pos%512); | 70 | part = (struct mac_partition *) (data + pos%512); |
@@ -123,7 +123,8 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
123 | } | 123 | } |
124 | #ifdef CONFIG_PPC_PMAC | 124 | #ifdef CONFIG_PPC_PMAC |
125 | if (found_root_goodness) | 125 | if (found_root_goodness) |
126 | note_bootable_part(bdev->bd_dev, found_root, found_root_goodness); | 126 | note_bootable_part(state->bdev->bd_dev, found_root, |
127 | found_root_goodness); | ||
127 | #endif | 128 | #endif |
128 | 129 | ||
129 | put_dev_sector(sect); | 130 | put_dev_sector(sect); |
diff --git a/fs/partitions/mac.h b/fs/partitions/mac.h index bbf26e1386fa..3c7d98436380 100644 --- a/fs/partitions/mac.h +++ b/fs/partitions/mac.h | |||
@@ -41,4 +41,4 @@ struct mac_driver_desc { | |||
41 | /* ... more stuff */ | 41 | /* ... more stuff */ |
42 | }; | 42 | }; |
43 | 43 | ||
44 | int mac_partition(struct parsed_partitions *state, struct block_device *bdev); | 44 | int mac_partition(struct parsed_partitions *state); |
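The conversions above (and those that follow for msdos, osf, sgi, sun, sysv68 and ultrix) all follow the same pattern: the parser loses its struct block_device argument and reads sectors through read_part_sector() instead of read_dev_sector(). For reference, the helper introduced earlier in this series in fs/partitions/check.h is, to the best of our reading, a thin inline wrapper along the lines of the sketch below; its void * return value is also why the explicit casts at the call sites disappear.

static inline void *read_part_sector(struct parsed_partitions *state,
				     sector_t n, Sector *p)
{
	return read_dev_sector(state->bdev, n, p);
}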
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c index 90be97f1f5a8..645a68d8c055 100644 --- a/fs/partitions/msdos.c +++ b/fs/partitions/msdos.c | |||
@@ -64,7 +64,7 @@ msdos_magic_present(unsigned char *p) | |||
64 | #define AIX_LABEL_MAGIC2 0xC2 | 64 | #define AIX_LABEL_MAGIC2 0xC2 |
65 | #define AIX_LABEL_MAGIC3 0xD4 | 65 | #define AIX_LABEL_MAGIC3 0xD4 |
66 | #define AIX_LABEL_MAGIC4 0xC1 | 66 | #define AIX_LABEL_MAGIC4 0xC1 |
67 | static int aix_magic_present(unsigned char *p, struct block_device *bdev) | 67 | static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) |
68 | { | 68 | { |
69 | struct partition *pt = (struct partition *) (p + 0x1be); | 69 | struct partition *pt = (struct partition *) (p + 0x1be); |
70 | Sector sect; | 70 | Sector sect; |
@@ -85,7 +85,7 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev) | |||
85 | is_extended_partition(pt)) | 85 | is_extended_partition(pt)) |
86 | return 0; | 86 | return 0; |
87 | } | 87 | } |
88 | d = read_dev_sector(bdev, 7, &sect); | 88 | d = read_part_sector(state, 7, &sect); |
89 | if (d) { | 89 | if (d) { |
90 | if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') | 90 | if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') |
91 | ret = 1; | 91 | ret = 1; |
@@ -105,15 +105,14 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev) | |||
105 | * only for the actual data partitions. | 105 | * only for the actual data partitions. |
106 | */ | 106 | */ |
107 | 107 | ||
108 | static void | 108 | static void parse_extended(struct parsed_partitions *state, |
109 | parse_extended(struct parsed_partitions *state, struct block_device *bdev, | 109 | sector_t first_sector, sector_t first_size) |
110 | sector_t first_sector, sector_t first_size) | ||
111 | { | 110 | { |
112 | struct partition *p; | 111 | struct partition *p; |
113 | Sector sect; | 112 | Sector sect; |
114 | unsigned char *data; | 113 | unsigned char *data; |
115 | sector_t this_sector, this_size; | 114 | sector_t this_sector, this_size; |
116 | sector_t sector_size = bdev_logical_block_size(bdev) / 512; | 115 | sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; |
117 | int loopct = 0; /* number of links followed | 116 | int loopct = 0; /* number of links followed |
118 | without finding a data partition */ | 117 | without finding a data partition */ |
119 | int i; | 118 | int i; |
@@ -126,7 +125,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev, | |||
126 | return; | 125 | return; |
127 | if (state->next == state->limit) | 126 | if (state->next == state->limit) |
128 | return; | 127 | return; |
129 | data = read_dev_sector(bdev, this_sector, &sect); | 128 | data = read_part_sector(state, this_sector, &sect); |
130 | if (!data) | 129 | if (!data) |
131 | return; | 130 | return; |
132 | 131 | ||
@@ -198,9 +197,8 @@ done: | |||
198 | /* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also | 197 | /* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also |
199 | indicates linux swap. Be careful before believing this is Solaris. */ | 198 | indicates linux swap. Be careful before believing this is Solaris. */ |
200 | 199 | ||
201 | static void | 200 | static void parse_solaris_x86(struct parsed_partitions *state, |
202 | parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, | 201 | sector_t offset, sector_t size, int origin) |
203 | sector_t offset, sector_t size, int origin) | ||
204 | { | 202 | { |
205 | #ifdef CONFIG_SOLARIS_X86_PARTITION | 203 | #ifdef CONFIG_SOLARIS_X86_PARTITION |
206 | Sector sect; | 204 | Sector sect; |
@@ -208,7 +206,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, | |||
208 | int i; | 206 | int i; |
209 | short max_nparts; | 207 | short max_nparts; |
210 | 208 | ||
211 | v = (struct solaris_x86_vtoc *)read_dev_sector(bdev, offset+1, &sect); | 209 | v = read_part_sector(state, offset + 1, &sect); |
212 | if (!v) | 210 | if (!v) |
213 | return; | 211 | return; |
214 | if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) { | 212 | if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) { |
@@ -245,16 +243,15 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, | |||
245 | * Create devices for BSD partitions listed in a disklabel, under a | 243 | * Create devices for BSD partitions listed in a disklabel, under a |
246 | * dos-like partition. See parse_extended() for more information. | 244 | * dos-like partition. See parse_extended() for more information. |
247 | */ | 245 | */ |
248 | static void | 246 | static void parse_bsd(struct parsed_partitions *state, |
249 | parse_bsd(struct parsed_partitions *state, struct block_device *bdev, | 247 | sector_t offset, sector_t size, int origin, char *flavour, |
250 | sector_t offset, sector_t size, int origin, char *flavour, | 248 | int max_partitions) |
251 | int max_partitions) | ||
252 | { | 249 | { |
253 | Sector sect; | 250 | Sector sect; |
254 | struct bsd_disklabel *l; | 251 | struct bsd_disklabel *l; |
255 | struct bsd_partition *p; | 252 | struct bsd_partition *p; |
256 | 253 | ||
257 | l = (struct bsd_disklabel *)read_dev_sector(bdev, offset+1, &sect); | 254 | l = read_part_sector(state, offset + 1, &sect); |
258 | if (!l) | 255 | if (!l) |
259 | return; | 256 | return; |
260 | if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) { | 257 | if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) { |
@@ -291,33 +288,28 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev, | |||
291 | } | 288 | } |
292 | #endif | 289 | #endif |
293 | 290 | ||
294 | static void | 291 | static void parse_freebsd(struct parsed_partitions *state, |
295 | parse_freebsd(struct parsed_partitions *state, struct block_device *bdev, | 292 | sector_t offset, sector_t size, int origin) |
296 | sector_t offset, sector_t size, int origin) | ||
297 | { | 293 | { |
298 | #ifdef CONFIG_BSD_DISKLABEL | 294 | #ifdef CONFIG_BSD_DISKLABEL |
299 | parse_bsd(state, bdev, offset, size, origin, | 295 | parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS); |
300 | "bsd", BSD_MAXPARTITIONS); | ||
301 | #endif | 296 | #endif |
302 | } | 297 | } |
303 | 298 | ||
304 | static void | 299 | static void parse_netbsd(struct parsed_partitions *state, |
305 | parse_netbsd(struct parsed_partitions *state, struct block_device *bdev, | 300 | sector_t offset, sector_t size, int origin) |
306 | sector_t offset, sector_t size, int origin) | ||
307 | { | 301 | { |
308 | #ifdef CONFIG_BSD_DISKLABEL | 302 | #ifdef CONFIG_BSD_DISKLABEL |
309 | parse_bsd(state, bdev, offset, size, origin, | 303 | parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS); |
310 | "netbsd", BSD_MAXPARTITIONS); | ||
311 | #endif | 304 | #endif |
312 | } | 305 | } |
313 | 306 | ||
314 | static void | 307 | static void parse_openbsd(struct parsed_partitions *state, |
315 | parse_openbsd(struct parsed_partitions *state, struct block_device *bdev, | 308 | sector_t offset, sector_t size, int origin) |
316 | sector_t offset, sector_t size, int origin) | ||
317 | { | 309 | { |
318 | #ifdef CONFIG_BSD_DISKLABEL | 310 | #ifdef CONFIG_BSD_DISKLABEL |
319 | parse_bsd(state, bdev, offset, size, origin, | 311 | parse_bsd(state, offset, size, origin, "openbsd", |
320 | "openbsd", OPENBSD_MAXPARTITIONS); | 312 | OPENBSD_MAXPARTITIONS); |
321 | #endif | 313 | #endif |
322 | } | 314 | } |
323 | 315 | ||
@@ -325,16 +317,15 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev, | |||
325 | * Create devices for Unixware partitions listed in a disklabel, under a | 317 | * Create devices for Unixware partitions listed in a disklabel, under a |
326 | * dos-like partition. See parse_extended() for more information. | 318 | * dos-like partition. See parse_extended() for more information. |
327 | */ | 319 | */ |
328 | static void | 320 | static void parse_unixware(struct parsed_partitions *state, |
329 | parse_unixware(struct parsed_partitions *state, struct block_device *bdev, | 321 | sector_t offset, sector_t size, int origin) |
330 | sector_t offset, sector_t size, int origin) | ||
331 | { | 322 | { |
332 | #ifdef CONFIG_UNIXWARE_DISKLABEL | 323 | #ifdef CONFIG_UNIXWARE_DISKLABEL |
333 | Sector sect; | 324 | Sector sect; |
334 | struct unixware_disklabel *l; | 325 | struct unixware_disklabel *l; |
335 | struct unixware_slice *p; | 326 | struct unixware_slice *p; |
336 | 327 | ||
337 | l = (struct unixware_disklabel *)read_dev_sector(bdev, offset+29, &sect); | 328 | l = read_part_sector(state, offset + 29, &sect); |
338 | if (!l) | 329 | if (!l) |
339 | return; | 330 | return; |
340 | if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC || | 331 | if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC || |
@@ -365,9 +356,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev, | |||
365 | * Anand Krishnamurthy <anandk@wiproge.med.ge.com> | 356 | * Anand Krishnamurthy <anandk@wiproge.med.ge.com> |
366 | * Rajeev V. Pillai <rajeevvp@yahoo.com> | 357 | * Rajeev V. Pillai <rajeevvp@yahoo.com> |
367 | */ | 358 | */ |
368 | static void | 359 | static void parse_minix(struct parsed_partitions *state, |
369 | parse_minix(struct parsed_partitions *state, struct block_device *bdev, | 360 | sector_t offset, sector_t size, int origin) |
370 | sector_t offset, sector_t size, int origin) | ||
371 | { | 361 | { |
372 | #ifdef CONFIG_MINIX_SUBPARTITION | 362 | #ifdef CONFIG_MINIX_SUBPARTITION |
373 | Sector sect; | 363 | Sector sect; |
@@ -375,7 +365,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev, | |||
375 | struct partition *p; | 365 | struct partition *p; |
376 | int i; | 366 | int i; |
377 | 367 | ||
378 | data = read_dev_sector(bdev, offset, &sect); | 368 | data = read_part_sector(state, offset, &sect); |
379 | if (!data) | 369 | if (!data) |
380 | return; | 370 | return; |
381 | 371 | ||
@@ -404,8 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev, | |||
404 | 394 | ||
405 | static struct { | 395 | static struct { |
406 | unsigned char id; | 396 | unsigned char id; |
407 | void (*parse)(struct parsed_partitions *, struct block_device *, | 397 | void (*parse)(struct parsed_partitions *, sector_t, sector_t, int); |
408 | sector_t, sector_t, int); | ||
409 | } subtypes[] = { | 398 | } subtypes[] = { |
410 | {FREEBSD_PARTITION, parse_freebsd}, | 399 | {FREEBSD_PARTITION, parse_freebsd}, |
411 | {NETBSD_PARTITION, parse_netbsd}, | 400 | {NETBSD_PARTITION, parse_netbsd}, |
@@ -417,16 +406,16 @@ static struct { | |||
417 | {0, NULL}, | 406 | {0, NULL}, |
418 | }; | 407 | }; |
419 | 408 | ||
420 | int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) | 409 | int msdos_partition(struct parsed_partitions *state) |
421 | { | 410 | { |
422 | sector_t sector_size = bdev_logical_block_size(bdev) / 512; | 411 | sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; |
423 | Sector sect; | 412 | Sector sect; |
424 | unsigned char *data; | 413 | unsigned char *data; |
425 | struct partition *p; | 414 | struct partition *p; |
426 | struct fat_boot_sector *fb; | 415 | struct fat_boot_sector *fb; |
427 | int slot; | 416 | int slot; |
428 | 417 | ||
429 | data = read_dev_sector(bdev, 0, &sect); | 418 | data = read_part_sector(state, 0, &sect); |
430 | if (!data) | 419 | if (!data) |
431 | return -1; | 420 | return -1; |
432 | if (!msdos_magic_present(data + 510)) { | 421 | if (!msdos_magic_present(data + 510)) { |
@@ -434,7 +423,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
434 | return 0; | 423 | return 0; |
435 | } | 424 | } |
436 | 425 | ||
437 | if (aix_magic_present(data, bdev)) { | 426 | if (aix_magic_present(state, data)) { |
438 | put_dev_sector(sect); | 427 | put_dev_sector(sect); |
439 | printk( " [AIX]"); | 428 | printk( " [AIX]"); |
440 | return 0; | 429 | return 0; |
@@ -503,7 +492,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
503 | put_partition(state, slot, start, n); | 492 | put_partition(state, slot, start, n); |
504 | 493 | ||
505 | printk(" <"); | 494 | printk(" <"); |
506 | parse_extended(state, bdev, start, size); | 495 | parse_extended(state, start, size); |
507 | printk(" >"); | 496 | printk(" >"); |
508 | continue; | 497 | continue; |
509 | } | 498 | } |
@@ -532,8 +521,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
532 | 521 | ||
533 | if (!subtypes[n].parse) | 522 | if (!subtypes[n].parse) |
534 | continue; | 523 | continue; |
535 | subtypes[n].parse(state, bdev, start_sect(p)*sector_size, | 524 | subtypes[n].parse(state, start_sect(p) * sector_size, |
536 | nr_sects(p)*sector_size, slot); | 525 | nr_sects(p) * sector_size, slot); |
537 | } | 526 | } |
538 | put_dev_sector(sect); | 527 | put_dev_sector(sect); |
539 | return 1; | 528 | return 1; |
diff --git a/fs/partitions/msdos.h b/fs/partitions/msdos.h index 01e5e0b6902d..38c781c490b3 100644 --- a/fs/partitions/msdos.h +++ b/fs/partitions/msdos.h | |||
@@ -4,5 +4,5 @@ | |||
4 | 4 | ||
5 | #define MSDOS_LABEL_MAGIC 0xAA55 | 5 | #define MSDOS_LABEL_MAGIC 0xAA55 |
6 | 6 | ||
7 | int msdos_partition(struct parsed_partitions *state, struct block_device *bdev); | 7 | int msdos_partition(struct parsed_partitions *state); |
8 | 8 | ||
diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c index c05c17bc5df3..fc22b85d436a 100644 --- a/fs/partitions/osf.c +++ b/fs/partitions/osf.c | |||
@@ -10,7 +10,7 @@ | |||
10 | #include "check.h" | 10 | #include "check.h" |
11 | #include "osf.h" | 11 | #include "osf.h" |
12 | 12 | ||
13 | int osf_partition(struct parsed_partitions *state, struct block_device *bdev) | 13 | int osf_partition(struct parsed_partitions *state) |
14 | { | 14 | { |
15 | int i; | 15 | int i; |
16 | int slot = 1; | 16 | int slot = 1; |
@@ -49,7 +49,7 @@ int osf_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
49 | } * label; | 49 | } * label; |
50 | struct d_partition * partition; | 50 | struct d_partition * partition; |
51 | 51 | ||
52 | data = read_dev_sector(bdev, 0, &sect); | 52 | data = read_part_sector(state, 0, &sect); |
53 | if (!data) | 53 | if (!data) |
54 | return -1; | 54 | return -1; |
55 | 55 | ||
diff --git a/fs/partitions/osf.h b/fs/partitions/osf.h index 427b8eab314b..20ed2315ec16 100644 --- a/fs/partitions/osf.h +++ b/fs/partitions/osf.h | |||
@@ -4,4 +4,4 @@ | |||
4 | 4 | ||
5 | #define DISKLABELMAGIC (0x82564557UL) | 5 | #define DISKLABELMAGIC (0x82564557UL) |
6 | 6 | ||
7 | int osf_partition(struct parsed_partitions *state, struct block_device *bdev); | 7 | int osf_partition(struct parsed_partitions *state); |
diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c index ed5ac83fe83a..43b1df9aa16c 100644 --- a/fs/partitions/sgi.c +++ b/fs/partitions/sgi.c | |||
@@ -27,7 +27,7 @@ struct sgi_disklabel { | |||
27 | __be32 _unused1; /* Padding */ | 27 | __be32 _unused1; /* Padding */ |
28 | }; | 28 | }; |
29 | 29 | ||
30 | int sgi_partition(struct parsed_partitions *state, struct block_device *bdev) | 30 | int sgi_partition(struct parsed_partitions *state) |
31 | { | 31 | { |
32 | int i, csum; | 32 | int i, csum; |
33 | __be32 magic; | 33 | __be32 magic; |
@@ -39,7 +39,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
39 | struct sgi_partition *p; | 39 | struct sgi_partition *p; |
40 | char b[BDEVNAME_SIZE]; | 40 | char b[BDEVNAME_SIZE]; |
41 | 41 | ||
42 | label = (struct sgi_disklabel *) read_dev_sector(bdev, 0, &sect); | 42 | label = read_part_sector(state, 0, &sect); |
43 | if (!label) | 43 | if (!label) |
44 | return -1; | 44 | return -1; |
45 | p = &label->partitions[0]; | 45 | p = &label->partitions[0]; |
@@ -57,7 +57,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
57 | } | 57 | } |
58 | if(csum) { | 58 | if(csum) { |
59 | printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", | 59 | printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", |
60 | bdevname(bdev, b)); | 60 | bdevname(state->bdev, b)); |
61 | put_dev_sector(sect); | 61 | put_dev_sector(sect); |
62 | return 0; | 62 | return 0; |
63 | } | 63 | } |
diff --git a/fs/partitions/sgi.h b/fs/partitions/sgi.h index 5d5595c09928..b9553ebdd5a9 100644 --- a/fs/partitions/sgi.h +++ b/fs/partitions/sgi.h | |||
@@ -2,7 +2,7 @@ | |||
2 | * fs/partitions/sgi.h | 2 | * fs/partitions/sgi.h |
3 | */ | 3 | */ |
4 | 4 | ||
5 | extern int sgi_partition(struct parsed_partitions *state, struct block_device *bdev); | 5 | extern int sgi_partition(struct parsed_partitions *state); |
6 | 6 | ||
7 | #define SGI_LABEL_MAGIC 0x0be5a941 | 7 | #define SGI_LABEL_MAGIC 0x0be5a941 |
8 | 8 | ||
diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c index c95e6a62c01d..a32660e25f7f 100644 --- a/fs/partitions/sun.c +++ b/fs/partitions/sun.c | |||
@@ -10,7 +10,7 @@ | |||
10 | #include "check.h" | 10 | #include "check.h" |
11 | #include "sun.h" | 11 | #include "sun.h" |
12 | 12 | ||
13 | int sun_partition(struct parsed_partitions *state, struct block_device *bdev) | 13 | int sun_partition(struct parsed_partitions *state) |
14 | { | 14 | { |
15 | int i; | 15 | int i; |
16 | __be16 csum; | 16 | __be16 csum; |
@@ -61,7 +61,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
61 | int use_vtoc; | 61 | int use_vtoc; |
62 | int nparts; | 62 | int nparts; |
63 | 63 | ||
64 | label = (struct sun_disklabel *)read_dev_sector(bdev, 0, &sect); | 64 | label = read_part_sector(state, 0, &sect); |
65 | if (!label) | 65 | if (!label) |
66 | return -1; | 66 | return -1; |
67 | 67 | ||
@@ -78,7 +78,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
78 | csum ^= *ush--; | 78 | csum ^= *ush--; |
79 | if (csum) { | 79 | if (csum) { |
80 | printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", | 80 | printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", |
81 | bdevname(bdev, b)); | 81 | bdevname(state->bdev, b)); |
82 | put_dev_sector(sect); | 82 | put_dev_sector(sect); |
83 | return 0; | 83 | return 0; |
84 | } | 84 | } |
diff --git a/fs/partitions/sun.h b/fs/partitions/sun.h index 7f864d1f86d4..2424baa8319f 100644 --- a/fs/partitions/sun.h +++ b/fs/partitions/sun.h | |||
@@ -5,4 +5,4 @@ | |||
5 | #define SUN_LABEL_MAGIC 0xDABE | 5 | #define SUN_LABEL_MAGIC 0xDABE |
6 | #define SUN_VTOC_SANITY 0x600DDEEE | 6 | #define SUN_VTOC_SANITY 0x600DDEEE |
7 | 7 | ||
8 | int sun_partition(struct parsed_partitions *state, struct block_device *bdev); | 8 | int sun_partition(struct parsed_partitions *state); |
diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c index 4eba27b78643..9030c864428e 100644 --- a/fs/partitions/sysv68.c +++ b/fs/partitions/sysv68.c | |||
@@ -46,7 +46,7 @@ struct slice { | |||
46 | }; | 46 | }; |
47 | 47 | ||
48 | 48 | ||
49 | int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev) | 49 | int sysv68_partition(struct parsed_partitions *state) |
50 | { | 50 | { |
51 | int i, slices; | 51 | int i, slices; |
52 | int slot = 1; | 52 | int slot = 1; |
@@ -55,7 +55,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
55 | struct dkblk0 *b; | 55 | struct dkblk0 *b; |
56 | struct slice *slice; | 56 | struct slice *slice; |
57 | 57 | ||
58 | data = read_dev_sector(bdev, 0, &sect); | 58 | data = read_part_sector(state, 0, &sect); |
59 | if (!data) | 59 | if (!data) |
60 | return -1; | 60 | return -1; |
61 | 61 | ||
@@ -68,7 +68,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
68 | i = be32_to_cpu(b->dk_ios.ios_slcblk); | 68 | i = be32_to_cpu(b->dk_ios.ios_slcblk); |
69 | put_dev_sector(sect); | 69 | put_dev_sector(sect); |
70 | 70 | ||
71 | data = read_dev_sector(bdev, i, &sect); | 71 | data = read_part_sector(state, i, &sect); |
72 | if (!data) | 72 | if (!data) |
73 | return -1; | 73 | return -1; |
74 | 74 | ||
diff --git a/fs/partitions/sysv68.h b/fs/partitions/sysv68.h index fa733f68431b..bf2f5ffa97ac 100644 --- a/fs/partitions/sysv68.h +++ b/fs/partitions/sysv68.h | |||
@@ -1 +1 @@ | |||
extern int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev); | extern int sysv68_partition(struct parsed_partitions *state); | ||
diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c index ec852c11dce4..db9eef260364 100644 --- a/fs/partitions/ultrix.c +++ b/fs/partitions/ultrix.c | |||
@@ -9,7 +9,7 @@ | |||
9 | #include "check.h" | 9 | #include "check.h" |
10 | #include "ultrix.h" | 10 | #include "ultrix.h" |
11 | 11 | ||
12 | int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev) | 12 | int ultrix_partition(struct parsed_partitions *state) |
13 | { | 13 | { |
14 | int i; | 14 | int i; |
15 | Sector sect; | 15 | Sector sect; |
@@ -26,7 +26,7 @@ int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
26 | #define PT_MAGIC 0x032957 /* Partition magic number */ | 26 | #define PT_MAGIC 0x032957 /* Partition magic number */ |
27 | #define PT_VALID 1 /* Indicates if struct is valid */ | 27 | #define PT_VALID 1 /* Indicates if struct is valid */ |
28 | 28 | ||
29 | data = read_dev_sector(bdev, (16384 - sizeof(*label))/512, &sect); | 29 | data = read_part_sector(state, (16384 - sizeof(*label))/512, &sect); |
30 | if (!data) | 30 | if (!data) |
31 | return -1; | 31 | return -1; |
32 | 32 | ||
diff --git a/fs/partitions/ultrix.h b/fs/partitions/ultrix.h index a74bf8e2d370..a3cc00b2bded 100644 --- a/fs/partitions/ultrix.h +++ b/fs/partitions/ultrix.h | |||
@@ -2,4 +2,4 @@ | |||
2 | * fs/partitions/ultrix.h | 2 | * fs/partitions/ultrix.h |
3 | */ | 3 | */ |
4 | 4 | ||
5 | int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev); | 5 | int ultrix_partition(struct parsed_partitions *state); |
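That completes the partition-parser conversions: every prototype now takes only struct parsed_partitions. Callers therefore have to stash the block device in the state before invoking a parser; a hypothetical caller-side view (the real caller is check_partition() in fs/partitions/check.c, which is not part of these hunks) looks roughly like:

	state->bdev = bdev;			/* new field, consumed via read_part_sector() */
	state->limit = disk_max_parts(disk);
	res = msdos_partition(state);		/* previously msdos_partition(state, bdev) */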
diff --git a/fs/pipe.c b/fs/pipe.c | |||
@@ -11,6 +11,7 @@ | |||
11 | #include <linux/module.h> | 11 | #include <linux/module.h> |
12 | #include <linux/init.h> | 12 | #include <linux/init.h> |
13 | #include <linux/fs.h> | 13 | #include <linux/fs.h> |
14 | #include <linux/log2.h> | ||
14 | #include <linux/mount.h> | 15 | #include <linux/mount.h> |
15 | #include <linux/pipe_fs_i.h> | 16 | #include <linux/pipe_fs_i.h> |
16 | #include <linux/uio.h> | 17 | #include <linux/uio.h> |
@@ -18,11 +19,18 @@ | |||
18 | #include <linux/pagemap.h> | 19 | #include <linux/pagemap.h> |
19 | #include <linux/audit.h> | 20 | #include <linux/audit.h> |
20 | #include <linux/syscalls.h> | 21 | #include <linux/syscalls.h> |
22 | #include <linux/fcntl.h> | ||
21 | 23 | ||
22 | #include <asm/uaccess.h> | 24 | #include <asm/uaccess.h> |
23 | #include <asm/ioctls.h> | 25 | #include <asm/ioctls.h> |
24 | 26 | ||
25 | /* | 27 | /* |
28 | * The max size that a non-root user is allowed to grow the pipe. Can | ||
29 | * be set by root in /proc/sys/fs/pipe-max-pages | ||
30 | */ | ||
31 | unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16; | ||
32 | |||
33 | /* | ||
26 | * We use a start+len construction, which provides full use of the | 34 | * We use a start+len construction, which provides full use of the |
27 | * allocated memory. | 35 | * allocated memory. |
28 | * -- Florian Coosmann (FGC) | 36 | * -- Florian Coosmann (FGC) |
@@ -390,7 +398,7 @@ redo: | |||
390 | if (!buf->len) { | 398 | if (!buf->len) { |
391 | buf->ops = NULL; | 399 | buf->ops = NULL; |
392 | ops->release(pipe, buf); | 400 | ops->release(pipe, buf); |
393 | curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); | 401 | curbuf = (curbuf + 1) & (pipe->buffers - 1); |
394 | pipe->curbuf = curbuf; | 402 | pipe->curbuf = curbuf; |
395 | pipe->nrbufs = --bufs; | 403 | pipe->nrbufs = --bufs; |
396 | do_wakeup = 1; | 404 | do_wakeup = 1; |
@@ -472,7 +480,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, | |||
472 | chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ | 480 | chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ |
473 | if (pipe->nrbufs && chars != 0) { | 481 | if (pipe->nrbufs && chars != 0) { |
474 | int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & | 482 | int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & |
475 | (PIPE_BUFFERS-1); | 483 | (pipe->buffers - 1); |
476 | struct pipe_buffer *buf = pipe->bufs + lastbuf; | 484 | struct pipe_buffer *buf = pipe->bufs + lastbuf; |
477 | const struct pipe_buf_operations *ops = buf->ops; | 485 | const struct pipe_buf_operations *ops = buf->ops; |
478 | int offset = buf->offset + buf->len; | 486 | int offset = buf->offset + buf->len; |
@@ -518,8 +526,8 @@ redo1: | |||
518 | break; | 526 | break; |
519 | } | 527 | } |
520 | bufs = pipe->nrbufs; | 528 | bufs = pipe->nrbufs; |
521 | if (bufs < PIPE_BUFFERS) { | 529 | if (bufs < pipe->buffers) { |
522 | int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); | 530 | int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1); |
523 | struct pipe_buffer *buf = pipe->bufs + newbuf; | 531 | struct pipe_buffer *buf = pipe->bufs + newbuf; |
524 | struct page *page = pipe->tmp_page; | 532 | struct page *page = pipe->tmp_page; |
525 | char *src; | 533 | char *src; |
@@ -580,7 +588,7 @@ redo2: | |||
580 | if (!total_len) | 588 | if (!total_len) |
581 | break; | 589 | break; |
582 | } | 590 | } |
583 | if (bufs < PIPE_BUFFERS) | 591 | if (bufs < pipe->buffers) |
584 | continue; | 592 | continue; |
585 | if (filp->f_flags & O_NONBLOCK) { | 593 | if (filp->f_flags & O_NONBLOCK) { |
586 | if (!ret) | 594 | if (!ret) |
@@ -640,7 +648,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) | |||
640 | nrbufs = pipe->nrbufs; | 648 | nrbufs = pipe->nrbufs; |
641 | while (--nrbufs >= 0) { | 649 | while (--nrbufs >= 0) { |
642 | count += pipe->bufs[buf].len; | 650 | count += pipe->bufs[buf].len; |
643 | buf = (buf+1) & (PIPE_BUFFERS-1); | 651 | buf = (buf+1) & (pipe->buffers - 1); |
644 | } | 652 | } |
645 | mutex_unlock(&inode->i_mutex); | 653 | mutex_unlock(&inode->i_mutex); |
646 | 654 | ||
@@ -671,7 +679,7 @@ pipe_poll(struct file *filp, poll_table *wait) | |||
671 | } | 679 | } |
672 | 680 | ||
673 | if (filp->f_mode & FMODE_WRITE) { | 681 | if (filp->f_mode & FMODE_WRITE) { |
674 | mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0; | 682 | mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0; |
675 | /* | 683 | /* |
676 | * Most Unices do not set POLLERR for FIFOs but on Linux they | 684 | * Most Unices do not set POLLERR for FIFOs but on Linux they |
677 | * behave exactly like pipes for poll(). | 685 | * behave exactly like pipes for poll(). |
@@ -877,25 +885,32 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode) | |||
877 | 885 | ||
878 | pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); | 886 | pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); |
879 | if (pipe) { | 887 | if (pipe) { |
880 | init_waitqueue_head(&pipe->wait); | 888 | pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL); |
881 | pipe->r_counter = pipe->w_counter = 1; | 889 | if (pipe->bufs) { |
882 | pipe->inode = inode; | 890 | init_waitqueue_head(&pipe->wait); |
891 | pipe->r_counter = pipe->w_counter = 1; | ||
892 | pipe->inode = inode; | ||
893 | pipe->buffers = PIPE_DEF_BUFFERS; | ||
894 | return pipe; | ||
895 | } | ||
896 | kfree(pipe); | ||
883 | } | 897 | } |
884 | 898 | ||
885 | return pipe; | 899 | return NULL; |
886 | } | 900 | } |
887 | 901 | ||
888 | void __free_pipe_info(struct pipe_inode_info *pipe) | 902 | void __free_pipe_info(struct pipe_inode_info *pipe) |
889 | { | 903 | { |
890 | int i; | 904 | int i; |
891 | 905 | ||
892 | for (i = 0; i < PIPE_BUFFERS; i++) { | 906 | for (i = 0; i < pipe->buffers; i++) { |
893 | struct pipe_buffer *buf = pipe->bufs + i; | 907 | struct pipe_buffer *buf = pipe->bufs + i; |
894 | if (buf->ops) | 908 | if (buf->ops) |
895 | buf->ops->release(pipe, buf); | 909 | buf->ops->release(pipe, buf); |
896 | } | 910 | } |
897 | if (pipe->tmp_page) | 911 | if (pipe->tmp_page) |
898 | __free_page(pipe->tmp_page); | 912 | __free_page(pipe->tmp_page); |
913 | kfree(pipe->bufs); | ||
899 | kfree(pipe); | 914 | kfree(pipe); |
900 | } | 915 | } |
901 | 916 | ||
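With pipe->bufs now kmalloc'ed and pipe->buffers variable, every ring-index calculation in pipe.c and splice.c switches from the compile-time PIPE_BUFFERS mask to a runtime mask. That only works because pipe->buffers is kept a power of two (enforced in pipe_set_size() below); a minimal illustration of the indexing idiom, not taken from the patch itself:

/* Illustration only: advance a ring slot when the ring size is a power of two. */
static unsigned int next_slot(const struct pipe_inode_info *pipe, unsigned int slot)
{
	return (slot + 1) & (pipe->buffers - 1);	/* same as (slot + 1) % pipe->buffers */
}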
@@ -1094,6 +1109,89 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes) | |||
1094 | } | 1109 | } |
1095 | 1110 | ||
1096 | /* | 1111 | /* |
1112 | * Allocate a new array of pipe buffers and copy the info over. Returns the | ||
1113 | * pipe size if successful, or return -ERROR on error. | ||
1114 | */ | ||
1115 | static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) | ||
1116 | { | ||
1117 | struct pipe_buffer *bufs; | ||
1118 | |||
1119 | /* | ||
1120 | * Must be a power-of-2 currently | ||
1121 | */ | ||
1122 | if (!is_power_of_2(arg)) | ||
1123 | return -EINVAL; | ||
1124 | |||
1125 | /* | ||
1126 | * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't | ||
1127 | * expect a lot of shrink+grow operations, just free and allocate | ||
1128 | * again like we would do for growing. If the pipe currently | ||
1129 | * contains more buffers than arg, then return busy. | ||
1130 | */ | ||
1131 | if (arg < pipe->nrbufs) | ||
1132 | return -EBUSY; | ||
1133 | |||
1134 | bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL); | ||
1135 | if (unlikely(!bufs)) | ||
1136 | return -ENOMEM; | ||
1137 | |||
1138 | /* | ||
1139 | * The pipe array wraps around, so just start the new one at zero | ||
1140 | * and adjust the indexes. | ||
1141 | */ | ||
1142 | if (pipe->nrbufs) { | ||
1143 | const unsigned int tail = pipe->nrbufs & (pipe->buffers - 1); | ||
1144 | const unsigned int head = pipe->nrbufs - tail; | ||
1145 | |||
1146 | if (head) | ||
1147 | memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer)); | ||
1148 | if (tail) | ||
1149 | memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer)); | ||
1150 | } | ||
1151 | |||
1152 | pipe->curbuf = 0; | ||
1153 | kfree(pipe->bufs); | ||
1154 | pipe->bufs = bufs; | ||
1155 | pipe->buffers = arg; | ||
1156 | return arg; | ||
1157 | } | ||
1158 | |||
1159 | long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) | ||
1160 | { | ||
1161 | struct pipe_inode_info *pipe; | ||
1162 | long ret; | ||
1163 | |||
1164 | pipe = file->f_path.dentry->d_inode->i_pipe; | ||
1165 | if (!pipe) | ||
1166 | return -EBADF; | ||
1167 | |||
1168 | mutex_lock(&pipe->inode->i_mutex); | ||
1169 | |||
1170 | switch (cmd) { | ||
1171 | case F_SETPIPE_SZ: | ||
1172 | if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages) | ||
1173 | return -EINVAL; | ||
1174 | /* | ||
1175 | * The pipe needs to be at least 2 pages large to | ||
1176 | * guarantee POSIX behaviour. | ||
1177 | */ | ||
1178 | if (arg < 2) | ||
1179 | return -EINVAL; | ||
1180 | ret = pipe_set_size(pipe, arg); | ||
1181 | break; | ||
1182 | case F_GETPIPE_SZ: | ||
1183 | ret = pipe->buffers; | ||
1184 | break; | ||
1185 | default: | ||
1186 | ret = -EINVAL; | ||
1187 | break; | ||
1188 | } | ||
1189 | |||
1190 | mutex_unlock(&pipe->inode->i_mutex); | ||
1191 | return ret; | ||
1192 | } | ||
1193 | |||
1194 | /* | ||
1097 | * pipefs should _never_ be mounted by userland - too much of security hassle, | 1195 | * pipefs should _never_ be mounted by userland - too much of security hassle, |
1098 | * no real gain from having the whole whorehouse mounted. So we don't need | 1196 | * no real gain from having the whole whorehouse mounted. So we don't need |
1099 | * any operations on the root directory. However, we need a non-trivial | 1197 | * any operations on the root directory. However, we need a non-trivial |
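pipe_fcntl() above backs the new F_SETPIPE_SZ/F_GETPIPE_SZ commands added to include/linux/fcntl.h by this series. Note that in this version the argument is a buffer (page) count, not a byte count: it must be a power of two, at least 2, and unprivileged callers are capped by /proc/sys/fs/pipe-max-pages. A hypothetical user-space exercise of the interface, with the fcntl values spelled out in case the C library headers do not define them yet:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#ifndef F_SETPIPE_SZ
#define F_SETPIPE_SZ	(1024 + 7)	/* F_LINUX_SPECIFIC_BASE + 7 */
#define F_GETPIPE_SZ	(1024 + 8)	/* F_LINUX_SPECIFIC_BASE + 8 */
#endif

int main(void)
{
	int fds[2];

	if (pipe(fds))
		return 1;
	if (fcntl(fds[1], F_SETPIPE_SZ, 64) < 0)	/* grow to 64 page-sized buffers */
		perror("F_SETPIPE_SZ");
	printf("pipe now has %ld buffers\n", (long)fcntl(fds[1], F_GETPIPE_SZ));
	return 0;
}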
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 1d9c12714c5c..9977df9f3a54 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c | |||
@@ -147,7 +147,8 @@ static int reiserfs_sync_file(struct file *filp, | |||
147 | barrier_done = reiserfs_commit_for_inode(inode); | 147 | barrier_done = reiserfs_commit_for_inode(inode); |
148 | reiserfs_write_unlock(inode->i_sb); | 148 | reiserfs_write_unlock(inode->i_sb); |
149 | if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) | 149 | if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) |
150 | blkdev_issue_flush(inode->i_sb->s_bdev, NULL); | 150 | blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, |
151 | BLKDEV_IFL_WAIT); | ||
151 | if (barrier_done < 0) | 152 | if (barrier_done < 0) |
152 | return barrier_done; | 153 | return barrier_done; |
153 | return (err < 0) ? -EIO : 0; | 154 | return (err < 0) ? -EIO : 0; |
diff --git a/fs/splice.c b/fs/splice.c index 9313b6124a2e..ac22b00d86c3 100644 --- a/fs/splice.c +++ b/fs/splice.c | |||
@@ -193,8 +193,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, | |||
193 | break; | 193 | break; |
194 | } | 194 | } |
195 | 195 | ||
196 | if (pipe->nrbufs < PIPE_BUFFERS) { | 196 | if (pipe->nrbufs < pipe->buffers) { |
197 | int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); | 197 | int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); |
198 | struct pipe_buffer *buf = pipe->bufs + newbuf; | 198 | struct pipe_buffer *buf = pipe->bufs + newbuf; |
199 | 199 | ||
200 | buf->page = spd->pages[page_nr]; | 200 | buf->page = spd->pages[page_nr]; |
@@ -214,7 +214,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, | |||
214 | 214 | ||
215 | if (!--spd->nr_pages) | 215 | if (!--spd->nr_pages) |
216 | break; | 216 | break; |
217 | if (pipe->nrbufs < PIPE_BUFFERS) | 217 | if (pipe->nrbufs < pipe->buffers) |
218 | continue; | 218 | continue; |
219 | 219 | ||
220 | break; | 220 | break; |
@@ -265,6 +265,36 @@ static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) | |||
265 | page_cache_release(spd->pages[i]); | 265 | page_cache_release(spd->pages[i]); |
266 | } | 266 | } |
267 | 267 | ||
268 | /* | ||
269 | * Check if we need to grow the arrays holding pages and partial page | ||
270 | * descriptions. | ||
271 | */ | ||
272 | int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) | ||
273 | { | ||
274 | if (pipe->buffers <= PIPE_DEF_BUFFERS) | ||
275 | return 0; | ||
276 | |||
277 | spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL); | ||
278 | spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL); | ||
279 | |||
280 | if (spd->pages && spd->partial) | ||
281 | return 0; | ||
282 | |||
283 | kfree(spd->pages); | ||
284 | kfree(spd->partial); | ||
285 | return -ENOMEM; | ||
286 | } | ||
287 | |||
288 | void splice_shrink_spd(struct pipe_inode_info *pipe, | ||
289 | struct splice_pipe_desc *spd) | ||
290 | { | ||
291 | if (pipe->buffers <= PIPE_DEF_BUFFERS) | ||
292 | return; | ||
293 | |||
294 | kfree(spd->pages); | ||
295 | kfree(spd->partial); | ||
296 | } | ||
297 | |||
268 | static int | 298 | static int |
269 | __generic_file_splice_read(struct file *in, loff_t *ppos, | 299 | __generic_file_splice_read(struct file *in, loff_t *ppos, |
270 | struct pipe_inode_info *pipe, size_t len, | 300 | struct pipe_inode_info *pipe, size_t len, |
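splice_grow_spd()/splice_shrink_spd() above let the splice paths keep their small on-stack arrays for the default pipe size and fall back to kmalloc'ed arrays only when a pipe has been grown. A simplified sketch of the calling pattern (an assumption distilled from the __generic_file_splice_read() and vmsplice changes below, not a verbatim excerpt):

static ssize_t example_splice_fill(struct pipe_inode_info *pipe)
{
	struct page *pages[PIPE_DEF_BUFFERS];
	struct partial_page partial[PIPE_DEF_BUFFERS];
	struct splice_pipe_desc spd = {
		.pages = pages,		/* replaced by kmalloc'ed arrays if the pipe grew */
		.partial = partial,
	};
	ssize_t ret;

	if (splice_grow_spd(pipe, &spd))
		return -ENOMEM;

	/* ... fill spd.pages[]/spd.partial[] with up to pipe->buffers entries ... */
	ret = splice_to_pipe(pipe, &spd);

	splice_shrink_spd(pipe, &spd);
	return ret;
}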
@@ -272,8 +302,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, | |||
272 | { | 302 | { |
273 | struct address_space *mapping = in->f_mapping; | 303 | struct address_space *mapping = in->f_mapping; |
274 | unsigned int loff, nr_pages, req_pages; | 304 | unsigned int loff, nr_pages, req_pages; |
275 | struct page *pages[PIPE_BUFFERS]; | 305 | struct page *pages[PIPE_DEF_BUFFERS]; |
276 | struct partial_page partial[PIPE_BUFFERS]; | 306 | struct partial_page partial[PIPE_DEF_BUFFERS]; |
277 | struct page *page; | 307 | struct page *page; |
278 | pgoff_t index, end_index; | 308 | pgoff_t index, end_index; |
279 | loff_t isize; | 309 | loff_t isize; |
@@ -286,15 +316,18 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, | |||
286 | .spd_release = spd_release_page, | 316 | .spd_release = spd_release_page, |
287 | }; | 317 | }; |
288 | 318 | ||
319 | if (splice_grow_spd(pipe, &spd)) | ||
320 | return -ENOMEM; | ||
321 | |||
289 | index = *ppos >> PAGE_CACHE_SHIFT; | 322 | index = *ppos >> PAGE_CACHE_SHIFT; |
290 | loff = *ppos & ~PAGE_CACHE_MASK; | 323 | loff = *ppos & ~PAGE_CACHE_MASK; |
291 | req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 324 | req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
292 | nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS); | 325 | nr_pages = min(req_pages, pipe->buffers); |
293 | 326 | ||
294 | /* | 327 | /* |
295 | * Lookup the (hopefully) full range of pages we need. | 328 | * Lookup the (hopefully) full range of pages we need. |
296 | */ | 329 | */ |
297 | spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); | 330 | spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages); |
298 | index += spd.nr_pages; | 331 | index += spd.nr_pages; |
299 | 332 | ||
300 | /* | 333 | /* |
@@ -335,7 +368,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, | |||
335 | unlock_page(page); | 368 | unlock_page(page); |
336 | } | 369 | } |
337 | 370 | ||
338 | pages[spd.nr_pages++] = page; | 371 | spd.pages[spd.nr_pages++] = page; |
339 | index++; | 372 | index++; |
340 | } | 373 | } |
341 | 374 | ||
@@ -356,7 +389,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, | |||
356 | * this_len is the max we'll use from this page | 389 | * this_len is the max we'll use from this page |
357 | */ | 390 | */ |
358 | this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); | 391 | this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); |
359 | page = pages[page_nr]; | 392 | page = spd.pages[page_nr]; |
360 | 393 | ||
361 | if (PageReadahead(page)) | 394 | if (PageReadahead(page)) |
362 | page_cache_async_readahead(mapping, &in->f_ra, in, | 395 | page_cache_async_readahead(mapping, &in->f_ra, in, |
@@ -393,8 +426,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, | |||
393 | error = -ENOMEM; | 426 | error = -ENOMEM; |
394 | break; | 427 | break; |
395 | } | 428 | } |
396 | page_cache_release(pages[page_nr]); | 429 | page_cache_release(spd.pages[page_nr]); |
397 | pages[page_nr] = page; | 430 | spd.pages[page_nr] = page; |
398 | } | 431 | } |
399 | /* | 432 | /* |
400 | * page was already under io and is now done, great | 433 | * page was already under io and is now done, great |
@@ -451,8 +484,8 @@ fill_it: | |||
451 | len = this_len; | 484 | len = this_len; |
452 | } | 485 | } |
453 | 486 | ||
454 | partial[page_nr].offset = loff; | 487 | spd.partial[page_nr].offset = loff; |
455 | partial[page_nr].len = this_len; | 488 | spd.partial[page_nr].len = this_len; |
456 | len -= this_len; | 489 | len -= this_len; |
457 | loff = 0; | 490 | loff = 0; |
458 | spd.nr_pages++; | 491 | spd.nr_pages++; |
@@ -464,12 +497,13 @@ fill_it: | |||
464 | * we got, 'nr_pages' is how many pages are in the map. | 497 | * we got, 'nr_pages' is how many pages are in the map. |
465 | */ | 498 | */ |
466 | while (page_nr < nr_pages) | 499 | while (page_nr < nr_pages) |
467 | page_cache_release(pages[page_nr++]); | 500 | page_cache_release(spd.pages[page_nr++]); |
468 | in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; | 501 | in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; |
469 | 502 | ||
470 | if (spd.nr_pages) | 503 | if (spd.nr_pages) |
471 | return splice_to_pipe(pipe, &spd); | 504 | error = splice_to_pipe(pipe, &spd); |
472 | 505 | ||
506 | splice_shrink_spd(pipe, &spd); | ||
473 | return error; | 507 | return error; |
474 | } | 508 | } |
475 | 509 | ||
@@ -560,9 +594,9 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, | |||
560 | unsigned int nr_pages; | 594 | unsigned int nr_pages; |
561 | unsigned int nr_freed; | 595 | unsigned int nr_freed; |
562 | size_t offset; | 596 | size_t offset; |
563 | struct page *pages[PIPE_BUFFERS]; | 597 | struct page *pages[PIPE_DEF_BUFFERS]; |
564 | struct partial_page partial[PIPE_BUFFERS]; | 598 | struct partial_page partial[PIPE_DEF_BUFFERS]; |
565 | struct iovec vec[PIPE_BUFFERS]; | 599 | struct iovec *vec, __vec[PIPE_DEF_BUFFERS]; |
566 | pgoff_t index; | 600 | pgoff_t index; |
567 | ssize_t res; | 601 | ssize_t res; |
568 | size_t this_len; | 602 | size_t this_len; |
@@ -576,11 +610,22 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, | |||
576 | .spd_release = spd_release_page, | 610 | .spd_release = spd_release_page, |
577 | }; | 611 | }; |
578 | 612 | ||
613 | if (splice_grow_spd(pipe, &spd)) | ||
614 | return -ENOMEM; | ||
615 | |||
616 | res = -ENOMEM; | ||
617 | vec = __vec; | ||
618 | if (pipe->buffers > PIPE_DEF_BUFFERS) { | ||
619 | vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL); | ||
620 | if (!vec) | ||
621 | goto shrink_ret; | ||
622 | } | ||
623 | |||
579 | index = *ppos >> PAGE_CACHE_SHIFT; | 624 | index = *ppos >> PAGE_CACHE_SHIFT; |
580 | offset = *ppos & ~PAGE_CACHE_MASK; | 625 | offset = *ppos & ~PAGE_CACHE_MASK; |
581 | nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 626 | nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
582 | 627 | ||
583 | for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) { | 628 | for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) { |
584 | struct page *page; | 629 | struct page *page; |
585 | 630 | ||
586 | page = alloc_page(GFP_USER); | 631 | page = alloc_page(GFP_USER); |
@@ -591,7 +636,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, | |||
591 | this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); | 636 | this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); |
592 | vec[i].iov_base = (void __user *) page_address(page); | 637 | vec[i].iov_base = (void __user *) page_address(page); |
593 | vec[i].iov_len = this_len; | 638 | vec[i].iov_len = this_len; |
594 | pages[i] = page; | 639 | spd.pages[i] = page; |
595 | spd.nr_pages++; | 640 | spd.nr_pages++; |
596 | len -= this_len; | 641 | len -= this_len; |
597 | offset = 0; | 642 | offset = 0; |
@@ -610,11 +655,11 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, | |||
610 | nr_freed = 0; | 655 | nr_freed = 0; |
611 | for (i = 0; i < spd.nr_pages; i++) { | 656 | for (i = 0; i < spd.nr_pages; i++) { |
612 | this_len = min_t(size_t, vec[i].iov_len, res); | 657 | this_len = min_t(size_t, vec[i].iov_len, res); |
613 | partial[i].offset = 0; | 658 | spd.partial[i].offset = 0; |
614 | partial[i].len = this_len; | 659 | spd.partial[i].len = this_len; |
615 | if (!this_len) { | 660 | if (!this_len) { |
616 | __free_page(pages[i]); | 661 | __free_page(spd.pages[i]); |
617 | pages[i] = NULL; | 662 | spd.pages[i] = NULL; |
618 | nr_freed++; | 663 | nr_freed++; |
619 | } | 664 | } |
620 | res -= this_len; | 665 | res -= this_len; |
@@ -625,13 +670,18 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, | |||
625 | if (res > 0) | 670 | if (res > 0) |
626 | *ppos += res; | 671 | *ppos += res; |
627 | 672 | ||
673 | shrink_ret: | ||
674 | if (vec != __vec) | ||
675 | kfree(vec); | ||
676 | splice_shrink_spd(pipe, &spd); | ||
628 | return res; | 677 | return res; |
629 | 678 | ||
630 | err: | 679 | err: |
631 | for (i = 0; i < spd.nr_pages; i++) | 680 | for (i = 0; i < spd.nr_pages; i++) |
632 | __free_page(pages[i]); | 681 | __free_page(spd.pages[i]); |
633 | 682 | ||
634 | return error; | 683 | res = error; |
684 | goto shrink_ret; | ||
635 | } | 685 | } |
636 | EXPORT_SYMBOL(default_file_splice_read); | 686 | EXPORT_SYMBOL(default_file_splice_read); |
637 | 687 | ||
@@ -784,7 +834,7 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, | |||
784 | if (!buf->len) { | 834 | if (!buf->len) { |
785 | buf->ops = NULL; | 835 | buf->ops = NULL; |
786 | ops->release(pipe, buf); | 836 | ops->release(pipe, buf); |
787 | pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); | 837 | pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); |
788 | pipe->nrbufs--; | 838 | pipe->nrbufs--; |
789 | if (pipe->inode) | 839 | if (pipe->inode) |
790 | sd->need_wakeup = true; | 840 | sd->need_wakeup = true; |
@@ -1211,7 +1261,7 @@ out_release: | |||
1211 | * If we did an incomplete transfer we must release | 1261 | * If we did an incomplete transfer we must release |
1212 | * the pipe buffers in question: | 1262 | * the pipe buffers in question: |
1213 | */ | 1263 | */ |
1214 | for (i = 0; i < PIPE_BUFFERS; i++) { | 1264 | for (i = 0; i < pipe->buffers; i++) { |
1215 | struct pipe_buffer *buf = pipe->bufs + i; | 1265 | struct pipe_buffer *buf = pipe->bufs + i; |
1216 | 1266 | ||
1217 | if (buf->ops) { | 1267 | if (buf->ops) { |
@@ -1371,7 +1421,8 @@ static long do_splice(struct file *in, loff_t __user *off_in, | |||
1371 | */ | 1421 | */ |
1372 | static int get_iovec_page_array(const struct iovec __user *iov, | 1422 | static int get_iovec_page_array(const struct iovec __user *iov, |
1373 | unsigned int nr_vecs, struct page **pages, | 1423 | unsigned int nr_vecs, struct page **pages, |
1374 | struct partial_page *partial, int aligned) | 1424 | struct partial_page *partial, int aligned, |
1425 | unsigned int pipe_buffers) | ||
1375 | { | 1426 | { |
1376 | int buffers = 0, error = 0; | 1427 | int buffers = 0, error = 0; |
1377 | 1428 | ||
@@ -1414,8 +1465,8 @@ static int get_iovec_page_array(const struct iovec __user *iov, | |||
1414 | break; | 1465 | break; |
1415 | 1466 | ||
1416 | npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; | 1467 | npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; |
1417 | if (npages > PIPE_BUFFERS - buffers) | 1468 | if (npages > pipe_buffers - buffers) |
1418 | npages = PIPE_BUFFERS - buffers; | 1469 | npages = pipe_buffers - buffers; |
1419 | 1470 | ||
1420 | error = get_user_pages_fast((unsigned long)base, npages, | 1471 | error = get_user_pages_fast((unsigned long)base, npages, |
1421 | 0, &pages[buffers]); | 1472 | 0, &pages[buffers]); |
@@ -1450,7 +1501,7 @@ static int get_iovec_page_array(const struct iovec __user *iov, | |||
1450 | * or if we mapped the max number of pages that we have | 1501 | * or if we mapped the max number of pages that we have |
1451 | * room for. | 1502 | * room for. |
1452 | */ | 1503 | */ |
1453 | if (error < npages || buffers == PIPE_BUFFERS) | 1504 | if (error < npages || buffers == pipe_buffers) |
1454 | break; | 1505 | break; |
1455 | 1506 | ||
1456 | nr_vecs--; | 1507 | nr_vecs--; |
@@ -1593,8 +1644,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, | |||
1593 | unsigned long nr_segs, unsigned int flags) | 1644 | unsigned long nr_segs, unsigned int flags) |
1594 | { | 1645 | { |
1595 | struct pipe_inode_info *pipe; | 1646 | struct pipe_inode_info *pipe; |
1596 | struct page *pages[PIPE_BUFFERS]; | 1647 | struct page *pages[PIPE_DEF_BUFFERS]; |
1597 | struct partial_page partial[PIPE_BUFFERS]; | 1648 | struct partial_page partial[PIPE_DEF_BUFFERS]; |
1598 | struct splice_pipe_desc spd = { | 1649 | struct splice_pipe_desc spd = { |
1599 | .pages = pages, | 1650 | .pages = pages, |
1600 | .partial = partial, | 1651 | .partial = partial, |
@@ -1602,17 +1653,25 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, | |||
1602 | .ops = &user_page_pipe_buf_ops, | 1653 | .ops = &user_page_pipe_buf_ops, |
1603 | .spd_release = spd_release_page, | 1654 | .spd_release = spd_release_page, |
1604 | }; | 1655 | }; |
1656 | long ret; | ||
1605 | 1657 | ||
1606 | pipe = pipe_info(file->f_path.dentry->d_inode); | 1658 | pipe = pipe_info(file->f_path.dentry->d_inode); |
1607 | if (!pipe) | 1659 | if (!pipe) |
1608 | return -EBADF; | 1660 | return -EBADF; |
1609 | 1661 | ||
1610 | spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, | 1662 | if (splice_grow_spd(pipe, &spd)) |
1611 | flags & SPLICE_F_GIFT); | 1663 | return -ENOMEM; |
1664 | |||
1665 | spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages, | ||
1666 | spd.partial, flags & SPLICE_F_GIFT, | ||
1667 | pipe->buffers); | ||
1612 | if (spd.nr_pages <= 0) | 1668 | if (spd.nr_pages <= 0) |
1613 | return spd.nr_pages; | 1669 | ret = spd.nr_pages; |
1670 | else | ||
1671 | ret = splice_to_pipe(pipe, &spd); | ||
1614 | 1672 | ||
1615 | return splice_to_pipe(pipe, &spd); | 1673 | splice_shrink_spd(pipe, &spd); |
1674 | return ret; | ||
1616 | } | 1675 | } |
1617 | 1676 | ||
1618 | /* | 1677 | /* |
@@ -1738,13 +1797,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) | |||
1738 | * Check ->nrbufs without the inode lock first. This function | 1797 | * Check ->nrbufs without the inode lock first. This function |
1739 | * is speculative anyways, so missing one is ok. | 1798 | * is speculative anyways, so missing one is ok. |
1740 | */ | 1799 | */ |
1741 | if (pipe->nrbufs < PIPE_BUFFERS) | 1800 | if (pipe->nrbufs < pipe->buffers) |
1742 | return 0; | 1801 | return 0; |
1743 | 1802 | ||
1744 | ret = 0; | 1803 | ret = 0; |
1745 | pipe_lock(pipe); | 1804 | pipe_lock(pipe); |
1746 | 1805 | ||
1747 | while (pipe->nrbufs >= PIPE_BUFFERS) { | 1806 | while (pipe->nrbufs >= pipe->buffers) { |
1748 | if (!pipe->readers) { | 1807 | if (!pipe->readers) { |
1749 | send_sig(SIGPIPE, current, 0); | 1808 | send_sig(SIGPIPE, current, 0); |
1750 | ret = -EPIPE; | 1809 | ret = -EPIPE; |
@@ -1810,7 +1869,7 @@ retry: | |||
1810 | * Cannot make any progress, because either the input | 1869 | * Cannot make any progress, because either the input |
1811 | * pipe is empty or the output pipe is full. | 1870 | * pipe is empty or the output pipe is full. |
1812 | */ | 1871 | */ |
1813 | if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) { | 1872 | if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) { |
1814 | /* Already processed some buffers, break */ | 1873 | /* Already processed some buffers, break */ |
1815 | if (ret) | 1874 | if (ret) |
1816 | break; | 1875 | break; |
@@ -1831,7 +1890,7 @@ retry: | |||
1831 | } | 1890 | } |
1832 | 1891 | ||
1833 | ibuf = ipipe->bufs + ipipe->curbuf; | 1892 | ibuf = ipipe->bufs + ipipe->curbuf; |
1834 | nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS; | 1893 | nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); |
1835 | obuf = opipe->bufs + nbuf; | 1894 | obuf = opipe->bufs + nbuf; |
1836 | 1895 | ||
1837 | if (len >= ibuf->len) { | 1896 | if (len >= ibuf->len) { |
@@ -1841,7 +1900,7 @@ retry: | |||
1841 | *obuf = *ibuf; | 1900 | *obuf = *ibuf; |
1842 | ibuf->ops = NULL; | 1901 | ibuf->ops = NULL; |
1843 | opipe->nrbufs++; | 1902 | opipe->nrbufs++; |
1844 | ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS; | 1903 | ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1); |
1845 | ipipe->nrbufs--; | 1904 | ipipe->nrbufs--; |
1846 | input_wakeup = true; | 1905 | input_wakeup = true; |
1847 | } else { | 1906 | } else { |
@@ -1914,11 +1973,11 @@ static int link_pipe(struct pipe_inode_info *ipipe, | |||
1914 | * If we have iterated all input buffers or ran out of | 1973 | * If we have iterated all input buffers or ran out of |
1915 | * output room, break. | 1974 | * output room, break. |
1916 | */ | 1975 | */ |
1917 | if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) | 1976 | if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) |
1918 | break; | 1977 | break; |
1919 | 1978 | ||
1920 | ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); | 1979 | ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1)); |
1921 | nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); | 1980 | nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); |
1922 | 1981 | ||
1923 | /* | 1982 | /* |
1924 | * Get a reference to this pipe buffer, | 1983 | * Get a reference to this pipe buffer, |
diff --git a/fs/sync.c b/fs/sync.c | |||
@@ -42,7 +42,7 @@ static int __sync_filesystem(struct super_block *sb, int wait) | |||
42 | if (wait) | 42 | if (wait) |
43 | sync_inodes_sb(sb); | 43 | sync_inodes_sb(sb); |
44 | else | 44 | else |
45 | writeback_inodes_sb(sb); | 45 | writeback_inodes_sb_locked(sb); |
46 | 46 | ||
47 | if (sb->s_op->sync_fs) | 47 | if (sb->s_op->sync_fs) |
48 | sb->s_op->sync_fs(sb, wait); | 48 | sb->s_op->sync_fs(sb, wait); |
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index e9002513e08f..f24dbe5efde3 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c | |||
@@ -725,7 +725,8 @@ void | |||
725 | xfs_blkdev_issue_flush( | 725 | xfs_blkdev_issue_flush( |
726 | xfs_buftarg_t *buftarg) | 726 | xfs_buftarg_t *buftarg) |
727 | { | 727 | { |
728 | blkdev_issue_flush(buftarg->bt_bdev, NULL); | 728 | blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL, |
729 | BLKDEV_IFL_WAIT); | ||
729 | } | 730 | } |
730 | 731 | ||
731 | STATIC void | 732 | STATIC void |
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index bd0e3c6f323f..e6e0cb5437e6 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/kernel.h> | 14 | #include <linux/kernel.h> |
15 | #include <linux/fs.h> | 15 | #include <linux/fs.h> |
16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
17 | #include <linux/timer.h> | ||
17 | #include <linux/writeback.h> | 18 | #include <linux/writeback.h> |
18 | #include <asm/atomic.h> | 19 | #include <asm/atomic.h> |
19 | 20 | ||
@@ -88,6 +89,8 @@ struct backing_dev_info { | |||
88 | 89 | ||
89 | struct device *dev; | 90 | struct device *dev; |
90 | 91 | ||
92 | struct timer_list laptop_mode_wb_timer; | ||
93 | |||
91 | #ifdef CONFIG_DEBUG_FS | 94 | #ifdef CONFIG_DEBUG_FS |
92 | struct dentry *debug_dir; | 95 | struct dentry *debug_dir; |
93 | struct dentry *debug_stats; | 96 | struct dentry *debug_stats; |
@@ -103,9 +106,10 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); | |||
103 | void bdi_unregister(struct backing_dev_info *bdi); | 106 | void bdi_unregister(struct backing_dev_info *bdi); |
104 | int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); | 107 | int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); |
105 | void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, | 108 | void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, |
106 | long nr_pages); | 109 | long nr_pages, int sb_locked); |
107 | int bdi_writeback_task(struct bdi_writeback *wb); | 110 | int bdi_writeback_task(struct bdi_writeback *wb); |
108 | int bdi_has_dirty_io(struct backing_dev_info *bdi); | 111 | int bdi_has_dirty_io(struct backing_dev_info *bdi); |
112 | void bdi_arm_supers_timer(void); | ||
109 | 113 | ||
110 | extern spinlock_t bdi_lock; | 114 | extern spinlock_t bdi_lock; |
111 | extern struct list_head bdi_list; | 115 | extern struct list_head bdi_list; |
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6690e8bae7bb..be411c12ebbe 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h | |||
@@ -186,15 +186,19 @@ struct request { | |||
186 | }; | 186 | }; |
187 | 187 | ||
188 | /* | 188 | /* |
189 | * two pointers are available for the IO schedulers, if they need | 189 | * Three pointers are available for the IO schedulers, if they need |
190 | * more they have to dynamically allocate it. | 190 | * more they have to dynamically allocate it. |
191 | */ | 191 | */ |
192 | void *elevator_private; | 192 | void *elevator_private; |
193 | void *elevator_private2; | 193 | void *elevator_private2; |
194 | void *elevator_private3; | ||
194 | 195 | ||
195 | struct gendisk *rq_disk; | 196 | struct gendisk *rq_disk; |
196 | unsigned long start_time; | 197 | unsigned long start_time; |
197 | 198 | #ifdef CONFIG_BLK_CGROUP | |
199 | unsigned long long start_time_ns; | ||
200 | unsigned long long io_start_time_ns; /* when passed to hardware */ | ||
201 | #endif | ||
198 | /* Number of scatter-gather DMA addr+len pairs after | 202 | /* Number of scatter-gather DMA addr+len pairs after |
199 | * physical address coalescing is performed. | 203 | * physical address coalescing is performed. |
200 | */ | 204 | */ |
@@ -917,7 +921,12 @@ extern void blk_abort_queue(struct request_queue *); | |||
917 | */ | 921 | */ |
918 | extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn, | 922 | extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn, |
919 | spinlock_t *lock, int node_id); | 923 | spinlock_t *lock, int node_id); |
924 | extern struct request_queue *blk_init_allocated_queue_node(struct request_queue *, | ||
925 | request_fn_proc *, | ||
926 | spinlock_t *, int node_id); | ||
920 | extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *); | 927 | extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *); |
928 | extern struct request_queue *blk_init_allocated_queue(struct request_queue *, | ||
929 | request_fn_proc *, spinlock_t *); | ||
921 | extern void blk_cleanup_queue(struct request_queue *); | 930 | extern void blk_cleanup_queue(struct request_queue *); |
922 | extern void blk_queue_make_request(struct request_queue *, make_request_fn *); | 931 | extern void blk_queue_make_request(struct request_queue *, make_request_fn *); |
923 | extern void blk_queue_bounce_limit(struct request_queue *, u64); | 932 | extern void blk_queue_bounce_limit(struct request_queue *, u64); |
@@ -994,20 +1003,25 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, | |||
994 | return NULL; | 1003 | return NULL; |
995 | return bqt->tag_index[tag]; | 1004 | return bqt->tag_index[tag]; |
996 | } | 1005 | } |
997 | 1006 | enum{ | |
998 | extern int blkdev_issue_flush(struct block_device *, sector_t *); | 1007 | BLKDEV_WAIT, /* wait for completion */ |
999 | #define DISCARD_FL_WAIT 0x01 /* wait for completion */ | 1008 | BLKDEV_BARRIER, /*issue request with barrier */ |
1000 | #define DISCARD_FL_BARRIER 0x02 /* issue DISCARD_BARRIER request */ | 1009 | }; |
1001 | extern int blkdev_issue_discard(struct block_device *, sector_t sector, | 1010 | #define BLKDEV_IFL_WAIT (1 << BLKDEV_WAIT) |
1002 | sector_t nr_sects, gfp_t, int flags); | 1011 | #define BLKDEV_IFL_BARRIER (1 << BLKDEV_BARRIER) |
1003 | 1012 | extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *, | |
1013 | unsigned long); | ||
1014 | extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | ||
1015 | sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); | ||
1016 | extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, | ||
1017 | sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); | ||
1004 | static inline int sb_issue_discard(struct super_block *sb, | 1018 | static inline int sb_issue_discard(struct super_block *sb, |
1005 | sector_t block, sector_t nr_blocks) | 1019 | sector_t block, sector_t nr_blocks) |
1006 | { | 1020 | { |
1007 | block <<= (sb->s_blocksize_bits - 9); | 1021 | block <<= (sb->s_blocksize_bits - 9); |
1008 | nr_blocks <<= (sb->s_blocksize_bits - 9); | 1022 | nr_blocks <<= (sb->s_blocksize_bits - 9); |
1009 | return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_KERNEL, | 1023 | return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_KERNEL, |
1010 | DISCARD_FL_BARRIER); | 1024 | BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); |
1011 | } | 1025 | } |
1012 | 1026 | ||
1013 | extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); | 1027 | extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); |
@@ -1196,6 +1210,39 @@ static inline void put_dev_sector(Sector p) | |||
1196 | struct work_struct; | 1210 | struct work_struct; |
1197 | int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); | 1211 | int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); |
1198 | 1212 | ||
1213 | #ifdef CONFIG_BLK_CGROUP | ||
1214 | static inline void set_start_time_ns(struct request *req) | ||
1215 | { | ||
1216 | req->start_time_ns = sched_clock(); | ||
1217 | } | ||
1218 | |||
1219 | static inline void set_io_start_time_ns(struct request *req) | ||
1220 | { | ||
1221 | req->io_start_time_ns = sched_clock(); | ||
1222 | } | ||
1223 | |||
1224 | static inline uint64_t rq_start_time_ns(struct request *req) | ||
1225 | { | ||
1226 | return req->start_time_ns; | ||
1227 | } | ||
1228 | |||
1229 | static inline uint64_t rq_io_start_time_ns(struct request *req) | ||
1230 | { | ||
1231 | return req->io_start_time_ns; | ||
1232 | } | ||
1233 | #else | ||
1234 | static inline void set_start_time_ns(struct request *req) {} | ||
1235 | static inline void set_io_start_time_ns(struct request *req) {} | ||
1236 | static inline uint64_t rq_start_time_ns(struct request *req) | ||
1237 | { | ||
1238 | return 0; | ||
1239 | } | ||
1240 | static inline uint64_t rq_io_start_time_ns(struct request *req) | ||
1241 | { | ||
1242 | return 0; | ||
1243 | } | ||
1244 | #endif | ||
1245 | |||
1199 | #define MODULE_ALIAS_BLOCKDEV(major,minor) \ | 1246 | #define MODULE_ALIAS_BLOCKDEV(major,minor) \ |
1200 | MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) | 1247 | MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) |
1201 | #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ | 1248 | #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ |
@@ -1283,8 +1330,7 @@ struct block_device_operations { | |||
1283 | int (*direct_access) (struct block_device *, sector_t, | 1330 | int (*direct_access) (struct block_device *, sector_t, |
1284 | void **, unsigned long *); | 1331 | void **, unsigned long *); |
1285 | int (*media_changed) (struct gendisk *); | 1332 | int (*media_changed) (struct gendisk *); |
1286 | unsigned long long (*set_capacity) (struct gendisk *, | 1333 | void (*unlock_native_capacity) (struct gendisk *); |
1287 | unsigned long long); | ||
1288 | int (*revalidate_disk) (struct gendisk *); | 1334 | int (*revalidate_disk) (struct gendisk *); |
1289 | int (*getgeo)(struct block_device *, struct hd_geometry *); | 1335 | int (*getgeo)(struct block_device *, struct hd_geometry *); |
1290 | struct module *owner; | 1336 | struct module *owner; |
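The new request fields and their accessors give CONFIG_BLK_CGROUP users per-request timestamps without #ifdefs at every call site: set_start_time_ns() is meant to run when the request is set up, set_io_start_time_ns() when it is passed to the hardware, and both accessors collapse to 0 when the option is off. An illustrative completion-side use (not taken from the patch):

	/* Illustrative sketch: derive queue-wait and service time for a
	 * completed request from the accessors added above. */
	static void example_account_completion(struct request *rq)
	{
		u64 wait_ns    = rq_io_start_time_ns(rq) - rq_start_time_ns(rq);
		u64 service_ns = sched_clock() - rq_io_start_time_ns(rq);

		/* With CONFIG_BLK_CGROUP=n both accessors return 0, so a
		 * real caller must not trust these values unconditionally. */
		(void)wait_ns;
		(void)service_ns;
	}
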
diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 4341b1a97a34..68530521ad00 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h | |||
@@ -53,10 +53,10 @@ | |||
53 | 53 | ||
54 | 54 | ||
55 | extern const char *drbd_buildtag(void); | 55 | extern const char *drbd_buildtag(void); |
56 | #define REL_VERSION "8.3.7" | 56 | #define REL_VERSION "8.3.8rc1" |
57 | #define API_VERSION 88 | 57 | #define API_VERSION 88 |
58 | #define PRO_VERSION_MIN 86 | 58 | #define PRO_VERSION_MIN 86 |
59 | #define PRO_VERSION_MAX 92 | 59 | #define PRO_VERSION_MAX 94 |
60 | 60 | ||
61 | 61 | ||
62 | enum drbd_io_error_p { | 62 | enum drbd_io_error_p { |
@@ -139,6 +139,7 @@ enum drbd_ret_codes { | |||
139 | ERR_DATA_NOT_CURRENT = 150, | 139 | ERR_DATA_NOT_CURRENT = 150, |
140 | ERR_CONNECTED = 151, /* DRBD 8.3 only */ | 140 | ERR_CONNECTED = 151, /* DRBD 8.3 only */ |
141 | ERR_PERM = 152, | 141 | ERR_PERM = 152, |
142 | ERR_NEED_APV_93 = 153, | ||
142 | 143 | ||
143 | /* insert new ones above this line */ | 144 | /* insert new ones above this line */ |
144 | AFTER_LAST_ERR_CODE | 145 | AFTER_LAST_ERR_CODE |
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index 51f47a586ad8..440b42e38e89 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h | |||
@@ -133,5 +133,21 @@ | |||
133 | #define DRBD_MAX_BIO_BVECS_MAX 128 | 133 | #define DRBD_MAX_BIO_BVECS_MAX 128 |
134 | #define DRBD_MAX_BIO_BVECS_DEF 0 | 134 | #define DRBD_MAX_BIO_BVECS_DEF 0 |
135 | 135 | ||
136 | #define DRBD_DP_VOLUME_MIN 4 | ||
137 | #define DRBD_DP_VOLUME_MAX 1048576 | ||
138 | #define DRBD_DP_VOLUME_DEF 16384 | ||
139 | |||
140 | #define DRBD_DP_INTERVAL_MIN 1 | ||
141 | #define DRBD_DP_INTERVAL_MAX 600 | ||
142 | #define DRBD_DP_INTERVAL_DEF 5 | ||
143 | |||
144 | #define DRBD_RS_THROTTLE_TH_MIN 1 | ||
145 | #define DRBD_RS_THROTTLE_TH_MAX 600 | ||
146 | #define DRBD_RS_THROTTLE_TH_DEF 20 | ||
147 | |||
148 | #define DRBD_RS_HOLD_OFF_TH_MIN 1 | ||
149 | #define DRBD_RS_HOLD_OFF_TH_MAX 6000 | ||
150 | #define DRBD_RS_HOLD_OFF_TH_DEF 100 | ||
151 | |||
136 | #undef RANGE | 152 | #undef RANGE |
137 | #endif | 153 | #endif |
diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h index f7431a4ca608..ce77a746fc9d 100644 --- a/include/linux/drbd_nl.h +++ b/include/linux/drbd_nl.h | |||
@@ -71,12 +71,17 @@ NL_PACKET(disconnect, 6, ) | |||
71 | NL_PACKET(resize, 7, | 71 | NL_PACKET(resize, 7, |
72 | NL_INT64( 29, T_MAY_IGNORE, resize_size) | 72 | NL_INT64( 29, T_MAY_IGNORE, resize_size) |
73 | NL_BIT( 68, T_MAY_IGNORE, resize_force) | 73 | NL_BIT( 68, T_MAY_IGNORE, resize_force) |
74 | NL_BIT( 69, T_MANDATORY, no_resync) | ||
74 | ) | 75 | ) |
75 | 76 | ||
76 | NL_PACKET(syncer_conf, 8, | 77 | NL_PACKET(syncer_conf, 8, |
77 | NL_INTEGER( 30, T_MAY_IGNORE, rate) | 78 | NL_INTEGER( 30, T_MAY_IGNORE, rate) |
78 | NL_INTEGER( 31, T_MAY_IGNORE, after) | 79 | NL_INTEGER( 31, T_MAY_IGNORE, after) |
79 | NL_INTEGER( 32, T_MAY_IGNORE, al_extents) | 80 | NL_INTEGER( 32, T_MAY_IGNORE, al_extents) |
81 | NL_INTEGER( 71, T_MAY_IGNORE, dp_volume) | ||
82 | NL_INTEGER( 72, T_MAY_IGNORE, dp_interval) | ||
83 | NL_INTEGER( 73, T_MAY_IGNORE, throttle_th) | ||
84 | NL_INTEGER( 74, T_MAY_IGNORE, hold_off_th) | ||
80 | NL_STRING( 52, T_MAY_IGNORE, verify_alg, SHARED_SECRET_MAX) | 85 | NL_STRING( 52, T_MAY_IGNORE, verify_alg, SHARED_SECRET_MAX) |
81 | NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32) | 86 | NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32) |
82 | NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX) | 87 | NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX) |
diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 1cb3372e65d8..2c958f4fce1e 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h | |||
@@ -14,6 +14,9 @@ typedef void (elevator_merged_fn) (struct request_queue *, struct request *, int | |||
14 | 14 | ||
15 | typedef int (elevator_allow_merge_fn) (struct request_queue *, struct request *, struct bio *); | 15 | typedef int (elevator_allow_merge_fn) (struct request_queue *, struct request *, struct bio *); |
16 | 16 | ||
17 | typedef void (elevator_bio_merged_fn) (struct request_queue *, | ||
18 | struct request *, struct bio *); | ||
19 | |||
17 | typedef int (elevator_dispatch_fn) (struct request_queue *, int); | 20 | typedef int (elevator_dispatch_fn) (struct request_queue *, int); |
18 | 21 | ||
19 | typedef void (elevator_add_req_fn) (struct request_queue *, struct request *); | 22 | typedef void (elevator_add_req_fn) (struct request_queue *, struct request *); |
@@ -36,6 +39,7 @@ struct elevator_ops | |||
36 | elevator_merged_fn *elevator_merged_fn; | 39 | elevator_merged_fn *elevator_merged_fn; |
37 | elevator_merge_req_fn *elevator_merge_req_fn; | 40 | elevator_merge_req_fn *elevator_merge_req_fn; |
38 | elevator_allow_merge_fn *elevator_allow_merge_fn; | 41 | elevator_allow_merge_fn *elevator_allow_merge_fn; |
42 | elevator_bio_merged_fn *elevator_bio_merged_fn; | ||
39 | 43 | ||
40 | elevator_dispatch_fn *elevator_dispatch_fn; | 44 | elevator_dispatch_fn *elevator_dispatch_fn; |
41 | elevator_add_req_fn *elevator_add_req_fn; | 45 | elevator_add_req_fn *elevator_add_req_fn; |
@@ -103,6 +107,8 @@ extern int elv_merge(struct request_queue *, struct request **, struct bio *); | |||
103 | extern void elv_merge_requests(struct request_queue *, struct request *, | 107 | extern void elv_merge_requests(struct request_queue *, struct request *, |
104 | struct request *); | 108 | struct request *); |
105 | extern void elv_merged_request(struct request_queue *, struct request *, int); | 109 | extern void elv_merged_request(struct request_queue *, struct request *, int); |
110 | extern void elv_bio_merged(struct request_queue *q, struct request *, | ||
111 | struct bio *); | ||
106 | extern void elv_requeue_request(struct request_queue *, struct request *); | 112 | extern void elv_requeue_request(struct request_queue *, struct request *); |
107 | extern int elv_queue_empty(struct request_queue *); | 113 | extern int elv_queue_empty(struct request_queue *); |
108 | extern struct request *elv_former_request(struct request_queue *, struct request *); | 114 | extern struct request *elv_former_request(struct request_queue *, struct request *); |
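The new elevator_bio_merged_fn hook fires when a bio is merged into an already-queued request, which is what lets CFQ's cgroup code count merged IO per group. Registering it is one more member in the ops table; the names below are placeholders, not code from the patch:

	/* Placeholder scheduler wiring up the new hook. */
	static void example_bio_merged(struct request_queue *q, struct request *rq,
				       struct bio *bio)
	{
		/* e.g. bump a per-group "merged" statistic here */
	}

	static struct elevator_type example_iosched = {
		.ops = {
			.elevator_bio_merged_fn = example_bio_merged,
			/* ...dispatch/add/merge hooks as usual... */
		},
		.elevator_name = "example",
	};
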
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h index 86037400a6e3..afc00af3229b 100644 --- a/include/linux/fcntl.h +++ b/include/linux/fcntl.h | |||
@@ -22,6 +22,12 @@ | |||
22 | #define F_NOTIFY (F_LINUX_SPECIFIC_BASE+2) | 22 | #define F_NOTIFY (F_LINUX_SPECIFIC_BASE+2) |
23 | 23 | ||
24 | /* | 24 | /* |
25 | * Set and get of pipe page size array | ||
26 | */ | ||
27 | #define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7) | ||
28 | #define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8) | ||
29 | |||
30 | /* | ||
25 | * Types of directory notifications that may be requested. | 31 | * Types of directory notifications that may be requested. |
26 | */ | 32 | */ |
27 | #define DN_ACCESS 0x00000001 /* File accessed */ | 33 | #define DN_ACCESS 0x00000001 /* File accessed */ |
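F_SETPIPE_SZ and F_GETPIPE_SZ are the user-visible half of the resizable-pipe work: a process can grow or shrink the buffer ring of an existing pipe, within the fs.pipe-max-pages limit added later in this series. A userspace sketch, assuming the new constants are visible through the updated headers and that the argument is a buffer (page) count as in this series:

	/* Userspace sketch: grow a pipe, then read back what was granted. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fds[2];
		long bufs;

		if (pipe(fds))
			return 1;

		if (fcntl(fds[1], F_SETPIPE_SZ, 64) < 0)	/* ask for 64 buffers */
			perror("F_SETPIPE_SZ");

		bufs = fcntl(fds[1], F_GETPIPE_SZ);
		printf("pipe now has %ld buffers\n", bufs);
		return 0;
	}
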
diff --git a/include/linux/fs.h b/include/linux/fs.h index 4079ef99900f..1775d362732d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -651,6 +651,7 @@ struct block_device { | |||
651 | int bd_openers; | 651 | int bd_openers; |
652 | struct mutex bd_mutex; /* open/close mutex */ | 652 | struct mutex bd_mutex; /* open/close mutex */ |
653 | struct list_head bd_inodes; | 653 | struct list_head bd_inodes; |
654 | void * bd_claiming; | ||
654 | void * bd_holder; | 655 | void * bd_holder; |
655 | int bd_holders; | 656 | int bd_holders; |
656 | #ifdef CONFIG_SYSFS | 657 | #ifdef CONFIG_SYSFS |
diff --git a/include/linux/ide.h b/include/linux/ide.h index 3239d1c10acb..b6d448048ae2 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h | |||
@@ -362,7 +362,7 @@ struct ide_drive_s; | |||
362 | struct ide_disk_ops { | 362 | struct ide_disk_ops { |
363 | int (*check)(struct ide_drive_s *, const char *); | 363 | int (*check)(struct ide_drive_s *, const char *); |
364 | int (*get_capacity)(struct ide_drive_s *); | 364 | int (*get_capacity)(struct ide_drive_s *); |
365 | u64 (*set_capacity)(struct ide_drive_s *, u64); | 365 | void (*unlock_native_capacity)(struct ide_drive_s *); |
366 | void (*setup)(struct ide_drive_s *); | 366 | void (*setup)(struct ide_drive_s *); |
367 | void (*flush)(struct ide_drive_s *); | 367 | void (*flush)(struct ide_drive_s *); |
368 | int (*init_media)(struct ide_drive_s *, struct gendisk *); | 368 | int (*init_media)(struct ide_drive_s *, struct gendisk *); |
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index b43a9e039059..16de3933c45e 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h | |||
@@ -3,7 +3,7 @@ | |||
3 | 3 | ||
4 | #define PIPEFS_MAGIC 0x50495045 | 4 | #define PIPEFS_MAGIC 0x50495045 |
5 | 5 | ||
6 | #define PIPE_BUFFERS (16) | 6 | #define PIPE_DEF_BUFFERS 16 |
7 | 7 | ||
8 | #define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */ | 8 | #define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */ |
9 | #define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ | 9 | #define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ |
@@ -44,17 +44,17 @@ struct pipe_buffer { | |||
44 | **/ | 44 | **/ |
45 | struct pipe_inode_info { | 45 | struct pipe_inode_info { |
46 | wait_queue_head_t wait; | 46 | wait_queue_head_t wait; |
47 | unsigned int nrbufs, curbuf; | 47 | unsigned int nrbufs, curbuf, buffers; |
48 | struct page *tmp_page; | ||
49 | unsigned int readers; | 48 | unsigned int readers; |
50 | unsigned int writers; | 49 | unsigned int writers; |
51 | unsigned int waiting_writers; | 50 | unsigned int waiting_writers; |
52 | unsigned int r_counter; | 51 | unsigned int r_counter; |
53 | unsigned int w_counter; | 52 | unsigned int w_counter; |
53 | struct page *tmp_page; | ||
54 | struct fasync_struct *fasync_readers; | 54 | struct fasync_struct *fasync_readers; |
55 | struct fasync_struct *fasync_writers; | 55 | struct fasync_struct *fasync_writers; |
56 | struct inode *inode; | 56 | struct inode *inode; |
57 | struct pipe_buffer bufs[PIPE_BUFFERS]; | 57 | struct pipe_buffer *bufs; |
58 | }; | 58 | }; |
59 | 59 | ||
60 | /* | 60 | /* |
@@ -139,6 +139,8 @@ void pipe_lock(struct pipe_inode_info *); | |||
139 | void pipe_unlock(struct pipe_inode_info *); | 139 | void pipe_unlock(struct pipe_inode_info *); |
140 | void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); | 140 | void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); |
141 | 141 | ||
142 | extern unsigned int pipe_max_pages; | ||
143 | |||
142 | /* Drop the inode semaphore and wait for a pipe event, atomically */ | 144 | /* Drop the inode semaphore and wait for a pipe event, atomically */ |
143 | void pipe_wait(struct pipe_inode_info *pipe); | 145 | void pipe_wait(struct pipe_inode_info *pipe); |
144 | 146 | ||
@@ -154,4 +156,7 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); | |||
154 | int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); | 156 | int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); |
155 | void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); | 157 | void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); |
156 | 158 | ||
159 | /* for F_SETPIPE_SZ and F_GETPIPE_SZ */ | ||
160 | long pipe_fcntl(struct file *, unsigned int, unsigned long arg); | ||
161 | |||
157 | #endif | 162 | #endif |
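With the buffer array now allocated separately and its size kept in pipe->buffers, all of the ring arithmetic that used the PIPE_BUFFERS constant masks with the per-pipe value instead, as the splice.c hunks earlier in this diff show; that only works if buffers stays a power of two, which the resize code is expected to guarantee. The index math, condensed:

	/* Sketch of the ring arithmetic, assuming pipe->buffers is a
	 * power of two. */
	static struct pipe_buffer *next_free_slot(struct pipe_inode_info *pipe)
	{
		unsigned int idx = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);

		return pipe->bufs + idx;
	}
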
diff --git a/include/linux/splice.h b/include/linux/splice.h index 18e7c7c0cae6..997c3b4c212b 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h | |||
@@ -82,4 +82,11 @@ extern ssize_t splice_to_pipe(struct pipe_inode_info *, | |||
82 | extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, | 82 | extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, |
83 | splice_direct_actor *); | 83 | splice_direct_actor *); |
84 | 84 | ||
85 | /* | ||
86 | * for dynamic pipe sizing | ||
87 | */ | ||
88 | extern int splice_grow_spd(struct pipe_inode_info *, struct splice_pipe_desc *); | ||
89 | extern void splice_shrink_spd(struct pipe_inode_info *, | ||
90 | struct splice_pipe_desc *); | ||
91 | |||
85 | #endif | 92 | #endif |
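splice_grow_spd() and splice_shrink_spd() let a splice actor keep small on-stack page/partial arrays for the default 16-buffer case and fall back to heap allocations only when the destination pipe has been grown; the relay, ftrace and skbuff hunks below all follow the same pattern. Condensed, with the caller-specific parts elided:

	/* Condensed sketch of the grow/shrink pattern used by the
	 * converted callers; .ops/.spd_release are caller-specific. */
	static ssize_t example_splice_actor(struct file *in,
					    struct pipe_inode_info *pipe,
					    size_t len, unsigned int flags)
	{
		struct page *pages[PIPE_DEF_BUFFERS];
		struct partial_page partial[PIPE_DEF_BUFFERS];
		struct splice_pipe_desc spd = {
			.pages = pages,
			.partial = partial,
			.nr_pages = 0,
			.flags = flags,
		};
		ssize_t ret;

		if (splice_grow_spd(pipe, &spd))
			return -ENOMEM;

		/* From here on index through spd.pages[]/spd.partial[], not
		 * the on-stack arrays: growing may have replaced them. Fill
		 * up to pipe->buffers entries, then: */
		ret = splice_to_pipe(pipe, &spd);

		splice_shrink_spd(pipe, &spd);	/* frees any heap copies */
		return ret;
	}
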
diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 36520ded3e06..cc97d6caf2b3 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h | |||
@@ -65,6 +65,15 @@ struct writeback_control { | |||
65 | * so we use a single control to update them | 65 | * so we use a single control to update them |
66 | */ | 66 | */ |
67 | unsigned no_nrwrite_index_update:1; | 67 | unsigned no_nrwrite_index_update:1; |
68 | |||
69 | /* | ||
70 | * For WB_SYNC_ALL, the sb must always be pinned. For WB_SYNC_NONE, | ||
71 | * the writeback code will pin the sb for the caller. However, | ||
72 | * for eg umount, the caller does WB_SYNC_NONE but already has | ||
73 | * the sb pinned. If the below is set, caller already has the | ||
74 | * sb pinned. | ||
75 | */ | ||
76 | unsigned sb_pinned:1; | ||
68 | }; | 77 | }; |
69 | 78 | ||
70 | /* | 79 | /* |
@@ -73,6 +82,7 @@ struct writeback_control { | |||
73 | struct bdi_writeback; | 82 | struct bdi_writeback; |
74 | int inode_wait(void *); | 83 | int inode_wait(void *); |
75 | void writeback_inodes_sb(struct super_block *); | 84 | void writeback_inodes_sb(struct super_block *); |
85 | void writeback_inodes_sb_locked(struct super_block *); | ||
76 | int writeback_inodes_sb_if_idle(struct super_block *); | 86 | int writeback_inodes_sb_if_idle(struct super_block *); |
77 | void sync_inodes_sb(struct super_block *); | 87 | void sync_inodes_sb(struct super_block *); |
78 | void writeback_inodes_wbc(struct writeback_control *wbc); | 88 | void writeback_inodes_wbc(struct writeback_control *wbc); |
@@ -96,8 +106,14 @@ static inline void inode_sync_wait(struct inode *inode) | |||
96 | /* | 106 | /* |
97 | * mm/page-writeback.c | 107 | * mm/page-writeback.c |
98 | */ | 108 | */ |
99 | void laptop_io_completion(void); | 109 | #ifdef CONFIG_BLOCK |
110 | void laptop_io_completion(struct backing_dev_info *info); | ||
100 | void laptop_sync_completion(void); | 111 | void laptop_sync_completion(void); |
112 | void laptop_mode_sync(struct work_struct *work); | ||
113 | void laptop_mode_timer_fn(unsigned long data); | ||
114 | #else | ||
115 | static inline void laptop_sync_completion(void) { } | ||
116 | #endif | ||
101 | void throttle_vm_writeout(gfp_t gfp_mask); | 117 | void throttle_vm_writeout(gfp_t gfp_mask); |
102 | 118 | ||
103 | /* These are exported to sysctl. */ | 119 | /* These are exported to sysctl. */ |
diff --git a/init/Kconfig b/init/Kconfig index 5fe94b82e4c0..2cce9f343ad0 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -611,6 +611,33 @@ config RT_GROUP_SCHED | |||
611 | 611 | ||
612 | endif #CGROUP_SCHED | 612 | endif #CGROUP_SCHED |
613 | 613 | ||
614 | config BLK_CGROUP | ||
615 | tristate "Block IO controller" | ||
616 | depends on CGROUPS && BLOCK | ||
617 | default n | ||
618 | ---help--- | ||
619 | Generic block IO controller cgroup interface. This is the common | ||
620 | cgroup interface which should be used by various IO controlling | ||
621 | policies. | ||
622 | |||
623 | Currently, CFQ IO scheduler uses it to recognize task groups and | ||
624 | control disk bandwidth allocation (proportional time slice allocation) | ||
625 | to such task groups. | ||
626 | |||
627 | This option only enables generic Block IO controller infrastructure. | ||
628 | One needs to also enable actual IO controlling logic in CFQ for it | ||
629 | to take effect. (CONFIG_CFQ_GROUP_IOSCHED=y). | ||
630 | |||
631 | See Documentation/cgroups/blkio-controller.txt for more information. | ||
632 | |||
633 | config DEBUG_BLK_CGROUP | ||
634 | bool "Enable Block IO controller debugging" | ||
635 | depends on BLK_CGROUP | ||
636 | default n | ||
637 | ---help--- | ||
638 | Enable some debugging help. Currently it exports additional stat | ||
639 | files in a cgroup which can be useful for debugging. | ||
640 | |||
614 | endif # CGROUPS | 641 | endif # CGROUPS |
615 | 642 | ||
616 | config MM_OWNER | 643 | config MM_OWNER |
diff --git a/kernel/relay.c b/kernel/relay.c index 3d97f2821611..4268287148c1 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
@@ -1231,8 +1231,8 @@ static ssize_t subbuf_splice_actor(struct file *in, | |||
1231 | size_t read_subbuf = read_start / subbuf_size; | 1231 | size_t read_subbuf = read_start / subbuf_size; |
1232 | size_t padding = rbuf->padding[read_subbuf]; | 1232 | size_t padding = rbuf->padding[read_subbuf]; |
1233 | size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; | 1233 | size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; |
1234 | struct page *pages[PIPE_BUFFERS]; | 1234 | struct page *pages[PIPE_DEF_BUFFERS]; |
1235 | struct partial_page partial[PIPE_BUFFERS]; | 1235 | struct partial_page partial[PIPE_DEF_BUFFERS]; |
1236 | struct splice_pipe_desc spd = { | 1236 | struct splice_pipe_desc spd = { |
1237 | .pages = pages, | 1237 | .pages = pages, |
1238 | .nr_pages = 0, | 1238 | .nr_pages = 0, |
@@ -1245,6 +1245,8 @@ static ssize_t subbuf_splice_actor(struct file *in, | |||
1245 | 1245 | ||
1246 | if (rbuf->subbufs_produced == rbuf->subbufs_consumed) | 1246 | if (rbuf->subbufs_produced == rbuf->subbufs_consumed) |
1247 | return 0; | 1247 | return 0; |
1248 | if (splice_grow_spd(pipe, &spd)) | ||
1249 | return -ENOMEM; | ||
1248 | 1250 | ||
1249 | /* | 1251 | /* |
1250 | * Adjust read len, if longer than what is available | 1252 | * Adjust read len, if longer than what is available |
@@ -1255,7 +1257,7 @@ static ssize_t subbuf_splice_actor(struct file *in, | |||
1255 | subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; | 1257 | subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; |
1256 | pidx = (read_start / PAGE_SIZE) % subbuf_pages; | 1258 | pidx = (read_start / PAGE_SIZE) % subbuf_pages; |
1257 | poff = read_start & ~PAGE_MASK; | 1259 | poff = read_start & ~PAGE_MASK; |
1258 | nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS); | 1260 | nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers); |
1259 | 1261 | ||
1260 | for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { | 1262 | for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { |
1261 | unsigned int this_len, this_end, private; | 1263 | unsigned int this_len, this_end, private; |
@@ -1289,16 +1291,19 @@ static ssize_t subbuf_splice_actor(struct file *in, | |||
1289 | } | 1291 | } |
1290 | } | 1292 | } |
1291 | 1293 | ||
1294 | ret = 0; | ||
1292 | if (!spd.nr_pages) | 1295 | if (!spd.nr_pages) |
1293 | return 0; | 1296 | goto out; |
1294 | 1297 | ||
1295 | ret = *nonpad_ret = splice_to_pipe(pipe, &spd); | 1298 | ret = *nonpad_ret = splice_to_pipe(pipe, &spd); |
1296 | if (ret < 0 || ret < total_len) | 1299 | if (ret < 0 || ret < total_len) |
1297 | return ret; | 1300 | goto out; |
1298 | 1301 | ||
1299 | if (read_start + ret == nonpad_end) | 1302 | if (read_start + ret == nonpad_end) |
1300 | ret += padding; | 1303 | ret += padding; |
1301 | 1304 | ||
1305 | out: | ||
1306 | splice_shrink_spd(pipe, &spd); | ||
1302 | return ret; | 1307 | return ret; |
1303 | } | 1308 | } |
1304 | 1309 | ||
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 5b496132c28a..906a0f718cb3 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c | |||
@@ -41,6 +41,7 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
41 | return (unsigned long long)(jiffies - INITIAL_JIFFIES) | 41 | return (unsigned long long)(jiffies - INITIAL_JIFFIES) |
42 | * (NSEC_PER_SEC / HZ); | 42 | * (NSEC_PER_SEC / HZ); |
43 | } | 43 | } |
44 | EXPORT_SYMBOL_GPL(sched_clock); | ||
44 | 45 | ||
45 | static __read_mostly int sched_clock_running; | 46 | static __read_mostly int sched_clock_running; |
46 | 47 | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b12583047757..18821e77b2a0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -52,6 +52,7 @@ | |||
52 | #include <linux/slow-work.h> | 52 | #include <linux/slow-work.h> |
53 | #include <linux/perf_event.h> | 53 | #include <linux/perf_event.h> |
54 | #include <linux/kprobes.h> | 54 | #include <linux/kprobes.h> |
55 | #include <linux/pipe_fs_i.h> | ||
55 | 56 | ||
56 | #include <asm/uaccess.h> | 57 | #include <asm/uaccess.h> |
57 | #include <asm/processor.h> | 58 | #include <asm/processor.h> |
@@ -1444,6 +1445,14 @@ static struct ctl_table fs_table[] = { | |||
1444 | .child = binfmt_misc_table, | 1445 | .child = binfmt_misc_table, |
1445 | }, | 1446 | }, |
1446 | #endif | 1447 | #endif |
1448 | { | ||
1449 | .procname = "pipe-max-pages", | ||
1450 | .data = &pipe_max_pages, | ||
1451 | .maxlen = sizeof(int), | ||
1452 | .mode = 0644, | ||
1453 | .proc_handler = &proc_dointvec_minmax, | ||
1454 | .extra1 = &two, | ||
1455 | }, | ||
1447 | /* | 1456 | /* |
1448 | * NOTE: do not add new entries to this table unless you have read | 1457 | * NOTE: do not add new entries to this table unless you have read |
1449 | * Documentation/sysctl/ctl_unnumbered.txt | 1458 | * Documentation/sysctl/ctl_unnumbered.txt |
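The fs.pipe-max-pages sysctl caps how large F_SETPIPE_SZ may grow a pipe; proc_dointvec_minmax with extra1 pointing at two means values below two pages are rejected. A userspace sketch of raising the cap, with the procfs path derived from the fs_table entry above:

	/* Userspace sketch: raise the pipe size cap. */
	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/sys/fs/pipe-max-pages", "w");

		if (!f)
			return 1;
		fprintf(f, "%d\n", 256);	/* allow pipes of up to 256 pages */
		return fclose(f) ? 1 : 0;
	}
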
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 756d7283318b..8a76339a9e65 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -3309,12 +3309,12 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
3309 | size_t len, | 3309 | size_t len, |
3310 | unsigned int flags) | 3310 | unsigned int flags) |
3311 | { | 3311 | { |
3312 | struct page *pages[PIPE_BUFFERS]; | 3312 | struct page *pages_def[PIPE_DEF_BUFFERS]; |
3313 | struct partial_page partial[PIPE_BUFFERS]; | 3313 | struct partial_page partial_def[PIPE_DEF_BUFFERS]; |
3314 | struct trace_iterator *iter = filp->private_data; | 3314 | struct trace_iterator *iter = filp->private_data; |
3315 | struct splice_pipe_desc spd = { | 3315 | struct splice_pipe_desc spd = { |
3316 | .pages = pages, | 3316 | .pages = pages_def, |
3317 | .partial = partial, | 3317 | .partial = partial_def, |
3318 | .nr_pages = 0, /* This gets updated below. */ | 3318 | .nr_pages = 0, /* This gets updated below. */ |
3319 | .flags = flags, | 3319 | .flags = flags, |
3320 | .ops = &tracing_pipe_buf_ops, | 3320 | .ops = &tracing_pipe_buf_ops, |
@@ -3325,6 +3325,9 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
3325 | size_t rem; | 3325 | size_t rem; |
3326 | unsigned int i; | 3326 | unsigned int i; |
3327 | 3327 | ||
3328 | if (splice_grow_spd(pipe, &spd)) | ||
3329 | return -ENOMEM; | ||
3330 | |||
3328 | /* copy the tracer to avoid using a global lock all around */ | 3331 | /* copy the tracer to avoid using a global lock all around */ |
3329 | mutex_lock(&trace_types_lock); | 3332 | mutex_lock(&trace_types_lock); |
3330 | if (unlikely(old_tracer != current_trace && current_trace)) { | 3333 | if (unlikely(old_tracer != current_trace && current_trace)) { |
@@ -3355,23 +3358,23 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
3355 | trace_access_lock(iter->cpu_file); | 3358 | trace_access_lock(iter->cpu_file); |
3356 | 3359 | ||
3357 | /* Fill as many pages as possible. */ | 3360 | /* Fill as many pages as possible. */ |
3358 | for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { | 3361 | for (i = 0, rem = len; i < pipe->buffers && rem; i++) { |
3359 | pages[i] = alloc_page(GFP_KERNEL); | 3362 | spd.pages[i] = alloc_page(GFP_KERNEL); |
3360 | if (!pages[i]) | 3363 | if (!spd.pages[i]) |
3361 | break; | 3364 | break; |
3362 | 3365 | ||
3363 | rem = tracing_fill_pipe_page(rem, iter); | 3366 | rem = tracing_fill_pipe_page(rem, iter); |
3364 | 3367 | ||
3365 | /* Copy the data into the page, so we can start over. */ | 3368 | /* Copy the data into the page, so we can start over. */ |
3366 | ret = trace_seq_to_buffer(&iter->seq, | 3369 | ret = trace_seq_to_buffer(&iter->seq, |
3367 | page_address(pages[i]), | 3370 | page_address(spd.pages[i]), |
3368 | iter->seq.len); | 3371 | iter->seq.len); |
3369 | if (ret < 0) { | 3372 | if (ret < 0) { |
3370 | __free_page(pages[i]); | 3373 | __free_page(spd.pages[i]); |
3371 | break; | 3374 | break; |
3372 | } | 3375 | } |
3373 | partial[i].offset = 0; | 3376 | spd.partial[i].offset = 0; |
3374 | partial[i].len = iter->seq.len; | 3377 | spd.partial[i].len = iter->seq.len; |
3375 | 3378 | ||
3376 | trace_seq_init(&iter->seq); | 3379 | trace_seq_init(&iter->seq); |
3377 | } | 3380 | } |
@@ -3382,12 +3385,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
3382 | 3385 | ||
3383 | spd.nr_pages = i; | 3386 | spd.nr_pages = i; |
3384 | 3387 | ||
3385 | return splice_to_pipe(pipe, &spd); | 3388 | ret = splice_to_pipe(pipe, &spd); |
3389 | out: | ||
3390 | splice_shrink_spd(pipe, &spd); | ||
3391 | return ret; | ||
3386 | 3392 | ||
3387 | out_err: | 3393 | out_err: |
3388 | mutex_unlock(&iter->mutex); | 3394 | mutex_unlock(&iter->mutex); |
3389 | 3395 | goto out; | |
3390 | return ret; | ||
3391 | } | 3396 | } |
3392 | 3397 | ||
3393 | static ssize_t | 3398 | static ssize_t |
@@ -3786,11 +3791,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
3786 | unsigned int flags) | 3791 | unsigned int flags) |
3787 | { | 3792 | { |
3788 | struct ftrace_buffer_info *info = file->private_data; | 3793 | struct ftrace_buffer_info *info = file->private_data; |
3789 | struct partial_page partial[PIPE_BUFFERS]; | 3794 | struct partial_page partial_def[PIPE_DEF_BUFFERS]; |
3790 | struct page *pages[PIPE_BUFFERS]; | 3795 | struct page *pages_def[PIPE_DEF_BUFFERS]; |
3791 | struct splice_pipe_desc spd = { | 3796 | struct splice_pipe_desc spd = { |
3792 | .pages = pages, | 3797 | .pages = pages_def, |
3793 | .partial = partial, | 3798 | .partial = partial_def, |
3794 | .flags = flags, | 3799 | .flags = flags, |
3795 | .ops = &buffer_pipe_buf_ops, | 3800 | .ops = &buffer_pipe_buf_ops, |
3796 | .spd_release = buffer_spd_release, | 3801 | .spd_release = buffer_spd_release, |
@@ -3799,22 +3804,28 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
3799 | int entries, size, i; | 3804 | int entries, size, i; |
3800 | size_t ret; | 3805 | size_t ret; |
3801 | 3806 | ||
3807 | if (splice_grow_spd(pipe, &spd)) | ||
3808 | return -ENOMEM; | ||
3809 | |||
3802 | if (*ppos & (PAGE_SIZE - 1)) { | 3810 | if (*ppos & (PAGE_SIZE - 1)) { |
3803 | WARN_ONCE(1, "Ftrace: previous read must page-align\n"); | 3811 | WARN_ONCE(1, "Ftrace: previous read must page-align\n"); |
3804 | return -EINVAL; | 3812 | ret = -EINVAL; |
3813 | goto out; | ||
3805 | } | 3814 | } |
3806 | 3815 | ||
3807 | if (len & (PAGE_SIZE - 1)) { | 3816 | if (len & (PAGE_SIZE - 1)) { |
3808 | WARN_ONCE(1, "Ftrace: splice_read should page-align\n"); | 3817 | WARN_ONCE(1, "Ftrace: splice_read should page-align\n"); |
3809 | if (len < PAGE_SIZE) | 3818 | if (len < PAGE_SIZE) { |
3810 | return -EINVAL; | 3819 | ret = -EINVAL; |
3820 | goto out; | ||
3821 | } | ||
3811 | len &= PAGE_MASK; | 3822 | len &= PAGE_MASK; |
3812 | } | 3823 | } |
3813 | 3824 | ||
3814 | trace_access_lock(info->cpu); | 3825 | trace_access_lock(info->cpu); |
3815 | entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); | 3826 | entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); |
3816 | 3827 | ||
3817 | for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { | 3828 | for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { |
3818 | struct page *page; | 3829 | struct page *page; |
3819 | int r; | 3830 | int r; |
3820 | 3831 | ||
@@ -3869,11 +3880,12 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
3869 | else | 3880 | else |
3870 | ret = 0; | 3881 | ret = 0; |
3871 | /* TODO: block */ | 3882 | /* TODO: block */ |
3872 | return ret; | 3883 | goto out; |
3873 | } | 3884 | } |
3874 | 3885 | ||
3875 | ret = splice_to_pipe(pipe, &spd); | 3886 | ret = splice_to_pipe(pipe, &spd); |
3876 | 3887 | splice_shrink_spd(pipe, &spd); | |
3888 | out: | ||
3877 | return ret; | 3889 | return ret; |
3878 | } | 3890 | } |
3879 | 3891 | ||
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 707d0dc6da0f..660a87a22511 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c | |||
@@ -48,7 +48,6 @@ static struct timer_list sync_supers_timer; | |||
48 | 48 | ||
49 | static int bdi_sync_supers(void *); | 49 | static int bdi_sync_supers(void *); |
50 | static void sync_supers_timer_fn(unsigned long); | 50 | static void sync_supers_timer_fn(unsigned long); |
51 | static void arm_supers_timer(void); | ||
52 | 51 | ||
53 | static void bdi_add_default_flusher_task(struct backing_dev_info *bdi); | 52 | static void bdi_add_default_flusher_task(struct backing_dev_info *bdi); |
54 | 53 | ||
@@ -252,7 +251,7 @@ static int __init default_bdi_init(void) | |||
252 | 251 | ||
253 | init_timer(&sync_supers_timer); | 252 | init_timer(&sync_supers_timer); |
254 | setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); | 253 | setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); |
255 | arm_supers_timer(); | 254 | bdi_arm_supers_timer(); |
256 | 255 | ||
257 | err = bdi_init(&default_backing_dev_info); | 256 | err = bdi_init(&default_backing_dev_info); |
258 | if (!err) | 257 | if (!err) |
@@ -374,10 +373,13 @@ static int bdi_sync_supers(void *unused) | |||
374 | return 0; | 373 | return 0; |
375 | } | 374 | } |
376 | 375 | ||
377 | static void arm_supers_timer(void) | 376 | void bdi_arm_supers_timer(void) |
378 | { | 377 | { |
379 | unsigned long next; | 378 | unsigned long next; |
380 | 379 | ||
380 | if (!dirty_writeback_interval) | ||
381 | return; | ||
382 | |||
381 | next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies; | 383 | next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies; |
382 | mod_timer(&sync_supers_timer, round_jiffies_up(next)); | 384 | mod_timer(&sync_supers_timer, round_jiffies_up(next)); |
383 | } | 385 | } |
@@ -385,7 +387,7 @@ static void arm_supers_timer(void) | |||
385 | static void sync_supers_timer_fn(unsigned long unused) | 387 | static void sync_supers_timer_fn(unsigned long unused) |
386 | { | 388 | { |
387 | wake_up_process(sync_supers_tsk); | 389 | wake_up_process(sync_supers_tsk); |
388 | arm_supers_timer(); | 390 | bdi_arm_supers_timer(); |
389 | } | 391 | } |
390 | 392 | ||
391 | static int bdi_forker_task(void *ptr) | 393 | static int bdi_forker_task(void *ptr) |
@@ -428,7 +430,10 @@ static int bdi_forker_task(void *ptr) | |||
428 | 430 | ||
429 | spin_unlock_bh(&bdi_lock); | 431 | spin_unlock_bh(&bdi_lock); |
430 | wait = msecs_to_jiffies(dirty_writeback_interval * 10); | 432 | wait = msecs_to_jiffies(dirty_writeback_interval * 10); |
431 | schedule_timeout(wait); | 433 | if (wait) |
434 | schedule_timeout(wait); | ||
435 | else | ||
436 | schedule(); | ||
432 | try_to_freeze(); | 437 | try_to_freeze(); |
433 | continue; | 438 | continue; |
434 | } | 439 | } |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0b19943ecf8b..b289310e2c89 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -597,7 +597,7 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
597 | (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) | 597 | (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) |
598 | + global_page_state(NR_UNSTABLE_NFS)) | 598 | + global_page_state(NR_UNSTABLE_NFS)) |
599 | > background_thresh))) | 599 | > background_thresh))) |
600 | bdi_start_writeback(bdi, NULL, 0); | 600 | bdi_start_writeback(bdi, NULL, 0, 0); |
601 | } | 601 | } |
602 | 602 | ||
603 | void set_page_dirty_balance(struct page *page, int page_mkwrite) | 603 | void set_page_dirty_balance(struct page *page, int page_mkwrite) |
@@ -683,10 +683,6 @@ void throttle_vm_writeout(gfp_t gfp_mask) | |||
683 | } | 683 | } |
684 | } | 684 | } |
685 | 685 | ||
686 | static void laptop_timer_fn(unsigned long unused); | ||
687 | |||
688 | static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); | ||
689 | |||
690 | /* | 686 | /* |
691 | * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs | 687 | * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs |
692 | */ | 688 | */ |
@@ -694,24 +690,24 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write, | |||
694 | void __user *buffer, size_t *length, loff_t *ppos) | 690 | void __user *buffer, size_t *length, loff_t *ppos) |
695 | { | 691 | { |
696 | proc_dointvec(table, write, buffer, length, ppos); | 692 | proc_dointvec(table, write, buffer, length, ppos); |
693 | bdi_arm_supers_timer(); | ||
697 | return 0; | 694 | return 0; |
698 | } | 695 | } |
699 | 696 | ||
700 | static void do_laptop_sync(struct work_struct *work) | 697 | #ifdef CONFIG_BLOCK |
698 | void laptop_mode_timer_fn(unsigned long data) | ||
701 | { | 699 | { |
702 | wakeup_flusher_threads(0); | 700 | struct request_queue *q = (struct request_queue *)data; |
703 | kfree(work); | 701 | int nr_pages = global_page_state(NR_FILE_DIRTY) + |
704 | } | 702 | global_page_state(NR_UNSTABLE_NFS); |
705 | 703 | ||
706 | static void laptop_timer_fn(unsigned long unused) | 704 | /* |
707 | { | 705 | * We want to write everything out, not just down to the dirty |
708 | struct work_struct *work; | 706 | * threshold |
707 | */ | ||
709 | 708 | ||
710 | work = kmalloc(sizeof(*work), GFP_ATOMIC); | 709 | if (bdi_has_dirty_io(&q->backing_dev_info)) |
711 | if (work) { | 710 | bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages, 0); |
712 | INIT_WORK(work, do_laptop_sync); | ||
713 | schedule_work(work); | ||
714 | } | ||
715 | } | 711 | } |
716 | 712 | ||
717 | /* | 713 | /* |
@@ -719,9 +715,9 @@ static void laptop_timer_fn(unsigned long unused) | |||
719 | * of all dirty data a few seconds from now. If the flush is already scheduled | 715 | * of all dirty data a few seconds from now. If the flush is already scheduled |
720 | * then push it back - the user is still using the disk. | 716 | * then push it back - the user is still using the disk. |
721 | */ | 717 | */ |
722 | void laptop_io_completion(void) | 718 | void laptop_io_completion(struct backing_dev_info *info) |
723 | { | 719 | { |
724 | mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode); | 720 | mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode); |
725 | } | 721 | } |
726 | 722 | ||
727 | /* | 723 | /* |
@@ -731,8 +727,16 @@ void laptop_io_completion(void) | |||
731 | */ | 727 | */ |
732 | void laptop_sync_completion(void) | 728 | void laptop_sync_completion(void) |
733 | { | 729 | { |
734 | del_timer(&laptop_mode_wb_timer); | 730 | struct backing_dev_info *bdi; |
731 | |||
732 | rcu_read_lock(); | ||
733 | |||
734 | list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) | ||
735 | del_timer(&bdi->laptop_mode_wb_timer); | ||
736 | |||
737 | rcu_read_unlock(); | ||
735 | } | 738 | } |
739 | #endif | ||
736 | 740 | ||
737 | /* | 741 | /* |
738 | * If ratelimit_pages is too high then we can get into dirty-data overload | 742 | * If ratelimit_pages is too high then we can get into dirty-data overload |
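Laptop mode's flush timer becomes per-device: instead of one global laptop_mode_wb_timer, each backing_dev_info carries its own, armed by laptop_io_completion() against the queue the finished IO belongs to and cleared for all devices by laptop_sync_completion(). The timer initialization itself is not in this hunk; a hedged sketch of how the block layer is expected to set it up and use it:

	/* Sketch only; where exactly the init happens is assumed, not
	 * shown in this hunk. */
	static void example_init_laptop_timer(struct request_queue *q)
	{
		setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
			    laptop_mode_timer_fn, (unsigned long)q);
	}

	static void example_io_completed(struct request_queue *q)
	{
		if (laptop_mode)
			laptop_io_completion(&q->backing_dev_info);
	}
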
diff --git a/mm/swapfile.c b/mm/swapfile.c index 6cd0a8f90dc7..eb086e0f4dcc 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -139,7 +139,8 @@ static int discard_swap(struct swap_info_struct *si) | |||
139 | nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); | 139 | nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); |
140 | if (nr_blocks) { | 140 | if (nr_blocks) { |
141 | err = blkdev_issue_discard(si->bdev, start_block, | 141 | err = blkdev_issue_discard(si->bdev, start_block, |
142 | nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); | 142 | nr_blocks, GFP_KERNEL, |
143 | BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); | ||
143 | if (err) | 144 | if (err) |
144 | return err; | 145 | return err; |
145 | cond_resched(); | 146 | cond_resched(); |
@@ -150,7 +151,8 @@ static int discard_swap(struct swap_info_struct *si) | |||
150 | nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); | 151 | nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); |
151 | 152 | ||
152 | err = blkdev_issue_discard(si->bdev, start_block, | 153 | err = blkdev_issue_discard(si->bdev, start_block, |
153 | nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); | 154 | nr_blocks, GFP_KERNEL, |
155 | BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); | ||
154 | if (err) | 156 | if (err) |
155 | break; | 157 | break; |
156 | 158 | ||
@@ -189,7 +191,8 @@ static void discard_swap_cluster(struct swap_info_struct *si, | |||
189 | start_block <<= PAGE_SHIFT - 9; | 191 | start_block <<= PAGE_SHIFT - 9; |
190 | nr_blocks <<= PAGE_SHIFT - 9; | 192 | nr_blocks <<= PAGE_SHIFT - 9; |
191 | if (blkdev_issue_discard(si->bdev, start_block, | 193 | if (blkdev_issue_discard(si->bdev, start_block, |
192 | nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER)) | 194 | nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT | |
195 | BLKDEV_IFL_BARRIER)) | ||
193 | break; | 196 | break; |
194 | } | 197 | } |
195 | 198 | ||
diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c543dd252433..66d9c416851e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c | |||
@@ -1406,12 +1406,13 @@ new_page: | |||
1406 | /* | 1406 | /* |
1407 | * Fill page/offset/length into spd, if it can hold more pages. | 1407 | * Fill page/offset/length into spd, if it can hold more pages. |
1408 | */ | 1408 | */ |
1409 | static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page, | 1409 | static inline int spd_fill_page(struct splice_pipe_desc *spd, |
1410 | struct pipe_inode_info *pipe, struct page *page, | ||
1410 | unsigned int *len, unsigned int offset, | 1411 | unsigned int *len, unsigned int offset, |
1411 | struct sk_buff *skb, int linear, | 1412 | struct sk_buff *skb, int linear, |
1412 | struct sock *sk) | 1413 | struct sock *sk) |
1413 | { | 1414 | { |
1414 | if (unlikely(spd->nr_pages == PIPE_BUFFERS)) | 1415 | if (unlikely(spd->nr_pages == pipe->buffers)) |
1415 | return 1; | 1416 | return 1; |
1416 | 1417 | ||
1417 | if (linear) { | 1418 | if (linear) { |
@@ -1447,7 +1448,8 @@ static inline int __splice_segment(struct page *page, unsigned int poff, | |||
1447 | unsigned int plen, unsigned int *off, | 1448 | unsigned int plen, unsigned int *off, |
1448 | unsigned int *len, struct sk_buff *skb, | 1449 | unsigned int *len, struct sk_buff *skb, |
1449 | struct splice_pipe_desc *spd, int linear, | 1450 | struct splice_pipe_desc *spd, int linear, |
1450 | struct sock *sk) | 1451 | struct sock *sk, |
1452 | struct pipe_inode_info *pipe) | ||
1451 | { | 1453 | { |
1452 | if (!*len) | 1454 | if (!*len) |
1453 | return 1; | 1455 | return 1; |
@@ -1470,7 +1472,7 @@ static inline int __splice_segment(struct page *page, unsigned int poff, | |||
1470 | /* the linear region may spread across several pages */ | 1472 | /* the linear region may spread across several pages */ |
1471 | flen = min_t(unsigned int, flen, PAGE_SIZE - poff); | 1473 | flen = min_t(unsigned int, flen, PAGE_SIZE - poff); |
1472 | 1474 | ||
1473 | if (spd_fill_page(spd, page, &flen, poff, skb, linear, sk)) | 1475 | if (spd_fill_page(spd, pipe, page, &flen, poff, skb, linear, sk)) |
1474 | return 1; | 1476 | return 1; |
1475 | 1477 | ||
1476 | __segment_seek(&page, &poff, &plen, flen); | 1478 | __segment_seek(&page, &poff, &plen, flen); |
@@ -1485,9 +1487,9 @@ static inline int __splice_segment(struct page *page, unsigned int poff, | |||
1485 | * Map linear and fragment data from the skb to spd. It reports failure if the | 1487 | * Map linear and fragment data from the skb to spd. It reports failure if the |
1486 | * pipe is full or if we already spliced the requested length. | 1488 | * pipe is full or if we already spliced the requested length. |
1487 | */ | 1489 | */ |
1488 | static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, | 1490 | static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, |
1489 | unsigned int *len, struct splice_pipe_desc *spd, | 1491 | unsigned int *offset, unsigned int *len, |
1490 | struct sock *sk) | 1492 | struct splice_pipe_desc *spd, struct sock *sk) |
1491 | { | 1493 | { |
1492 | int seg; | 1494 | int seg; |
1493 | 1495 | ||
@@ -1497,7 +1499,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, | |||
1497 | if (__splice_segment(virt_to_page(skb->data), | 1499 | if (__splice_segment(virt_to_page(skb->data), |
1498 | (unsigned long) skb->data & (PAGE_SIZE - 1), | 1500 | (unsigned long) skb->data & (PAGE_SIZE - 1), |
1499 | skb_headlen(skb), | 1501 | skb_headlen(skb), |
1500 | offset, len, skb, spd, 1, sk)) | 1502 | offset, len, skb, spd, 1, sk, pipe)) |
1501 | return 1; | 1503 | return 1; |
1502 | 1504 | ||
1503 | /* | 1505 | /* |
@@ -1507,7 +1509,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, | |||
1507 | const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; | 1509 | const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; |
1508 | 1510 | ||
1509 | if (__splice_segment(f->page, f->page_offset, f->size, | 1511 | if (__splice_segment(f->page, f->page_offset, f->size, |
1510 | offset, len, skb, spd, 0, sk)) | 1512 | offset, len, skb, spd, 0, sk, pipe)) |
1511 | return 1; | 1513 | return 1; |
1512 | } | 1514 | } |
1513 | 1515 | ||
@@ -1524,8 +1526,8 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, | |||
1524 | struct pipe_inode_info *pipe, unsigned int tlen, | 1526 | struct pipe_inode_info *pipe, unsigned int tlen, |
1525 | unsigned int flags) | 1527 | unsigned int flags) |
1526 | { | 1528 | { |
1527 | struct partial_page partial[PIPE_BUFFERS]; | 1529 | struct partial_page partial[PIPE_DEF_BUFFERS]; |
1528 | struct page *pages[PIPE_BUFFERS]; | 1530 | struct page *pages[PIPE_DEF_BUFFERS]; |
1529 | struct splice_pipe_desc spd = { | 1531 | struct splice_pipe_desc spd = { |
1530 | .pages = pages, | 1532 | .pages = pages, |
1531 | .partial = partial, | 1533 | .partial = partial, |
@@ -1535,12 +1537,16 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, | |||
1535 | }; | 1537 | }; |
1536 | struct sk_buff *frag_iter; | 1538 | struct sk_buff *frag_iter; |
1537 | struct sock *sk = skb->sk; | 1539 | struct sock *sk = skb->sk; |
1540 | int ret = 0; | ||
1541 | |||
1542 | if (splice_grow_spd(pipe, &spd)) | ||
1543 | return -ENOMEM; | ||
1538 | 1544 | ||
1539 | /* | 1545 | /* |
1540 | * __skb_splice_bits() only fails if the output has no room left, | 1546 | * __skb_splice_bits() only fails if the output has no room left, |
1541 | * so no point in going over the frag_list for the error case. | 1547 | * so no point in going over the frag_list for the error case. |
1542 | */ | 1548 | */ |
1543 | if (__skb_splice_bits(skb, &offset, &tlen, &spd, sk)) | 1549 | if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk)) |
1544 | goto done; | 1550 | goto done; |
1545 | else if (!tlen) | 1551 | else if (!tlen) |
1546 | goto done; | 1552 | goto done; |
@@ -1551,14 +1557,12 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, | |||
1551 | skb_walk_frags(skb, frag_iter) { | 1557 | skb_walk_frags(skb, frag_iter) { |
1552 | if (!tlen) | 1558 | if (!tlen) |
1553 | break; | 1559 | break; |
1554 | if (__skb_splice_bits(frag_iter, &offset, &tlen, &spd, sk)) | 1560 | if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk)) |
1555 | break; | 1561 | break; |
1556 | } | 1562 | } |
1557 | 1563 | ||
1558 | done: | 1564 | done: |
1559 | if (spd.nr_pages) { | 1565 | if (spd.nr_pages) { |
1560 | int ret; | ||
1561 | |||
1562 | /* | 1566 | /* |
1563 | * Drop the socket lock, otherwise we have reverse | 1567 | * Drop the socket lock, otherwise we have reverse |
1564 | * locking dependencies between sk_lock and i_mutex | 1568 | * locking dependencies between sk_lock and i_mutex |
@@ -1571,10 +1575,10 @@ done: | |||
1571 | release_sock(sk); | 1575 | release_sock(sk); |
1572 | ret = splice_to_pipe(pipe, &spd); | 1576 | ret = splice_to_pipe(pipe, &spd); |
1573 | lock_sock(sk); | 1577 | lock_sock(sk); |
1574 | return ret; | ||
1575 | } | 1578 | } |
1576 | 1579 | ||
1577 | return 0; | 1580 | splice_shrink_spd(pipe, &spd); |
1581 | return ret; | ||
1578 | } | 1582 | } |
1579 | 1583 | ||
1580 | /** | 1584 | /** |