diff options
| author | Martin K. Petersen <martin.petersen@oracle.com> | 2009-05-22 17:17:53 -0400 |
|---|---|---|
| committer | Jens Axboe <jens.axboe@oracle.com> | 2009-05-22 17:22:55 -0400 |
| commit | c72758f33784e5e2a1a4bb9421ef3e6de8f9fcf3 (patch) | |
| tree | a83f7540cc894caafe74db911cba3998d6a9a164 /block | |
| parent | cd43e26f071524647e660706b784ebcbefbd2e44 (diff) | |
block: Export I/O topology for block devices and partitions
To support devices with physical block sizes bigger than 512 bytes we
need to ensure proper alignment. This patch adds support for exposing
I/O topology characteristics as devices are stacked.
logical_block_size is the smallest unit the device can address.
physical_block_size indicates the smallest I/O the device can write
without incurring a read-modify-write penalty.
The io_min parameter is the smallest preferred I/O size reported by
the device. In many cases this is the same as the physical block
size. However, the io_min parameter can be scaled up when stacking
(RAID5 chunk size > physical block size).
The io_opt characteristic indicates the optimal I/O size reported by
the device. This is usually the stripe width for arrays.
The alignment_offset parameter indicates the number of bytes the start
of the device/partition is offset from the device's natural alignment.
Partition tools and MD/DM utilities can use this to pad their offsets
so filesystems start on proper boundaries.
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Diffstat (limited to 'block')
| -rw-r--r-- | block/blk-settings.c | 186 | ||||
| -rw-r--r-- | block/blk-sysfs.c | 33 | ||||
| -rw-r--r-- | block/genhd.c | 11 |
3 files changed, 230 insertions, 0 deletions
diff --git a/block/blk-settings.c b/block/blk-settings.c index b0f547cecfb8..5649f34adb40 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c | |||
| @@ -309,9 +309,94 @@ EXPORT_SYMBOL(blk_queue_max_segment_size); | |||
| 309 | void blk_queue_logical_block_size(struct request_queue *q, unsigned short size) | 309 | void blk_queue_logical_block_size(struct request_queue *q, unsigned short size) |
| 310 | { | 310 | { |
| 311 | q->limits.logical_block_size = size; | 311 | q->limits.logical_block_size = size; |
| 312 | |||
| 313 | if (q->limits.physical_block_size < size) | ||
| 314 | q->limits.physical_block_size = size; | ||
| 315 | |||
| 316 | if (q->limits.io_min < q->limits.physical_block_size) | ||
| 317 | q->limits.io_min = q->limits.physical_block_size; | ||
| 312 | } | 318 | } |
| 313 | EXPORT_SYMBOL(blk_queue_logical_block_size); | 319 | EXPORT_SYMBOL(blk_queue_logical_block_size); |
| 314 | 320 | ||
| 321 | /** | ||
| 322 | * blk_queue_physical_block_size - set physical block size for the queue | ||
| 323 | * @q: the request queue for the device | ||
| 324 | * @size: the physical block size, in bytes | ||
| 325 | * | ||
| 326 | * Description: | ||
| 327 | * This should be set to the lowest possible sector size that the | ||
| 328 | * hardware can operate on without reverting to read-modify-write | ||
| 329 | * operations. | ||
| 330 | */ | ||
| 331 | void blk_queue_physical_block_size(struct request_queue *q, unsigned short size) | ||
| 332 | { | ||
| 333 | q->limits.physical_block_size = size; | ||
| 334 | |||
| 335 | if (q->limits.physical_block_size < q->limits.logical_block_size) | ||
| 336 | q->limits.physical_block_size = q->limits.logical_block_size; | ||
| 337 | |||
| 338 | if (q->limits.io_min < q->limits.physical_block_size) | ||
| 339 | q->limits.io_min = q->limits.physical_block_size; | ||
| 340 | } | ||
| 341 | EXPORT_SYMBOL(blk_queue_physical_block_size); | ||
| 342 | |||
| 343 | /** | ||
| 344 | * blk_queue_alignment_offset - set physical block alignment offset | ||
| 345 | * @q: the request queue for the device | ||
| 346 | * @offset: alignment offset in bytes | ||
| 347 | * | ||
| 348 | * Description: | ||
| 349 | * Some devices are naturally misaligned to compensate for things like | ||
| 350 | * the legacy DOS partition table 63-sector offset. Low-level drivers | ||
| 351 | * should call this function for devices whose first sector is not | ||
| 352 | * naturally aligned. | ||
| 353 | */ | ||
| 354 | void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset) | ||
| 355 | { | ||
| 356 | q->limits.alignment_offset = | ||
| 357 | offset & (q->limits.physical_block_size - 1); | ||
| 358 | q->limits.misaligned = 0; | ||
| 359 | } | ||
| 360 | EXPORT_SYMBOL(blk_queue_alignment_offset); | ||
| 361 | |||
| 362 | /** | ||
| 363 | * blk_queue_io_min - set minimum request size for the queue | ||
| 364 | * @q: the request queue for the device | ||
| 365 | * @min: smallest I/O size in bytes | ||
| 366 | * | ||
| 367 | * Description: | ||
| 368 | * Some devices have an internal block size bigger than the reported | ||
| 369 | * hardware sector size. This function can be used to signal the | ||
| 370 | * smallest I/O the device can perform without incurring a performance | ||
| 371 | * penalty. | ||
| 372 | */ | ||
| 373 | void blk_queue_io_min(struct request_queue *q, unsigned int min) | ||
| 374 | { | ||
| 375 | q->limits.io_min = min; | ||
| 376 | |||
| 377 | if (q->limits.io_min < q->limits.logical_block_size) | ||
| 378 | q->limits.io_min = q->limits.logical_block_size; | ||
| 379 | |||
| 380 | if (q->limits.io_min < q->limits.physical_block_size) | ||
| 381 | q->limits.io_min = q->limits.physical_block_size; | ||
| 382 | } | ||
| 383 | EXPORT_SYMBOL(blk_queue_io_min); | ||
| 384 | |||
| 385 | /** | ||
| 386 | * blk_queue_io_opt - set optimal request size for the queue | ||
| 387 | * @q: the request queue for the device | ||
| 388 | * @opt: optimal request size in bytes | ||
| 389 | * | ||
| 390 | * Description: | ||
| 391 | * Drivers can call this function to set the preferred I/O request | ||
| 392 | * size for devices that report such a value. | ||
| 393 | */ | ||
| 394 | void blk_queue_io_opt(struct request_queue *q, unsigned int opt) | ||
| 395 | { | ||
| 396 | q->limits.io_opt = opt; | ||
| 397 | } | ||
| 398 | EXPORT_SYMBOL(blk_queue_io_opt); | ||
| 399 | |||
| 315 | /* | 400 | /* |
| 316 | * Returns the minimum that is _not_ zero, unless both are zero. | 401 | * Returns the minimum that is _not_ zero, unless both are zero. |
| 317 | */ | 402 | */ |
| @@ -358,6 +443,107 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) | |||
| 358 | EXPORT_SYMBOL(blk_queue_stack_limits); | 443 | EXPORT_SYMBOL(blk_queue_stack_limits); |
| 359 | 444 | ||
| 360 | /** | 445 | /** |
| 446 | * blk_stack_limits - adjust queue_limits for stacked devices | ||
| 447 | * @t: the stacking driver limits (top) | ||
| 448 | * @b: the underlying queue limits (bottom) | ||
| 449 | * @offset: offset to beginning of data within component device | ||
| 450 | * | ||
| 451 | * Description: | ||
| 452 | * Merges two queue_limit structs. Returns 0 if alignment didn't | ||
| 453 | * change. Returns -1 if adding the bottom device caused | ||
| 454 | * misalignment. | ||
| 455 | */ | ||
| 456 | int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, | ||
| 457 | sector_t offset) | ||
| 458 | { | ||
| 459 | t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); | ||
| 460 | t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); | ||
| 461 | |||
| 462 | t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, | ||
| 463 | b->seg_boundary_mask); | ||
| 464 | |||
| 465 | t->max_phys_segments = min_not_zero(t->max_phys_segments, | ||
| 466 | b->max_phys_segments); | ||
| 467 | |||
| 468 | t->max_hw_segments = min_not_zero(t->max_hw_segments, | ||
| 469 | b->max_hw_segments); | ||
| 470 | |||
| 471 | t->max_segment_size = min_not_zero(t->max_segment_size, | ||
| 472 | b->max_segment_size); | ||
| 473 | |||
| 474 | t->logical_block_size = max(t->logical_block_size, | ||
| 475 | b->logical_block_size); | ||
| 476 | |||
| 477 | t->physical_block_size = max(t->physical_block_size, | ||
| 478 | b->physical_block_size); | ||
| 479 | |||
| 480 | t->io_min = max(t->io_min, b->io_min); | ||
| 481 | t->no_cluster |= b->no_cluster; | ||
| 482 | |||
| 483 | /* Bottom device offset aligned? */ | ||
| 484 | if (offset && | ||
| 485 | (offset & (b->physical_block_size - 1)) != b->alignment_offset) { | ||
| 486 | t->misaligned = 1; | ||
| 487 | return -1; | ||
| 488 | } | ||
| 489 | |||
| 490 | /* If top has no alignment offset, inherit from bottom */ | ||
| 491 | if (!t->alignment_offset) | ||
| 492 | t->alignment_offset = | ||
| 493 | b->alignment_offset & (b->physical_block_size - 1); | ||
| 494 | |||
| 495 | /* Top device aligned on logical block boundary? */ | ||
| 496 | if (t->alignment_offset & (t->logical_block_size - 1)) { | ||
| 497 | t->misaligned = 1; | ||
| 498 | return -1; | ||
| 499 | } | ||
| 500 | |||
| 501 | return 0; | ||
| 502 | } | ||
| 503 | |||
| 504 | /** | ||
| 505 | * disk_stack_limits - adjust queue limits for stacked drivers | ||
| 506 | * @disk: MD/DM gendisk (top) | ||
| 507 | * @bdev: the underlying block device (bottom) | ||
| 508 | * @offset: offset to beginning of data within component device | ||
| 509 | * | ||
| 510 | * Description: | ||
| 511 | * Merges the limits for two queues; prints a warning if | ||
| 512 | * adding the bottom device caused misalignment (the function | ||
| 513 | * itself returns no value). | ||
| 514 | */ | ||
| 515 | void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, | ||
| 516 | sector_t offset) | ||
| 517 | { | ||
| 518 | struct request_queue *t = disk->queue; | ||
| 519 | struct request_queue *b = bdev_get_queue(bdev); | ||
| 520 | |||
| 521 | offset += get_start_sect(bdev) << 9; | ||
| 522 | |||
| 523 | if (blk_stack_limits(&t->limits, &b->limits, offset) < 0) { | ||
| 524 | char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; | ||
| 525 | |||
| 526 | disk_name(disk, 0, top); | ||
| 527 | bdevname(bdev, bottom); | ||
| 528 | |||
| 529 | printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n", | ||
| 530 | top, bottom); | ||
| 531 | } | ||
| 532 | |||
| 533 | if (!t->queue_lock) | ||
| 534 | WARN_ON_ONCE(1); | ||
| 535 | else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) { | ||
| 536 | unsigned long flags; | ||
| 537 | |||
| 538 | spin_lock_irqsave(t->queue_lock, flags); | ||
| 539 | if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) | ||
| 540 | queue_flag_clear(QUEUE_FLAG_CLUSTER, t); | ||
| 541 | spin_unlock_irqrestore(t->queue_lock, flags); | ||
| 542 | } | ||
| 543 | } | ||
| 544 | EXPORT_SYMBOL(disk_stack_limits); | ||
| 545 | |||
| 546 | /** | ||
| 361 | * blk_queue_dma_pad - set pad mask | 547 | * blk_queue_dma_pad - set pad mask |
| 362 | * @q: the request queue for the device | 548 | * @q: the request queue for the device |
| 363 | * @mask: pad mask | 549 | * @mask: pad mask |
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 3ccdadb8e204..9337e17f9110 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c | |||
| @@ -105,6 +105,21 @@ static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page | |||
| 105 | return queue_var_show(queue_logical_block_size(q), page); | 105 | return queue_var_show(queue_logical_block_size(q), page); |
| 106 | } | 106 | } |
| 107 | 107 | ||
| 108 | static ssize_t queue_physical_block_size_show(struct request_queue *q, char *page) | ||
| 109 | { | ||
| 110 | return queue_var_show(queue_physical_block_size(q), page); | ||
| 111 | } | ||
| 112 | |||
| 113 | static ssize_t queue_io_min_show(struct request_queue *q, char *page) | ||
| 114 | { | ||
| 115 | return queue_var_show(queue_io_min(q), page); | ||
| 116 | } | ||
| 117 | |||
| 118 | static ssize_t queue_io_opt_show(struct request_queue *q, char *page) | ||
| 119 | { | ||
| 120 | return queue_var_show(queue_io_opt(q), page); | ||
| 121 | } | ||
| 122 | |||
| 108 | static ssize_t | 123 | static ssize_t |
| 109 | queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) | 124 | queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) |
| 110 | { | 125 | { |
| @@ -257,6 +272,21 @@ static struct queue_sysfs_entry queue_logical_block_size_entry = { | |||
| 257 | .show = queue_logical_block_size_show, | 272 | .show = queue_logical_block_size_show, |
| 258 | }; | 273 | }; |
| 259 | 274 | ||
| 275 | static struct queue_sysfs_entry queue_physical_block_size_entry = { | ||
| 276 | .attr = {.name = "physical_block_size", .mode = S_IRUGO }, | ||
| 277 | .show = queue_physical_block_size_show, | ||
| 278 | }; | ||
| 279 | |||
| 280 | static struct queue_sysfs_entry queue_io_min_entry = { | ||
| 281 | .attr = {.name = "minimum_io_size", .mode = S_IRUGO }, | ||
| 282 | .show = queue_io_min_show, | ||
| 283 | }; | ||
| 284 | |||
| 285 | static struct queue_sysfs_entry queue_io_opt_entry = { | ||
| 286 | .attr = {.name = "optimal_io_size", .mode = S_IRUGO }, | ||
| 287 | .show = queue_io_opt_show, | ||
| 288 | }; | ||
| 289 | |||
| 260 | static struct queue_sysfs_entry queue_nonrot_entry = { | 290 | static struct queue_sysfs_entry queue_nonrot_entry = { |
| 261 | .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, | 291 | .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, |
| 262 | .show = queue_nonrot_show, | 292 | .show = queue_nonrot_show, |
| @@ -289,6 +319,9 @@ static struct attribute *default_attrs[] = { | |||
| 289 | &queue_iosched_entry.attr, | 319 | &queue_iosched_entry.attr, |
| 290 | &queue_hw_sector_size_entry.attr, | 320 | &queue_hw_sector_size_entry.attr, |
| 291 | &queue_logical_block_size_entry.attr, | 321 | &queue_logical_block_size_entry.attr, |
| 322 | &queue_physical_block_size_entry.attr, | ||
| 323 | &queue_io_min_entry.attr, | ||
| 324 | &queue_io_opt_entry.attr, | ||
| 292 | &queue_nonrot_entry.attr, | 325 | &queue_nonrot_entry.attr, |
| 293 | &queue_nomerges_entry.attr, | 326 | &queue_nomerges_entry.attr, |
| 294 | &queue_rq_affinity_entry.attr, | 327 | &queue_rq_affinity_entry.attr, |
diff --git a/block/genhd.c b/block/genhd.c index 1a4916e01732..fe7ccc0a618f 100644 --- a/block/genhd.c +++ b/block/genhd.c | |||
| @@ -852,11 +852,21 @@ static ssize_t disk_capability_show(struct device *dev, | |||
| 852 | return sprintf(buf, "%x\n", disk->flags); | 852 | return sprintf(buf, "%x\n", disk->flags); |
| 853 | } | 853 | } |
| 854 | 854 | ||
| 855 | static ssize_t disk_alignment_offset_show(struct device *dev, | ||
| 856 | struct device_attribute *attr, | ||
| 857 | char *buf) | ||
| 858 | { | ||
| 859 | struct gendisk *disk = dev_to_disk(dev); | ||
| 860 | |||
| 861 | return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue)); | ||
| 862 | } | ||
| 863 | |||
| 855 | static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); | 864 | static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); |
| 856 | static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); | 865 | static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); |
| 857 | static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); | 866 | static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); |
| 858 | static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); | 867 | static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); |
| 859 | static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); | 868 | static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); |
| 869 | static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); | ||
| 860 | static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); | 870 | static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); |
| 861 | static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); | 871 | static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); |
| 862 | #ifdef CONFIG_FAIL_MAKE_REQUEST | 872 | #ifdef CONFIG_FAIL_MAKE_REQUEST |
| @@ -875,6 +885,7 @@ static struct attribute *disk_attrs[] = { | |||
| 875 | &dev_attr_removable.attr, | 885 | &dev_attr_removable.attr, |
| 876 | &dev_attr_ro.attr, | 886 | &dev_attr_ro.attr, |
| 877 | &dev_attr_size.attr, | 887 | &dev_attr_size.attr, |
| 888 | &dev_attr_alignment_offset.attr, | ||
| 878 | &dev_attr_capability.attr, | 889 | &dev_attr_capability.attr, |
| 879 | &dev_attr_stat.attr, | 890 | &dev_attr_stat.attr, |
| 880 | #ifdef CONFIG_FAIL_MAKE_REQUEST | 891 | #ifdef CONFIG_FAIL_MAKE_REQUEST |
