aboutsummaryrefslogtreecommitdiffstats
path: root/fs/partitions
diff options
context:
space:
mode:
authorJerome Marchand <jmarchan@redhat.com>2011-01-05 10:57:38 -0500
committerJens Axboe <jaxboe@fusionio.com>2011-01-05 10:57:38 -0500
commit09e099d4bafea3b15be003d548bdf94b4b6e0e17 (patch)
treea4199338ad73e88c0863bbfc6604c4972055f16d /fs/partitions
parente4a683c899cd5a49f8d684a054c95bd115a0c005 (diff)
block: fix accounting bug on cross partition merges
/proc/diskstats could display strange output such as the following:

  $ cat /proc/diskstats | grep sda
  8 0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089
  8 1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691
                                         ~~~~~~~~~~
  8 2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390
  8 3 sda3 54 487 2188 92 0 0 0 0 0 88 92
  8 4 sda4 4 0 8 0 0 0 0 0 0 0 0
  8 5 sda5 81 2027 2130 138 0 0 0 0 0 87 137

(The underlined value is the in-flight count of sda1: 4294967064 is -232 printed as an unsigned 32-bit number.)

The cause is incorrect accounting of hd_struct->in_flight when a bio is merged, by ELEVATOR_FRONT_MERGE, into a request that belongs to a different partition.

The detailed root cause is as follows. Assume that there are two partitions, sda1 and sda2.

1. A request for sda2 is in the request_queue. Hence sda1's hd_struct->in_flight is 0 and sda2's is 1.

          | hd_struct->in_flight
     ---------------------------
     sda1 |          0
     sda2 |          1
     ---------------------------

2. A bio that belongs to sda1 is issued and is merged into the request mentioned in step 1 by ELEVATOR_FRONT_MERGE. The first sector of the request changes from the sda2 region to the sda1 region. However, the two partitions' hd_struct->in_flight counters are not changed.

          | hd_struct->in_flight
     ---------------------------
     sda1 |          0
     sda2 |          1
     ---------------------------

3. The request finishes and blk_account_io_done() is called. In this case, sda1's hd_struct->in_flight, not sda2's, is decremented.

          | hd_struct->in_flight
     ---------------------------
     sda1 |         -1
     sda2 |          1
     ---------------------------

The patch fixes the problem by caching the partition lookup inside the request structure, hence making sure that the increment and decrement will always happen on the same partition struct. This also speeds up IO with accounting enabled, since it cuts down on the number of lookups we have to do.

Also add a refcount to struct hd_struct to keep the partition in memory as long as users exist. We use kref_test_and_get() to ensure we don't add a reference to a partition which is going away.
Signed-off-by: Jerome Marchand <jmarchan@redhat.com> Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Cc: stable@kernel.org Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
Diffstat (limited to 'fs/partitions')
-rw-r--r--fs/partitions/check.c10
1 files changed, 9 insertions, 1 deletions
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index bdf8d3cc95a4..48209f58522b 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -381,6 +381,13 @@ static void delete_partition_rcu_cb(struct rcu_head *head)
381 put_device(part_to_dev(part)); 381 put_device(part_to_dev(part));
382} 382}
383 383
384void __delete_partition(struct kref *ref)
385{
386 struct hd_struct *part = container_of(ref, struct hd_struct, ref);
387
388 call_rcu(&part->rcu_head, delete_partition_rcu_cb);
389}
390
384void delete_partition(struct gendisk *disk, int partno) 391void delete_partition(struct gendisk *disk, int partno)
385{ 392{
386 struct disk_part_tbl *ptbl = disk->part_tbl; 393 struct disk_part_tbl *ptbl = disk->part_tbl;
@@ -399,7 +406,7 @@ void delete_partition(struct gendisk *disk, int partno)
399 kobject_put(part->holder_dir); 406 kobject_put(part->holder_dir);
400 device_del(part_to_dev(part)); 407 device_del(part_to_dev(part));
401 408
402 call_rcu(&part->rcu_head, delete_partition_rcu_cb); 409 kref_put(&part->ref, __delete_partition);
403} 410}
404 411
405static ssize_t whole_disk_show(struct device *dev, 412static ssize_t whole_disk_show(struct device *dev,
@@ -498,6 +505,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
498 if (!dev_get_uevent_suppress(ddev)) 505 if (!dev_get_uevent_suppress(ddev))
499 kobject_uevent(&pdev->kobj, KOBJ_ADD); 506 kobject_uevent(&pdev->kobj, KOBJ_ADD);
500 507
508 kref_init(&p->ref);
501 return p; 509 return p;
502 510
503out_free_info: 511out_free_info: