diff options
author | Jerome Marchand <jmarchan@redhat.com> | 2011-01-05 10:57:38 -0500 |
---|---|---|
committer | Jens Axboe <jaxboe@fusionio.com> | 2011-01-05 10:57:38 -0500 |
commit | 09e099d4bafea3b15be003d548bdf94b4b6e0e17 (patch) | |
tree | a4199338ad73e88c0863bbfc6604c4972055f16d /block | |
parent | e4a683c899cd5a49f8d684a054c95bd115a0c005 (diff) |
block: fix accounting bug on cross partition merges
/proc/diskstats would display a strange output as follows.
$ cat /proc/diskstats |grep sda
8 0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089
8 1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691
~~~~~~~~~~
8 2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390
8 3 sda3 54 487 2188 92 0 0 0 0 0 88 92
8 4 sda4 4 0 8 0 0 0 0 0 0 0 0
8 5 sda5 81 2027 2130 138 0 0 0 0 0 87 137
Its reason is the wrong way of accounting hd_struct->in_flight. When a bio is
merged into a request belongs to different partition by ELEVATOR_FRONT_MERGE.
The detailed root cause is as follows.
Assuming that there are two partition, sda1 and sda2.
1. A request for sda2 is in request_queue. Hence sda1's hd_struct->in_flight
is 0 and sda2's one is 1.
| hd_struct->in_flight
---------------------------
sda1 | 0
sda2 | 1
---------------------------
2. A bio belongs to sda1 is issued and is merged into the request mentioned on
step1 by ELEVATOR_BACK_MERGE. The first sector of the request is changed
from sda2 region to sda1 region. However the two partition's
hd_struct->in_flight are not changed.
| hd_struct->in_flight
---------------------------
sda1 | 0
sda2 | 1
---------------------------
3. The request is finished and blk_account_io_done() is called. In this case,
sda2's hd_struct->in_flight, not a sda1's one, is decremented.
| hd_struct->in_flight
---------------------------
sda1 | -1
sda2 | 1
---------------------------
The patch fixes the problem by caching the partition lookup
inside the request structure, hence making sure that the increment
and decrement will always happen on the same partition struct. This
also speeds up IO with accounting enabled, since it cuts down on
the number of lookups we have to do.
Also add a refcount to struct hd_struct to keep the partition in
memory as long as users exist. We use kref_test_and_get() to ensure
we don't add a reference to a partition which is going away.
Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
Diffstat (limited to 'block')
-rw-r--r-- | block/blk-core.c | 26 | ||||
-rw-r--r-- | block/blk-merge.c | 3 | ||||
-rw-r--r-- | block/genhd.c | 1 |
3 files changed, 24 insertions, 6 deletions
diff --git a/block/blk-core.c b/block/blk-core.c index 3689319a597..500c080a6a6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c | |||
@@ -64,13 +64,27 @@ static void drive_stat_acct(struct request *rq, int new_io) | |||
64 | return; | 64 | return; |
65 | 65 | ||
66 | cpu = part_stat_lock(); | 66 | cpu = part_stat_lock(); |
67 | part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); | ||
68 | 67 | ||
69 | if (!new_io) | 68 | if (!new_io) { |
69 | part = rq->part; | ||
70 | part_stat_inc(cpu, part, merges[rw]); | 70 | part_stat_inc(cpu, part, merges[rw]); |
71 | else { | 71 | } else { |
72 | part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); | ||
73 | if (!kref_test_and_get(&part->ref)) { | ||
74 | /* | ||
75 | * The partition is already being removed, | ||
76 | * the request will be accounted on the disk only | ||
77 | * | ||
78 | * We take a reference on disk->part0 although that | ||
79 | * partition will never be deleted, so we can treat | ||
80 | * it as any other partition. | ||
81 | */ | ||
82 | part = &rq->rq_disk->part0; | ||
83 | kref_get(&part->ref); | ||
84 | } | ||
72 | part_round_stats(cpu, part); | 85 | part_round_stats(cpu, part); |
73 | part_inc_in_flight(part, rw); | 86 | part_inc_in_flight(part, rw); |
87 | rq->part = part; | ||
74 | } | 88 | } |
75 | 89 | ||
76 | part_stat_unlock(); | 90 | part_stat_unlock(); |
@@ -128,6 +142,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) | |||
128 | rq->ref_count = 1; | 142 | rq->ref_count = 1; |
129 | rq->start_time = jiffies; | 143 | rq->start_time = jiffies; |
130 | set_start_time_ns(rq); | 144 | set_start_time_ns(rq); |
145 | rq->part = NULL; | ||
131 | } | 146 | } |
132 | EXPORT_SYMBOL(blk_rq_init); | 147 | EXPORT_SYMBOL(blk_rq_init); |
133 | 148 | ||
@@ -1776,7 +1791,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) | |||
1776 | int cpu; | 1791 | int cpu; |
1777 | 1792 | ||
1778 | cpu = part_stat_lock(); | 1793 | cpu = part_stat_lock(); |
1779 | part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); | 1794 | part = req->part; |
1780 | part_stat_add(cpu, part, sectors[rw], bytes >> 9); | 1795 | part_stat_add(cpu, part, sectors[rw], bytes >> 9); |
1781 | part_stat_unlock(); | 1796 | part_stat_unlock(); |
1782 | } | 1797 | } |
@@ -1796,13 +1811,14 @@ static void blk_account_io_done(struct request *req) | |||
1796 | int cpu; | 1811 | int cpu; |
1797 | 1812 | ||
1798 | cpu = part_stat_lock(); | 1813 | cpu = part_stat_lock(); |
1799 | part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); | 1814 | part = req->part; |
1800 | 1815 | ||
1801 | part_stat_inc(cpu, part, ios[rw]); | 1816 | part_stat_inc(cpu, part, ios[rw]); |
1802 | part_stat_add(cpu, part, ticks[rw], duration); | 1817 | part_stat_add(cpu, part, ticks[rw], duration); |
1803 | part_round_stats(cpu, part); | 1818 | part_round_stats(cpu, part); |
1804 | part_dec_in_flight(part, rw); | 1819 | part_dec_in_flight(part, rw); |
1805 | 1820 | ||
1821 | kref_put(&part->ref, __delete_partition); | ||
1806 | part_stat_unlock(); | 1822 | part_stat_unlock(); |
1807 | } | 1823 | } |
1808 | } | 1824 | } |
diff --git a/block/blk-merge.c b/block/blk-merge.c index 77b7c26df6b..b06b83b89d8 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c | |||
@@ -351,11 +351,12 @@ static void blk_account_io_merge(struct request *req) | |||
351 | int cpu; | 351 | int cpu; |
352 | 352 | ||
353 | cpu = part_stat_lock(); | 353 | cpu = part_stat_lock(); |
354 | part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); | 354 | part = req->part; |
355 | 355 | ||
356 | part_round_stats(cpu, part); | 356 | part_round_stats(cpu, part); |
357 | part_dec_in_flight(part, rq_data_dir(req)); | 357 | part_dec_in_flight(part, rq_data_dir(req)); |
358 | 358 | ||
359 | kref_put(&part->ref, __delete_partition); | ||
359 | part_stat_unlock(); | 360 | part_stat_unlock(); |
360 | } | 361 | } |
361 | } | 362 | } |
diff --git a/block/genhd.c b/block/genhd.c index 16ccc0d2d5d..85c15059883 100644 --- a/block/genhd.c +++ b/block/genhd.c | |||
@@ -1192,6 +1192,7 @@ struct gendisk *alloc_disk_node(int minors, int node_id) | |||
1192 | return NULL; | 1192 | return NULL; |
1193 | } | 1193 | } |
1194 | disk->part_tbl->part[0] = &disk->part0; | 1194 | disk->part_tbl->part[0] = &disk->part0; |
1195 | kref_init(&disk->part0.ref); | ||
1195 | 1196 | ||
1196 | disk->minors = minors; | 1197 | disk->minors = minors; |
1197 | rand_initialize_disk(disk); | 1198 | rand_initialize_disk(disk); |