aboutsummaryrefslogtreecommitdiffstats
path: root/block
diff options
context:
space:
mode:
authorJerome Marchand <jmarchan@redhat.com>2011-01-05 10:57:38 -0500
committerJens Axboe <jaxboe@fusionio.com>2011-01-05 10:57:38 -0500
commit09e099d4bafea3b15be003d548bdf94b4b6e0e17 (patch)
treea4199338ad73e88c0863bbfc6604c4972055f16d /block
parente4a683c899cd5a49f8d684a054c95bd115a0c005 (diff)
block: fix accounting bug on cross partition merges
/proc/diskstats would display a strange output as follows. $ cat /proc/diskstats |grep sda 8 0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089 8 1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691 ~~~~~~~~~~ 8 2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390 8 3 sda3 54 487 2188 92 0 0 0 0 0 88 92 8 4 sda4 4 0 8 0 0 0 0 0 0 0 0 8 5 sda5 81 2027 2130 138 0 0 0 0 0 87 137 Its reason is the wrong way of accounting hd_struct->in_flight. When a bio is merged into a request belongs to different partition by ELEVATOR_FRONT_MERGE. The detailed root cause is as follows. Assuming that there are two partition, sda1 and sda2. 1. A request for sda2 is in request_queue. Hence sda1's hd_struct->in_flight is 0 and sda2's one is 1. | hd_struct->in_flight --------------------------- sda1 | 0 sda2 | 1 --------------------------- 2. A bio belongs to sda1 is issued and is merged into the request mentioned on step1 by ELEVATOR_BACK_MERGE. The first sector of the request is changed from sda2 region to sda1 region. However the two partition's hd_struct->in_flight are not changed. | hd_struct->in_flight --------------------------- sda1 | 0 sda2 | 1 --------------------------- 3. The request is finished and blk_account_io_done() is called. In this case, sda2's hd_struct->in_flight, not a sda1's one, is decremented. | hd_struct->in_flight --------------------------- sda1 | -1 sda2 | 1 --------------------------- The patch fixes the problem by caching the partition lookup inside the request structure, hence making sure that the increment and decrement will always happen on the same partition struct. This also speeds up IO with accounting enabled, since it cuts down on the number of lookups we have to do. Also add a refcount to struct hd_struct to keep the partition in memory as long as users exist. We use kref_test_and_get() to ensure we don't add a reference to a partition which is going away. Signed-off-by: Jerome Marchand <jmarchan@redhat.com> Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Cc: stable@kernel.org Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
Diffstat (limited to 'block')
-rw-r--r--block/blk-core.c26
-rw-r--r--block/blk-merge.c3
-rw-r--r--block/genhd.c1
3 files changed, 24 insertions, 6 deletions
diff --git a/block/blk-core.c b/block/blk-core.c
index 3689319a5974..500c080a6a6b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -64,13 +64,27 @@ static void drive_stat_acct(struct request *rq, int new_io)
64 return; 64 return;
65 65
66 cpu = part_stat_lock(); 66 cpu = part_stat_lock();
67 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
68 67
69 if (!new_io) 68 if (!new_io) {
69 part = rq->part;
70 part_stat_inc(cpu, part, merges[rw]); 70 part_stat_inc(cpu, part, merges[rw]);
71 else { 71 } else {
72 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
73 if (!kref_test_and_get(&part->ref)) {
74 /*
75 * The partition is already being removed,
76 * the request will be accounted on the disk only
77 *
78 * We take a reference on disk->part0 although that
79 * partition will never be deleted, so we can treat
80 * it as any other partition.
81 */
82 part = &rq->rq_disk->part0;
83 kref_get(&part->ref);
84 }
72 part_round_stats(cpu, part); 85 part_round_stats(cpu, part);
73 part_inc_in_flight(part, rw); 86 part_inc_in_flight(part, rw);
87 rq->part = part;
74 } 88 }
75 89
76 part_stat_unlock(); 90 part_stat_unlock();
@@ -128,6 +142,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
128 rq->ref_count = 1; 142 rq->ref_count = 1;
129 rq->start_time = jiffies; 143 rq->start_time = jiffies;
130 set_start_time_ns(rq); 144 set_start_time_ns(rq);
145 rq->part = NULL;
131} 146}
132EXPORT_SYMBOL(blk_rq_init); 147EXPORT_SYMBOL(blk_rq_init);
133 148
@@ -1776,7 +1791,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
1776 int cpu; 1791 int cpu;
1777 1792
1778 cpu = part_stat_lock(); 1793 cpu = part_stat_lock();
1779 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); 1794 part = req->part;
1780 part_stat_add(cpu, part, sectors[rw], bytes >> 9); 1795 part_stat_add(cpu, part, sectors[rw], bytes >> 9);
1781 part_stat_unlock(); 1796 part_stat_unlock();
1782 } 1797 }
@@ -1796,13 +1811,14 @@ static void blk_account_io_done(struct request *req)
1796 int cpu; 1811 int cpu;
1797 1812
1798 cpu = part_stat_lock(); 1813 cpu = part_stat_lock();
1799 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); 1814 part = req->part;
1800 1815
1801 part_stat_inc(cpu, part, ios[rw]); 1816 part_stat_inc(cpu, part, ios[rw]);
1802 part_stat_add(cpu, part, ticks[rw], duration); 1817 part_stat_add(cpu, part, ticks[rw], duration);
1803 part_round_stats(cpu, part); 1818 part_round_stats(cpu, part);
1804 part_dec_in_flight(part, rw); 1819 part_dec_in_flight(part, rw);
1805 1820
1821 kref_put(&part->ref, __delete_partition);
1806 part_stat_unlock(); 1822 part_stat_unlock();
1807 } 1823 }
1808} 1824}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 77b7c26df6b5..b06b83b89d89 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -351,11 +351,12 @@ static void blk_account_io_merge(struct request *req)
351 int cpu; 351 int cpu;
352 352
353 cpu = part_stat_lock(); 353 cpu = part_stat_lock();
354 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); 354 part = req->part;
355 355
356 part_round_stats(cpu, part); 356 part_round_stats(cpu, part);
357 part_dec_in_flight(part, rq_data_dir(req)); 357 part_dec_in_flight(part, rq_data_dir(req));
358 358
359 kref_put(&part->ref, __delete_partition);
359 part_stat_unlock(); 360 part_stat_unlock();
360 } 361 }
361} 362}
diff --git a/block/genhd.c b/block/genhd.c
index 16ccc0d2d5d3..85c150598830 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1192,6 +1192,7 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
1192 return NULL; 1192 return NULL;
1193 } 1193 }
1194 disk->part_tbl->part[0] = &disk->part0; 1194 disk->part_tbl->part[0] = &disk->part0;
1195 kref_init(&disk->part0.ref);
1195 1196
1196 disk->minors = minors; 1197 disk->minors = minors;
1197 rand_initialize_disk(disk); 1198 rand_initialize_disk(disk);