aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> 2010-10-19 03:05:00 -0400
committer: Jens Axboe <jaxboe@fusionio.com> 2010-10-19 03:07:02 -0400
commit: 7681bfeeccff5efa9eb29bf09249a3c400b15327 (patch)
tree: 8557964a2df96e253dcf1a61734b98dbfbf192d6
parent: 495d2b3883682fcd1c3dee3a45e38fd00154ae25 (diff)
block: fix accounting bug on cross partition merges
/proc/diskstats would display strange output such as the following:

$ cat /proc/diskstats |grep sda
   8       0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089
   8       1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691
                                                ~~~~~~~~~~
   8       2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390
   8       3 sda3 54 487 2188 92 0 0 0 0 0 88 92
   8       4 sda4 4 0 8 0 0 0 0 0 0 0 0
   8       5 sda5 81 2027 2130 138 0 0 0 0 0 87 137

The cause is incorrect accounting of hd_struct->in_flight when a bio is merged, by ELEVATOR_FRONT_MERGE, into a request that belongs to a different partition.

The detailed root cause is as follows. Assume that there are two partitions, sda1 and sda2.

1. A request for sda2 is in the request_queue. Hence sda1's hd_struct->in_flight is 0 and sda2's is 1.

        | hd_struct->in_flight
   ---------------------------
   sda1 |          0
   sda2 |          1
   ---------------------------

2. A bio that belongs to sda1 is issued and is merged into the request mentioned in step 1 by ELEVATOR_FRONT_MERGE. The first sector of the request changes from the sda2 region to the sda1 region. However, neither partition's hd_struct->in_flight is changed.

        | hd_struct->in_flight
   ---------------------------
   sda1 |          0
   sda2 |          1
   ---------------------------

3. The request finishes and blk_account_io_done() is called. Because the partition is looked up again from the request's (now changed) first sector, sda1's hd_struct->in_flight, not sda2's, is decremented.

        | hd_struct->in_flight
   ---------------------------
   sda1 |         -1
   sda2 |          1
   ---------------------------

The patch fixes the problem by caching the partition lookup inside the request structure, thereby making sure that the increment and decrement always happen on the same partition struct. This also speeds up IO with accounting enabled, since it cuts down on the number of lookups we have to do.

When reloading partition tables, quiesce IO to ensure that no request references to the partition struct exist. When it is safe to free the partition table, the IO for that device is restarted.
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
-rw-r--r--  block/blk-core.c          24
-rw-r--r--  block/blk-merge.c          2
-rw-r--r--  block/blk.h                4
-rw-r--r--  block/genhd.c             14
-rw-r--r--  fs/partitions/check.c     12
-rw-r--r--  include/linux/blkdev.h     1
-rw-r--r--  include/linux/elevator.h   2
-rw-r--r--  include/linux/genhd.h      1
8 files changed, 47 insertions, 13 deletions
diff --git a/block/blk-core.c b/block/blk-core.c
index 797d5095eb83..ddc68332d655 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -64,13 +64,15 @@ static void drive_stat_acct(struct request *rq, int new_io)
64 return; 64 return;
65 65
66 cpu = part_stat_lock(); 66 cpu = part_stat_lock();
67 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
68 67
69 if (!new_io) 68 if (!new_io) {
69 part = rq->part;
70 part_stat_inc(cpu, part, merges[rw]); 70 part_stat_inc(cpu, part, merges[rw]);
71 else { 71 } else {
72 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
72 part_round_stats(cpu, part); 73 part_round_stats(cpu, part);
73 part_inc_in_flight(part, rw); 74 part_inc_in_flight(part, rw);
75 rq->part = part;
74 } 76 }
75 77
76 part_stat_unlock(); 78 part_stat_unlock();
@@ -128,6 +130,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
128 rq->ref_count = 1; 130 rq->ref_count = 1;
129 rq->start_time = jiffies; 131 rq->start_time = jiffies;
130 set_start_time_ns(rq); 132 set_start_time_ns(rq);
133 rq->part = NULL;
131} 134}
132EXPORT_SYMBOL(blk_rq_init); 135EXPORT_SYMBOL(blk_rq_init);
133 136
@@ -804,11 +807,16 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
804 rl->starved[is_sync] = 0; 807 rl->starved[is_sync] = 0;
805 808
806 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 809 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
807 if (priv) 810 if (priv) {
808 rl->elvpriv++; 811 rl->elvpriv++;
809 812
810 if (blk_queue_io_stat(q)) 813 /*
811 rw_flags |= REQ_IO_STAT; 814 * Don't do stats for non-priv requests
815 */
816 if (blk_queue_io_stat(q))
817 rw_flags |= REQ_IO_STAT;
818 }
819
812 spin_unlock_irq(q->queue_lock); 820 spin_unlock_irq(q->queue_lock);
813 821
814 rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); 822 rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
@@ -1777,7 +1785,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
1777 int cpu; 1785 int cpu;
1778 1786
1779 cpu = part_stat_lock(); 1787 cpu = part_stat_lock();
1780 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); 1788 part = req->part;
1781 part_stat_add(cpu, part, sectors[rw], bytes >> 9); 1789 part_stat_add(cpu, part, sectors[rw], bytes >> 9);
1782 part_stat_unlock(); 1790 part_stat_unlock();
1783 } 1791 }
@@ -1797,7 +1805,7 @@ static void blk_account_io_done(struct request *req)
1797 int cpu; 1805 int cpu;
1798 1806
1799 cpu = part_stat_lock(); 1807 cpu = part_stat_lock();
1800 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); 1808 part = req->part;
1801 1809
1802 part_stat_inc(cpu, part, ios[rw]); 1810 part_stat_inc(cpu, part, ios[rw]);
1803 part_stat_add(cpu, part, ticks[rw], duration); 1811 part_stat_add(cpu, part, ticks[rw], duration);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 6a725461654d..38ff234012a4 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -351,7 +351,7 @@ static void blk_account_io_merge(struct request *req)
351 int cpu; 351 int cpu;
352 352
353 cpu = part_stat_lock(); 353 cpu = part_stat_lock();
354 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); 354 part = req->part;
355 355
356 part_round_stats(cpu, part); 356 part_round_stats(cpu, part);
357 part_dec_in_flight(part, rq_data_dir(req)); 357 part_dec_in_flight(part, rq_data_dir(req));
diff --git a/block/blk.h b/block/blk.h
index 6738831ba447..1340cce5721a 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -110,10 +110,6 @@ void blk_queue_congestion_threshold(struct request_queue *q);
110 110
111int blk_dev_init(void); 111int blk_dev_init(void);
112 112
113void elv_quiesce_start(struct request_queue *q);
114void elv_quiesce_end(struct request_queue *q);
115
116
117/* 113/*
118 * Return the threshold (number of used requests) at which the queue is 114 * Return the threshold (number of used requests) at which the queue is
119 * considered to be congested. It include a little hysteresis to keep the 115 * considered to be congested. It include a little hysteresis to keep the
diff --git a/block/genhd.c b/block/genhd.c
index 7923e720ddf5..8313834596db 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -932,8 +932,15 @@ static void disk_free_ptbl_rcu_cb(struct rcu_head *head)
932{ 932{
933 struct disk_part_tbl *ptbl = 933 struct disk_part_tbl *ptbl =
934 container_of(head, struct disk_part_tbl, rcu_head); 934 container_of(head, struct disk_part_tbl, rcu_head);
935 struct gendisk *disk = ptbl->disk;
936 struct request_queue *q = disk->queue;
937 unsigned long flags;
935 938
936 kfree(ptbl); 939 kfree(ptbl);
940
941 spin_lock_irqsave(q->queue_lock, flags);
942 elv_quiesce_end(q);
943 spin_unlock_irqrestore(q->queue_lock, flags);
937} 944}
938 945
939/** 946/**
@@ -951,11 +958,17 @@ static void disk_replace_part_tbl(struct gendisk *disk,
951 struct disk_part_tbl *new_ptbl) 958 struct disk_part_tbl *new_ptbl)
952{ 959{
953 struct disk_part_tbl *old_ptbl = disk->part_tbl; 960 struct disk_part_tbl *old_ptbl = disk->part_tbl;
961 struct request_queue *q = disk->queue;
954 962
955 rcu_assign_pointer(disk->part_tbl, new_ptbl); 963 rcu_assign_pointer(disk->part_tbl, new_ptbl);
956 964
957 if (old_ptbl) { 965 if (old_ptbl) {
958 rcu_assign_pointer(old_ptbl->last_lookup, NULL); 966 rcu_assign_pointer(old_ptbl->last_lookup, NULL);
967
968 spin_lock_irq(q->queue_lock);
969 elv_quiesce_start(q);
970 spin_unlock_irq(q->queue_lock);
971
959 call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb); 972 call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb);
960 } 973 }
961} 974}
@@ -996,6 +1009,7 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno)
996 return -ENOMEM; 1009 return -ENOMEM;
997 1010
998 new_ptbl->len = target; 1011 new_ptbl->len = target;
1012 new_ptbl->disk = disk;
999 1013
1000 for (i = 0; i < len; i++) 1014 for (i = 0; i < len; i++)
1001 rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]); 1015 rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 6dfbee03ccc6..30f46c2cb9d5 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -365,17 +365,25 @@ struct device_type part_type = {
365static void delete_partition_rcu_cb(struct rcu_head *head) 365static void delete_partition_rcu_cb(struct rcu_head *head)
366{ 366{
367 struct hd_struct *part = container_of(head, struct hd_struct, rcu_head); 367 struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
368 struct gendisk *disk = part_to_disk(part);
369 struct request_queue *q = disk->queue;
370 unsigned long flags;
368 371
369 part->start_sect = 0; 372 part->start_sect = 0;
370 part->nr_sects = 0; 373 part->nr_sects = 0;
371 part_stat_set_all(part, 0); 374 part_stat_set_all(part, 0);
372 put_device(part_to_dev(part)); 375 put_device(part_to_dev(part));
376
377 spin_lock_irqsave(q->queue_lock, flags);
378 elv_quiesce_end(q);
379 spin_unlock_irqrestore(q->queue_lock, flags);
373} 380}
374 381
375void delete_partition(struct gendisk *disk, int partno) 382void delete_partition(struct gendisk *disk, int partno)
376{ 383{
377 struct disk_part_tbl *ptbl = disk->part_tbl; 384 struct disk_part_tbl *ptbl = disk->part_tbl;
378 struct hd_struct *part; 385 struct hd_struct *part;
386 struct request_queue *q = disk->queue;
379 387
380 if (partno >= ptbl->len) 388 if (partno >= ptbl->len)
381 return; 389 return;
@@ -390,6 +398,10 @@ void delete_partition(struct gendisk *disk, int partno)
390 kobject_put(part->holder_dir); 398 kobject_put(part->holder_dir);
391 device_del(part_to_dev(part)); 399 device_del(part_to_dev(part));
392 400
401 spin_lock_irq(q->queue_lock);
402 elv_quiesce_start(q);
403 spin_unlock_irq(q->queue_lock);
404
393 call_rcu(&part->rcu_head, delete_partition_rcu_cb); 405 call_rcu(&part->rcu_head, delete_partition_rcu_cb);
394} 406}
395 407
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8f3dd981b973..16f7f1be1acf 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -115,6 +115,7 @@ struct request {
115 void *elevator_private3; 115 void *elevator_private3;
116 116
117 struct gendisk *rq_disk; 117 struct gendisk *rq_disk;
118 struct hd_struct *part;
118 unsigned long start_time; 119 unsigned long start_time;
119#ifdef CONFIG_BLK_CGROUP 120#ifdef CONFIG_BLK_CGROUP
120 unsigned long long start_time_ns; 121 unsigned long long start_time_ns;
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 2c958f4fce1e..df1ee866d715 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -121,6 +121,8 @@ extern void elv_completed_request(struct request_queue *, struct request *);
121extern int elv_set_request(struct request_queue *, struct request *, gfp_t); 121extern int elv_set_request(struct request_queue *, struct request *, gfp_t);
122extern void elv_put_request(struct request_queue *, struct request *); 122extern void elv_put_request(struct request_queue *, struct request *);
123extern void elv_drain_elevator(struct request_queue *); 123extern void elv_drain_elevator(struct request_queue *);
124extern void elv_quiesce_start(struct request_queue *);
125extern void elv_quiesce_end(struct request_queue *);
124 126
125/* 127/*
126 * io scheduler registration 128 * io scheduler registration
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 66e26b5a1537..57647ecfc1bd 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -140,6 +140,7 @@ struct disk_part_tbl {
140 struct rcu_head rcu_head; 140 struct rcu_head rcu_head;
141 int len; 141 int len;
142 struct hd_struct *last_lookup; 142 struct hd_struct *last_lookup;
143 struct gendisk *disk;
143 struct hd_struct *part[]; 144 struct hd_struct *part[];
144}; 145};
145 146