aboutsummaryrefslogtreecommitdiffstats
path: root/block
diff options
context:
space:
mode:
author: Tejun Heo <tj@kernel.org> 2008-09-03 03:03:02 -0400
committer: Jens Axboe <jens.axboe@oracle.com> 2008-10-09 02:56:06 -0400
commit: e71bf0d0ee89e51b92776391c5634938236977d5 (patch)
tree: 9fc62352a40ad388deebdd8ed497cab926cf0470 /block
parent: f331c0296f2a9fee0d396a70598b954062603015 (diff)
block: fix disk->part[] dereferencing race
disk->part[] is protected by its matching bdev's lock. However, non-critical accesses like collecting stats and printing out sysfs and proc information used to be performed without any locking. As partitions can come and go dynamically, partitions can go away underneath those non-critical accesses. As some of those accesses are writes, this theoretically can lead to silent corruption.

This patch fixes the race by using RCU for the partition array and dev reference counter to hold partitions.

* Rename disk->part[] to disk->__part[] to make sure no one outside genhd layer proper accesses it directly.

* Use RCU for disk->__part[] dereferencing.

* Implement disk_{get|put}_part() which can be used to get and put partitions from gendisk respectively.

* Iterators are implemented to help iterate through all partitions safely.

* Functions which require RCU readlock are marked with _rcu suffix.

* Use disk_put_part() in __blkdev_put() instead of directly putting the contained kobject.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Diffstat (limited to 'block')
-rw-r--r-- block/blk-core.c 20
-rw-r--r-- block/blk-merge.c 9
-rw-r--r-- block/genhd.c 218
-rw-r--r-- block/ioctl.c 26
4 files changed, 226 insertions, 47 deletions
diff --git a/block/blk-core.c b/block/blk-core.c
index a0dc2e72fcbb..d6128d9ad601 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -60,7 +60,9 @@ static void drive_stat_acct(struct request *rq, int new_io)
60 if (!blk_fs_request(rq) || !rq->rq_disk) 60 if (!blk_fs_request(rq) || !rq->rq_disk)
61 return; 61 return;
62 62
63 part = disk_map_sector(rq->rq_disk, rq->sector); 63 rcu_read_lock();
64
65 part = disk_map_sector_rcu(rq->rq_disk, rq->sector);
64 if (!new_io) 66 if (!new_io)
65 __all_stat_inc(rq->rq_disk, part, merges[rw], rq->sector); 67 __all_stat_inc(rq->rq_disk, part, merges[rw], rq->sector);
66 else { 68 else {
@@ -71,6 +73,8 @@ static void drive_stat_acct(struct request *rq, int new_io)
71 part->in_flight++; 73 part->in_flight++;
72 } 74 }
73 } 75 }
76
77 rcu_read_unlock();
74} 78}
75 79
76void blk_queue_congestion_threshold(struct request_queue *q) 80void blk_queue_congestion_threshold(struct request_queue *q)
@@ -1557,12 +1561,14 @@ static int __end_that_request_first(struct request *req, int error,
1557 } 1561 }
1558 1562
1559 if (blk_fs_request(req) && req->rq_disk) { 1563 if (blk_fs_request(req) && req->rq_disk) {
1560 struct hd_struct *part =
1561 disk_map_sector(req->rq_disk, req->sector);
1562 const int rw = rq_data_dir(req); 1564 const int rw = rq_data_dir(req);
1565 struct hd_struct *part;
1563 1566
1567 rcu_read_lock();
1568 part = disk_map_sector_rcu(req->rq_disk, req->sector);
1564 all_stat_add(req->rq_disk, part, sectors[rw], 1569 all_stat_add(req->rq_disk, part, sectors[rw],
1565 nr_bytes >> 9, req->sector); 1570 nr_bytes >> 9, req->sector);
1571 rcu_read_unlock();
1566 } 1572 }
1567 1573
1568 total_bytes = bio_nbytes = 0; 1574 total_bytes = bio_nbytes = 0;
@@ -1746,7 +1752,11 @@ static void end_that_request_last(struct request *req, int error)
1746 if (disk && blk_fs_request(req) && req != &req->q->bar_rq) { 1752 if (disk && blk_fs_request(req) && req != &req->q->bar_rq) {
1747 unsigned long duration = jiffies - req->start_time; 1753 unsigned long duration = jiffies - req->start_time;
1748 const int rw = rq_data_dir(req); 1754 const int rw = rq_data_dir(req);
1749 struct hd_struct *part = disk_map_sector(disk, req->sector); 1755 struct hd_struct *part;
1756
1757 rcu_read_lock();
1758
1759 part = disk_map_sector_rcu(disk, req->sector);
1750 1760
1751 __all_stat_inc(disk, part, ios[rw], req->sector); 1761 __all_stat_inc(disk, part, ios[rw], req->sector);
1752 __all_stat_add(disk, part, ticks[rw], duration, req->sector); 1762 __all_stat_add(disk, part, ticks[rw], duration, req->sector);
@@ -1756,6 +1766,8 @@ static void end_that_request_last(struct request *req, int error)
1756 part_round_stats(part); 1766 part_round_stats(part);
1757 part->in_flight--; 1767 part->in_flight--;
1758 } 1768 }
1769
1770 rcu_read_unlock();
1759 } 1771 }
1760 1772
1761 if (req->end_io) 1773 if (req->end_io)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 9b17da698d7c..eb2a3ca58303 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -387,14 +387,19 @@ static int attempt_merge(struct request_queue *q, struct request *req,
387 elv_merge_requests(q, req, next); 387 elv_merge_requests(q, req, next);
388 388
389 if (req->rq_disk) { 389 if (req->rq_disk) {
390 struct hd_struct *part = 390 struct hd_struct *part;
391 disk_map_sector(req->rq_disk, req->sector); 391
392 rcu_read_lock();
393
394 part = disk_map_sector_rcu(req->rq_disk, req->sector);
392 disk_round_stats(req->rq_disk); 395 disk_round_stats(req->rq_disk);
393 req->rq_disk->in_flight--; 396 req->rq_disk->in_flight--;
394 if (part) { 397 if (part) {
395 part_round_stats(part); 398 part_round_stats(part);
396 part->in_flight--; 399 part->in_flight--;
397 } 400 }
401
402 rcu_read_unlock();
398 } 403 }
399 404
400 req->ioprio = ioprio_best(req->ioprio, next->ioprio); 405 req->ioprio = ioprio_best(req->ioprio, next->ioprio);
diff --git a/block/genhd.c b/block/genhd.c
index fa32d09fda24..b431d6543942 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -26,6 +26,158 @@ struct kobject *block_depr;
26 26
27static struct device_type disk_type; 27static struct device_type disk_type;
28 28
29/**
30 * disk_get_part - get partition
31 * @disk: disk to look partition from
32 * @partno: partition number
33 *
34 * Look for partition @partno from @disk. If found, increment
35 * reference count and return it.
36 *
37 * CONTEXT:
38 * Don't care.
39 *
40 * RETURNS:
41 * Pointer to the found partition on success, NULL if not found.
42 */
43struct hd_struct *disk_get_part(struct gendisk *disk, int partno)
44{
45 struct hd_struct *part;
46
47 if (unlikely(partno < 1 || partno > disk_max_parts(disk)))
48 return NULL;
49 rcu_read_lock();
50 part = rcu_dereference(disk->__part[partno - 1]);
51 if (part)
52 get_device(&part->dev);
53 rcu_read_unlock();
54
55 return part;
56}
57EXPORT_SYMBOL_GPL(disk_get_part);
58
59/**
60 * disk_part_iter_init - initialize partition iterator
61 * @piter: iterator to initialize
62 * @disk: disk to iterate over
63 * @flags: DISK_PITER_* flags
64 *
65 * Initialize @piter so that it iterates over partitions of @disk.
66 *
67 * CONTEXT:
68 * Don't care.
69 */
70void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk,
71 unsigned int flags)
72{
73 piter->disk = disk;
74 piter->part = NULL;
75
76 if (flags & DISK_PITER_REVERSE)
77 piter->idx = disk_max_parts(piter->disk) - 1;
78 else
79 piter->idx = 0;
80
81 piter->flags = flags;
82}
83EXPORT_SYMBOL_GPL(disk_part_iter_init);
84
85/**
86 * disk_part_iter_next - proceed iterator to the next partition and return it
87 * @piter: iterator of interest
88 *
89 * Proceed @piter to the next partition and return it.
90 *
91 * CONTEXT:
92 * Don't care.
93 */
94struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter)
95{
96 int inc, end;
97
98 /* put the last partition */
99 disk_put_part(piter->part);
100 piter->part = NULL;
101
102 rcu_read_lock();
103
104 /* determine iteration parameters */
105 if (piter->flags & DISK_PITER_REVERSE) {
106 inc = -1;
107 end = -1;
108 } else {
109 inc = 1;
110 end = disk_max_parts(piter->disk);
111 }
112
113 /* iterate to the next partition */
114 for (; piter->idx != end; piter->idx += inc) {
115 struct hd_struct *part;
116
117 part = rcu_dereference(piter->disk->__part[piter->idx]);
118 if (!part)
119 continue;
120 if (!(piter->flags & DISK_PITER_INCL_EMPTY) && !part->nr_sects)
121 continue;
122
123 get_device(&part->dev);
124 piter->part = part;
125 piter->idx += inc;
126 break;
127 }
128
129 rcu_read_unlock();
130
131 return piter->part;
132}
133EXPORT_SYMBOL_GPL(disk_part_iter_next);
134
135/**
136 * disk_part_iter_exit - finish up partition iteration
137 * @piter: iter of interest
138 *
139 * Called when iteration is over. Cleans up @piter.
140 *
141 * CONTEXT:
142 * Don't care.
143 */
144void disk_part_iter_exit(struct disk_part_iter *piter)
145{
146 disk_put_part(piter->part);
147 piter->part = NULL;
148}
149EXPORT_SYMBOL_GPL(disk_part_iter_exit);
150
151/**
152 * disk_map_sector_rcu - map sector to partition
153 * @disk: gendisk of interest
154 * @sector: sector to map
155 *
156 * Find out which partition @sector maps to on @disk. This is
157 * primarily used for stats accounting.
158 *
159 * CONTEXT:
160 * RCU read locked. The returned partition pointer is valid only
161 * while preemption is disabled.
162 *
163 * RETURNS:
164 * Found partition on success, NULL if there's no matching partition.
165 */
166struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
167{
168 int i;
169
170 for (i = 0; i < disk_max_parts(disk); i++) {
171 struct hd_struct *part = rcu_dereference(disk->__part[i]);
172
173 if (part && part->start_sect <= sector &&
174 sector < part->start_sect + part->nr_sects)
175 return part;
176 }
177 return NULL;
178}
179EXPORT_SYMBOL_GPL(disk_map_sector_rcu);
180
29/* 181/*
30 * Can be deleted altogether. Later. 182 * Can be deleted altogether. Later.
31 * 183 *
@@ -245,10 +397,12 @@ extern struct block_device *bdget_disk(struct gendisk *disk, int partno)
245 if (partno == 0) 397 if (partno == 0)
246 devt = disk_devt(disk); 398 devt = disk_devt(disk);
247 else { 399 else {
248 struct hd_struct *part = disk->part[partno - 1]; 400 struct hd_struct *part;
249 401
402 part = disk_get_part(disk, partno);
250 if (part && part->nr_sects) 403 if (part && part->nr_sects)
251 devt = part_devt(part); 404 devt = part_devt(part);
405 disk_put_part(part);
252 } 406 }
253 407
254 if (likely(devt != MKDEV(0, 0))) 408 if (likely(devt != MKDEV(0, 0)))
@@ -270,8 +424,9 @@ void __init printk_all_partitions(void)
270 class_dev_iter_init(&iter, &block_class, NULL, &disk_type); 424 class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
271 while ((dev = class_dev_iter_next(&iter))) { 425 while ((dev = class_dev_iter_next(&iter))) {
272 struct gendisk *disk = dev_to_disk(dev); 426 struct gendisk *disk = dev_to_disk(dev);
427 struct disk_part_iter piter;
428 struct hd_struct *part;
273 char buf[BDEVNAME_SIZE]; 429 char buf[BDEVNAME_SIZE];
274 int n;
275 430
276 /* 431 /*
277 * Don't show empty devices or things that have been 432 * Don't show empty devices or things that have been
@@ -298,16 +453,13 @@ void __init printk_all_partitions(void)
298 printk(" (driver?)\n"); 453 printk(" (driver?)\n");
299 454
300 /* now show the partitions */ 455 /* now show the partitions */
301 for (n = 0; n < disk_max_parts(disk); ++n) { 456 disk_part_iter_init(&piter, disk, 0);
302 struct hd_struct *part = disk->part[n]; 457 while ((part = disk_part_iter_next(&piter)))
303
304 if (!part || !part->nr_sects)
305 continue;
306 printk(" %02x%02x %10llu %s\n", 458 printk(" %02x%02x %10llu %s\n",
307 MAJOR(part_devt(part)), MINOR(part_devt(part)), 459 MAJOR(part_devt(part)), MINOR(part_devt(part)),
308 (unsigned long long)part->nr_sects >> 1, 460 (unsigned long long)part->nr_sects >> 1,
309 disk_name(disk, part->partno, buf)); 461 disk_name(disk, part->partno, buf));
310 } 462 disk_part_iter_exit(&piter);
311 } 463 }
312 class_dev_iter_exit(&iter); 464 class_dev_iter_exit(&iter);
313} 465}
@@ -371,7 +523,8 @@ static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
371static int show_partition(struct seq_file *seqf, void *v) 523static int show_partition(struct seq_file *seqf, void *v)
372{ 524{
373 struct gendisk *sgp = v; 525 struct gendisk *sgp = v;
374 int n; 526 struct disk_part_iter piter;
527 struct hd_struct *part;
375 char buf[BDEVNAME_SIZE]; 528 char buf[BDEVNAME_SIZE];
376 529
377 /* Don't show non-partitionable removeable devices or empty devices */ 530 /* Don't show non-partitionable removeable devices or empty devices */
@@ -386,17 +539,14 @@ static int show_partition(struct seq_file *seqf, void *v)
386 MAJOR(disk_devt(sgp)), MINOR(disk_devt(sgp)), 539 MAJOR(disk_devt(sgp)), MINOR(disk_devt(sgp)),
387 (unsigned long long)get_capacity(sgp) >> 1, 540 (unsigned long long)get_capacity(sgp) >> 1,
388 disk_name(sgp, 0, buf)); 541 disk_name(sgp, 0, buf));
389 for (n = 0; n < disk_max_parts(sgp); n++) { 542
390 struct hd_struct *part = sgp->part[n]; 543 disk_part_iter_init(&piter, sgp, 0);
391 if (!part) 544 while ((part = disk_part_iter_next(&piter)))
392 continue;
393 if (part->nr_sects == 0)
394 continue;
395 seq_printf(seqf, "%4d %4d %10llu %s\n", 545 seq_printf(seqf, "%4d %4d %10llu %s\n",
396 MAJOR(part_devt(part)), MINOR(part_devt(part)), 546 MAJOR(part_devt(part)), MINOR(part_devt(part)),
397 (unsigned long long)part->nr_sects >> 1, 547 (unsigned long long)part->nr_sects >> 1,
398 disk_name(sgp, part->partno, buf)); 548 disk_name(sgp, part->partno, buf));
399 } 549 disk_part_iter_exit(&piter);
400 550
401 return 0; 551 return 0;
402} 552}
@@ -571,7 +721,7 @@ static void disk_release(struct device *dev)
571 struct gendisk *disk = dev_to_disk(dev); 721 struct gendisk *disk = dev_to_disk(dev);
572 722
573 kfree(disk->random); 723 kfree(disk->random);
574 kfree(disk->part); 724 kfree(disk->__part);
575 free_disk_stats(disk); 725 free_disk_stats(disk);
576 kfree(disk); 726 kfree(disk);
577} 727}
@@ -596,8 +746,9 @@ static struct device_type disk_type = {
596static int diskstats_show(struct seq_file *seqf, void *v) 746static int diskstats_show(struct seq_file *seqf, void *v)
597{ 747{
598 struct gendisk *gp = v; 748 struct gendisk *gp = v;
749 struct disk_part_iter piter;
750 struct hd_struct *hd;
599 char buf[BDEVNAME_SIZE]; 751 char buf[BDEVNAME_SIZE];
600 int n;
601 752
602 /* 753 /*
603 if (&gp->dev.kobj.entry == block_class.devices.next) 754 if (&gp->dev.kobj.entry == block_class.devices.next)
@@ -624,12 +775,8 @@ static int diskstats_show(struct seq_file *seqf, void *v)
624 jiffies_to_msecs(disk_stat_read(gp, time_in_queue))); 775 jiffies_to_msecs(disk_stat_read(gp, time_in_queue)));
625 776
626 /* now show all non-0 size partitions of it */ 777 /* now show all non-0 size partitions of it */
627 for (n = 0; n < disk_max_parts(gp); n++) { 778 disk_part_iter_init(&piter, gp, 0);
628 struct hd_struct *hd = gp->part[n]; 779 while ((hd = disk_part_iter_next(&piter))) {
629
630 if (!hd || !hd->nr_sects)
631 continue;
632
633 preempt_disable(); 780 preempt_disable();
634 part_round_stats(hd); 781 part_round_stats(hd);
635 preempt_enable(); 782 preempt_enable();
@@ -650,6 +797,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
650 jiffies_to_msecs(part_stat_read(hd, time_in_queue)) 797 jiffies_to_msecs(part_stat_read(hd, time_in_queue))
651 ); 798 );
652 } 799 }
800 disk_part_iter_exit(&piter);
653 801
654 return 0; 802 return 0;
655} 803}
@@ -703,12 +851,16 @@ dev_t blk_lookup_devt(const char *name, int partno)
703 if (partno == 0) 851 if (partno == 0)
704 devt = disk_devt(disk); 852 devt = disk_devt(disk);
705 else { 853 else {
706 struct hd_struct *part = disk->part[partno - 1]; 854 struct hd_struct *part;
707 855
708 if (!part || !part->nr_sects) 856 part = disk_get_part(disk, partno);
857 if (!part || !part->nr_sects) {
858 disk_put_part(part);
709 continue; 859 continue;
860 }
710 861
711 devt = part_devt(part); 862 devt = part_devt(part);
863 disk_put_part(part);
712 } 864 }
713 break; 865 break;
714 } 866 }
@@ -735,9 +887,9 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
735 } 887 }
736 if (minors > 1) { 888 if (minors > 1) {
737 int size = (minors - 1) * sizeof(struct hd_struct *); 889 int size = (minors - 1) * sizeof(struct hd_struct *);
738 disk->part = kmalloc_node(size, 890 disk->__part = kmalloc_node(size,
739 GFP_KERNEL | __GFP_ZERO, node_id); 891 GFP_KERNEL | __GFP_ZERO, node_id);
740 if (!disk->part) { 892 if (!disk->__part) {
741 free_disk_stats(disk); 893 free_disk_stats(disk);
742 kfree(disk); 894 kfree(disk);
743 return NULL; 895 return NULL;
@@ -798,10 +950,14 @@ EXPORT_SYMBOL(set_device_ro);
798 950
799void set_disk_ro(struct gendisk *disk, int flag) 951void set_disk_ro(struct gendisk *disk, int flag)
800{ 952{
801 int i; 953 struct disk_part_iter piter;
954 struct hd_struct *part;
955
802 disk->policy = flag; 956 disk->policy = flag;
803 for (i = 0; i < disk_max_parts(disk); i++) 957 disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
804 if (disk->part[i]) disk->part[i]->policy = flag; 958 while ((part = disk_part_iter_next(&piter)))
959 part->policy = flag;
960 disk_part_iter_exit(&piter);
805} 961}
806 962
807EXPORT_SYMBOL(set_disk_ro); 963EXPORT_SYMBOL(set_disk_ro);
diff --git a/block/ioctl.c b/block/ioctl.c
index 403f7d7e0c28..a5f672ad55f6 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -12,11 +12,12 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
12{ 12{
13 struct block_device *bdevp; 13 struct block_device *bdevp;
14 struct gendisk *disk; 14 struct gendisk *disk;
15 struct hd_struct *part;
15 struct blkpg_ioctl_arg a; 16 struct blkpg_ioctl_arg a;
16 struct blkpg_partition p; 17 struct blkpg_partition p;
18 struct disk_part_iter piter;
17 long long start, length; 19 long long start, length;
18 int partno; 20 int partno;
19 int i;
20 int err; 21 int err;
21 22
22 if (!capable(CAP_SYS_ADMIN)) 23 if (!capable(CAP_SYS_ADMIN))
@@ -47,28 +48,33 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
47 mutex_lock(&bdev->bd_mutex); 48 mutex_lock(&bdev->bd_mutex);
48 49
49 /* overlap? */ 50 /* overlap? */
50 for (i = 0; i < disk_max_parts(disk); i++) { 51 disk_part_iter_init(&piter, disk,
51 struct hd_struct *s = disk->part[i]; 52 DISK_PITER_INCL_EMPTY);
52 53 while ((part = disk_part_iter_next(&piter))) {
53 if (!s) 54 if (!(start + length <= part->start_sect ||
54 continue; 55 start >= part->start_sect + part->nr_sects)) {
55 if (!(start+length <= s->start_sect || 56 disk_part_iter_exit(&piter);
56 start >= s->start_sect + s->nr_sects)) {
57 mutex_unlock(&bdev->bd_mutex); 57 mutex_unlock(&bdev->bd_mutex);
58 return -EBUSY; 58 return -EBUSY;
59 } 59 }
60 } 60 }
61 disk_part_iter_exit(&piter);
62
61 /* all seems OK */ 63 /* all seems OK */
62 err = add_partition(disk, partno, start, length, 64 err = add_partition(disk, partno, start, length,
63 ADDPART_FLAG_NONE); 65 ADDPART_FLAG_NONE);
64 mutex_unlock(&bdev->bd_mutex); 66 mutex_unlock(&bdev->bd_mutex);
65 return err; 67 return err;
66 case BLKPG_DEL_PARTITION: 68 case BLKPG_DEL_PARTITION:
67 if (!disk->part[partno - 1]) 69 part = disk_get_part(disk, partno);
70 if (!part)
68 return -ENXIO; 71 return -ENXIO;
69 bdevp = bdget_disk(disk, partno); 72
73 bdevp = bdget(part_devt(part));
74 disk_put_part(part);
70 if (!bdevp) 75 if (!bdevp)
71 return -ENOMEM; 76 return -ENOMEM;
77
72 mutex_lock(&bdevp->bd_mutex); 78 mutex_lock(&bdevp->bd_mutex);
73 if (bdevp->bd_openers) { 79 if (bdevp->bd_openers) {
74 mutex_unlock(&bdevp->bd_mutex); 80 mutex_unlock(&bdevp->bd_mutex);