diff options
| author | Vivek Goyal <vgoyal@redhat.com> | 2012-08-01 06:24:18 -0400 |
|---|---|---|
| committer | Jens Axboe <axboe@kernel.dk> | 2012-08-01 06:24:18 -0400 |
| commit | c83f6bf98dc1f1a194118b3830706cebbebda8c4 (patch) | |
| tree | ea8fbd925584f784164617964a9f025bda16ed15 | |
| parent | 4638a83e8615de9c16c39dfed234951d0f468cf1 (diff) | |
block: add partition resize function to blkpg ioctl
Add a new operation code (BLKPG_RESIZE_PARTITION) to the BLKPG ioctl that
allows altering the size of an existing partition, even if it is currently
in use.
This patch protects hd_struct->nr_sects with a sequence counter because
one might extend a partition while IO is happening to it, and an update of
nr_sects can be non-atomic on 32-bit machines with a 64-bit sector_t. This
can lead to issues like reading an inconsistent partition size. A sequence
counter is used so that readers don't have to take the bdev mutex lock,
as we call sector_in_part() very frequently.
Now all accesses to hd_struct->nr_sects should happen through the sequence
counter read/update helper functions part_nr_sects_read()/part_nr_sects_write().
There is one exception though: set_capacity()/get_capacity(). I think
theoretically the race exists there too, but this patch does not
modify set_capacity()/get_capacity() due to the sheer number of call sites,
and I am afraid that the change might break something. I have left that as a
TODO item. We can handle it later if need be. This patch does not introduce
any new races as such w.r.t. set_capacity()/get_capacity().
v2: Add CONFIG_LBDAF test to UP preempt case as suggested by Phillip.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Phillip Susi <psusi@ubuntu.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
| -rw-r--r-- | block/genhd.c | 20 | ||||
| -rw-r--r-- | block/ioctl.c | 59 | ||||
| -rw-r--r-- | block/partition-generic.c | 4 | ||||
| -rw-r--r-- | include/linux/blkpg.h | 1 | ||||
| -rw-r--r-- | include/linux/genhd.h | 57 |
5 files changed, 132 insertions, 9 deletions
diff --git a/block/genhd.c b/block/genhd.c index 9cf5583c90ff..cac7366957c3 100644 --- a/block/genhd.c +++ b/block/genhd.c | |||
| @@ -154,7 +154,7 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) | |||
| 154 | part = rcu_dereference(ptbl->part[piter->idx]); | 154 | part = rcu_dereference(ptbl->part[piter->idx]); |
| 155 | if (!part) | 155 | if (!part) |
| 156 | continue; | 156 | continue; |
| 157 | if (!part->nr_sects && | 157 | if (!part_nr_sects_read(part) && |
| 158 | !(piter->flags & DISK_PITER_INCL_EMPTY) && | 158 | !(piter->flags & DISK_PITER_INCL_EMPTY) && |
| 159 | !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && | 159 | !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && |
| 160 | piter->idx == 0)) | 160 | piter->idx == 0)) |
| @@ -191,7 +191,7 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit); | |||
| 191 | static inline int sector_in_part(struct hd_struct *part, sector_t sector) | 191 | static inline int sector_in_part(struct hd_struct *part, sector_t sector) |
| 192 | { | 192 | { |
| 193 | return part->start_sect <= sector && | 193 | return part->start_sect <= sector && |
| 194 | sector < part->start_sect + part->nr_sects; | 194 | sector < part->start_sect + part_nr_sects_read(part); |
| 195 | } | 195 | } |
| 196 | 196 | ||
| 197 | /** | 197 | /** |
| @@ -769,8 +769,8 @@ void __init printk_all_partitions(void) | |||
| 769 | 769 | ||
| 770 | printk("%s%s %10llu %s %s", is_part0 ? "" : " ", | 770 | printk("%s%s %10llu %s %s", is_part0 ? "" : " ", |
| 771 | bdevt_str(part_devt(part), devt_buf), | 771 | bdevt_str(part_devt(part), devt_buf), |
| 772 | (unsigned long long)part->nr_sects >> 1, | 772 | (unsigned long long)part_nr_sects_read(part) >> 1 |
| 773 | disk_name(disk, part->partno, name_buf), | 773 | , disk_name(disk, part->partno, name_buf), |
| 774 | uuid_buf); | 774 | uuid_buf); |
| 775 | if (is_part0) { | 775 | if (is_part0) { |
| 776 | if (disk->driverfs_dev != NULL && | 776 | if (disk->driverfs_dev != NULL && |
| @@ -862,7 +862,7 @@ static int show_partition(struct seq_file *seqf, void *v) | |||
| 862 | while ((part = disk_part_iter_next(&piter))) | 862 | while ((part = disk_part_iter_next(&piter))) |
| 863 | seq_printf(seqf, "%4d %7d %10llu %s\n", | 863 | seq_printf(seqf, "%4d %7d %10llu %s\n", |
| 864 | MAJOR(part_devt(part)), MINOR(part_devt(part)), | 864 | MAJOR(part_devt(part)), MINOR(part_devt(part)), |
| 865 | (unsigned long long)part->nr_sects >> 1, | 865 | (unsigned long long)part_nr_sects_read(part) >> 1, |
| 866 | disk_name(sgp, part->partno, buf)); | 866 | disk_name(sgp, part->partno, buf)); |
| 867 | disk_part_iter_exit(&piter); | 867 | disk_part_iter_exit(&piter); |
| 868 | 868 | ||
| @@ -1268,6 +1268,16 @@ struct gendisk *alloc_disk_node(int minors, int node_id) | |||
| 1268 | } | 1268 | } |
| 1269 | disk->part_tbl->part[0] = &disk->part0; | 1269 | disk->part_tbl->part[0] = &disk->part0; |
| 1270 | 1270 | ||
| 1271 | /* | ||
| 1272 | * set_capacity() and get_capacity() currently don't use | ||
| 1273 | * seqcounter to read/update the part0->nr_sects. Still init | ||
| 1274 | * the counter as we can read the sectors in IO submission | ||
| 1275 | * path using sequence counters. | |
| 1276 | * | ||
| 1277 | * TODO: Ideally set_capacity() and get_capacity() should be | ||
| 1278 | * converted to make use of bd_mutex and sequence counters. | ||
| 1279 | */ | ||
| 1280 | seqcount_init(&disk->part0.nr_sects_seq); | ||
| 1271 | hd_ref_init(&disk->part0); | 1281 | hd_ref_init(&disk->part0); |
| 1272 | 1282 | ||
| 1273 | disk->minors = minors; | 1283 | disk->minors = minors; |
diff --git a/block/ioctl.c b/block/ioctl.c index ba15b2dbfb98..4476e0e85d16 100644 --- a/block/ioctl.c +++ b/block/ioctl.c | |||
| @@ -13,7 +13,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user | |||
| 13 | { | 13 | { |
| 14 | struct block_device *bdevp; | 14 | struct block_device *bdevp; |
| 15 | struct gendisk *disk; | 15 | struct gendisk *disk; |
| 16 | struct hd_struct *part; | 16 | struct hd_struct *part, *lpart; |
| 17 | struct blkpg_ioctl_arg a; | 17 | struct blkpg_ioctl_arg a; |
| 18 | struct blkpg_partition p; | 18 | struct blkpg_partition p; |
| 19 | struct disk_part_iter piter; | 19 | struct disk_part_iter piter; |
| @@ -36,8 +36,8 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user | |||
| 36 | case BLKPG_ADD_PARTITION: | 36 | case BLKPG_ADD_PARTITION: |
| 37 | start = p.start >> 9; | 37 | start = p.start >> 9; |
| 38 | length = p.length >> 9; | 38 | length = p.length >> 9; |
| 39 | /* check for fit in a hd_struct */ | 39 | /* check for fit in a hd_struct */ |
| 40 | if (sizeof(sector_t) == sizeof(long) && | 40 | if (sizeof(sector_t) == sizeof(long) && |
| 41 | sizeof(long long) > sizeof(long)) { | 41 | sizeof(long long) > sizeof(long)) { |
| 42 | long pstart = start, plength = length; | 42 | long pstart = start, plength = length; |
| 43 | if (pstart != start || plength != length | 43 | if (pstart != start || plength != length |
| @@ -92,6 +92,59 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user | |||
| 92 | bdput(bdevp); | 92 | bdput(bdevp); |
| 93 | 93 | ||
| 94 | return 0; | 94 | return 0; |
| 95 | case BLKPG_RESIZE_PARTITION: | ||
| 96 | start = p.start >> 9; | ||
| 97 | /* new length of partition: p.length is in bytes, length in 512-byte sectors */ | |
| 98 | length = p.length >> 9; | ||
| 99 | /* check for fit in a hd_struct */ | ||
| 100 | if (sizeof(sector_t) == sizeof(long) && | ||
| 101 | sizeof(long long) > sizeof(long)) { | ||
| 102 | long pstart = start, plength = length; | ||
| 103 | if (pstart != start || plength != length | ||
| 104 | || pstart < 0 || plength < 0) | ||
| 105 | return -EINVAL; | ||
| 106 | } | ||
| 107 | part = disk_get_part(disk, partno); | ||
| 108 | if (!part) | ||
| 109 | return -ENXIO; | ||
| 110 | bdevp = bdget(part_devt(part)); | ||
| 111 | if (!bdevp) { | ||
| 112 | disk_put_part(part); | ||
| 113 | return -ENOMEM; | ||
| 114 | } | ||
| 115 | mutex_lock(&bdevp->bd_mutex); | ||
| 116 | mutex_lock_nested(&bdev->bd_mutex, 1); | ||
| 117 | if (start != part->start_sect) { | ||
| 118 | mutex_unlock(&bdevp->bd_mutex); | ||
| 119 | mutex_unlock(&bdev->bd_mutex); | ||
| 120 | bdput(bdevp); | ||
| 121 | disk_put_part(part); | ||
| 122 | return -EINVAL; | ||
| 123 | } | ||
| 124 | /* overlap? */ | ||
| 125 | disk_part_iter_init(&piter, disk, | ||
| 126 | DISK_PITER_INCL_EMPTY); | ||
| 127 | while ((lpart = disk_part_iter_next(&piter))) { | ||
| 128 | if (lpart->partno != partno && | ||
| 129 | !(start + length <= lpart->start_sect || | ||
| 130 | start >= lpart->start_sect + lpart->nr_sects) | ||
| 131 | ) { | ||
| 132 | disk_part_iter_exit(&piter); | ||
| 133 | mutex_unlock(&bdevp->bd_mutex); | ||
| 134 | mutex_unlock(&bdev->bd_mutex); | ||
| 135 | bdput(bdevp); | ||
| 136 | disk_put_part(part); | ||
| 137 | return -EBUSY; | ||
| 138 | } | ||
| 139 | } | ||
| 140 | disk_part_iter_exit(&piter); | ||
| 141 | part_nr_sects_write(part, (sector_t)length); | ||
| 142 | i_size_write(bdevp->bd_inode, p.length); | ||
| 143 | mutex_unlock(&bdevp->bd_mutex); | ||
| 144 | mutex_unlock(&bdev->bd_mutex); | ||
| 145 | bdput(bdevp); | ||
| 146 | disk_put_part(part); | ||
| 147 | return 0; | ||
| 95 | default: | 148 | default: |
| 96 | return -EINVAL; | 149 | return -EINVAL; |
| 97 | } | 150 | } |
diff --git a/block/partition-generic.c b/block/partition-generic.c index 6df5d6928a44..f1d14519cc04 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c | |||
| @@ -84,7 +84,7 @@ ssize_t part_size_show(struct device *dev, | |||
| 84 | struct device_attribute *attr, char *buf) | 84 | struct device_attribute *attr, char *buf) |
| 85 | { | 85 | { |
| 86 | struct hd_struct *p = dev_to_part(dev); | 86 | struct hd_struct *p = dev_to_part(dev); |
| 87 | return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); | 87 | return sprintf(buf, "%llu\n",(unsigned long long)part_nr_sects_read(p)); |
| 88 | } | 88 | } |
| 89 | 89 | ||
| 90 | static ssize_t part_ro_show(struct device *dev, | 90 | static ssize_t part_ro_show(struct device *dev, |
| @@ -294,6 +294,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, | |||
| 294 | err = -ENOMEM; | 294 | err = -ENOMEM; |
| 295 | goto out_free; | 295 | goto out_free; |
| 296 | } | 296 | } |
| 297 | |||
| 298 | seqcount_init(&p->nr_sects_seq); | ||
| 297 | pdev = part_to_dev(p); | 299 | pdev = part_to_dev(p); |
| 298 | 300 | ||
| 299 | p->start_sect = start; | 301 | p->start_sect = start; |
diff --git a/include/linux/blkpg.h b/include/linux/blkpg.h index faf8a45af210..a8519446c111 100644 --- a/include/linux/blkpg.h +++ b/include/linux/blkpg.h | |||
| @@ -40,6 +40,7 @@ struct blkpg_ioctl_arg { | |||
| 40 | /* The subfunctions (for the op field) */ | 40 | /* The subfunctions (for the op field) */ |
| 41 | #define BLKPG_ADD_PARTITION 1 | 41 | #define BLKPG_ADD_PARTITION 1 |
| 42 | #define BLKPG_DEL_PARTITION 2 | 42 | #define BLKPG_DEL_PARTITION 2 |
| 43 | #define BLKPG_RESIZE_PARTITION 3 | ||
| 43 | 44 | ||
| 44 | /* Sizes of name fields. Unused at present. */ | 45 | /* Sizes of name fields. Unused at present. */ |
| 45 | #define BLKPG_DEVNAMELTH 64 | 46 | #define BLKPG_DEVNAMELTH 64 |
diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 017a7fb5a1fc..b88723b81b3d 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h | |||
| @@ -98,7 +98,13 @@ struct partition_meta_info { | |||
| 98 | 98 | ||
| 99 | struct hd_struct { | 99 | struct hd_struct { |
| 100 | sector_t start_sect; | 100 | sector_t start_sect; |
| 101 | /* | ||
| 102 | * nr_sects is protected by sequence counter. One might extend a | ||
| 103 | * partition while IO is happening to it and update of nr_sects | ||
| 104 | * can be non-atomic on 32bit machines with 64bit sector_t. | ||
| 105 | */ | ||
| 101 | sector_t nr_sects; | 106 | sector_t nr_sects; |
| 107 | seqcount_t nr_sects_seq; | ||
| 102 | sector_t alignment_offset; | 108 | sector_t alignment_offset; |
| 103 | unsigned int discard_alignment; | 109 | unsigned int discard_alignment; |
| 104 | struct device __dev; | 110 | struct device __dev; |
| @@ -648,6 +654,57 @@ static inline void hd_struct_put(struct hd_struct *part) | |||
| 648 | __delete_partition(part); | 654 | __delete_partition(part); |
| 649 | } | 655 | } |
| 650 | 656 | ||
| 657 | /* | ||
| 658 | * Any access of part->nr_sects which is not protected by partition | ||
| 659 | * bd_mutex or gendisk bdev bd_mutex, should be done using this | ||
| 660 | * accessor function. | ||
| 661 | * | ||
| 662 | * Code written along the lines of i_size_read() and i_size_write(). | ||
| 663 | * CONFIG_PREEMPT case optimizes the case of UP kernel with preemption | ||
| 664 | * on. | ||
| 665 | */ | ||
| 666 | static inline sector_t part_nr_sects_read(struct hd_struct *part) | ||
| 667 | { | ||
| 668 | #if BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_SMP) | ||
| 669 | sector_t nr_sects; | ||
| 670 | unsigned seq; | ||
| 671 | do { | ||
| 672 | seq = read_seqcount_begin(&part->nr_sects_seq); | ||
| 673 | nr_sects = part->nr_sects; | ||
| 674 | } while (read_seqcount_retry(&part->nr_sects_seq, seq)); | ||
| 675 | return nr_sects; | ||
| 676 | #elif BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_PREEMPT) | ||
| 677 | sector_t nr_sects; | ||
| 678 | |||
| 679 | preempt_disable(); | ||
| 680 | nr_sects = part->nr_sects; | ||
| 681 | preempt_enable(); | ||
| 682 | return nr_sects; | ||
| 683 | #else | ||
| 684 | return part->nr_sects; | ||
| 685 | #endif | ||
| 686 | } | ||
| 687 | |||
| 688 | /* | ||
| 689 | * Should be called with mutex lock held (typically bd_mutex) of partition | ||
| 690 | * to provide mutual exclusion among writers otherwise seqcount might be | |
| 691 | * left in wrong state leaving the readers spinning infinitely. | ||
| 692 | */ | ||
| 693 | static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) | ||
| 694 | { | ||
| 695 | #if BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_SMP) | ||
| 696 | write_seqcount_begin(&part->nr_sects_seq); | ||
| 697 | part->nr_sects = size; | ||
| 698 | write_seqcount_end(&part->nr_sects_seq); | ||
| 699 | #elif BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_PREEMPT) | ||
| 700 | preempt_disable(); | ||
| 701 | part->nr_sects = size; | ||
| 702 | preempt_enable(); | ||
| 703 | #else | ||
| 704 | part->nr_sects = size; | ||
| 705 | #endif | ||
| 706 | } | ||
| 707 | |||
| 651 | #else /* CONFIG_BLOCK */ | 708 | #else /* CONFIG_BLOCK */ |
| 652 | 709 | ||
| 653 | static inline void printk_all_partitions(void) { } | 710 | static inline void printk_all_partitions(void) { } |
