aboutsummaryrefslogtreecommitdiffstats
path: root/fs/block_dev.c
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2010-12-08 14:57:37 -0500
committerJens Axboe <jaxboe@fusionio.com>2010-12-16 11:53:38 -0500
commit77ea887e433ad8389d416826936c110fa7910f80 (patch)
treeac9d32aabcebf5a465acae2066b12c9335b5ca6f /fs/block_dev.c
parentd2bf1b6723ed0eab378363649d15b7893bf14e91 (diff)
implement in-kernel gendisk events handling
Currently, media presence polling for removable block devices is done from userland. There are several issues with this. * Polling is done by periodically opening the device. For SCSI devices, the command sequence generated by such an action involves a few different commands including TEST_UNIT_READY. This behavior, while perfectly legal, is different from Windows, which issues only a single command, GET_EVENT_STATUS_NOTIFICATION. Unfortunately, some ATAPI devices lock up after being periodically queried with such command sequences. * There is no reliable and non-intrusive way for a userland program to tell whether the target device is safe for media presence polling. For example, polling for media presence during an on-going burning session can make it fail. The polling program can avoid this by opening the device with O_EXCL but then it risks making a valid exclusive user of the device fail w/ -EBUSY. * Userland polling is unnecessarily heavy and an in-kernel implementation is lighter and better coordinated (workqueue, timer slack). This patch implements a framework for in-kernel disk event handling, which includes media presence polling. * bdops->check_events() is added, which supersedes ->media_changed(). It should check whether there's any pending event and return it if so. Currently, two events are defined - DISK_EVENT_MEDIA_CHANGE and DISK_EVENT_EJECT_REQUEST. ->check_events() is guaranteed not to be called in parallel. * gendisk->events and ->async_events are added. These should be initialized by the block driver before passing the device to add_disk(). The former contains the mask of all supported events and the latter the mask of all events which the device can report without polling. /sys/block/*/events[_async] export these to userland. * Kernel parameter block.events_dfl_poll_msecs controls the system polling interval (default is 0 which means disabled) and /sys/block/*/events_poll_msecs controls polling intervals for individual devices (default is -1 meaning use system setting). 
Note that if a device can report all supported events asynchronously and its polling interval isn't explicitly set, the device won't be polled regardless of the system polling interval. * If a device is opened exclusively with write access, event checking is automatically disabled until all write-exclusive accesses are released. * There are event 'clearing' events. For example, both of the currently defined events are cleared after the device has been successfully opened. This information is passed to the ->check_events() callback using the @clearing argument as a hint. * Event checking is always performed from system_nrt_wq and timer slack is set to 25% for polling. * Nothing changes for drivers which implement ->media_changed() but not ->check_events(). Going forward, all drivers will be converted to ->check_events() and ->media_changed() will be dropped. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Kay Sievers <kay.sievers@vrfy.org> Cc: Jan Kara <jack@suse.cz> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
Diffstat (limited to 'fs/block_dev.c')
-rw-r--r--fs/block_dev.c41
1 files changed, 34 insertions, 7 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c
index c1c1b8c3fb99..6017389711ee 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -948,10 +948,11 @@ int check_disk_change(struct block_device *bdev)
948{ 948{
949 struct gendisk *disk = bdev->bd_disk; 949 struct gendisk *disk = bdev->bd_disk;
950 const struct block_device_operations *bdops = disk->fops; 950 const struct block_device_operations *bdops = disk->fops;
951 unsigned int events;
951 952
952 if (!bdops->media_changed) 953 events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
953 return 0; 954 DISK_EVENT_EJECT_REQUEST);
954 if (!bdops->media_changed(bdev->bd_disk)) 955 if (!(events & DISK_EVENT_MEDIA_CHANGE))
955 return 0; 956 return 0;
956 957
957 flush_disk(bdev); 958 flush_disk(bdev);
@@ -1158,9 +1159,10 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
1158 1159
1159 if (whole) { 1160 if (whole) {
1160 /* finish claiming */ 1161 /* finish claiming */
1162 mutex_lock(&bdev->bd_mutex);
1161 spin_lock(&bdev_lock); 1163 spin_lock(&bdev_lock);
1162 1164
1163 if (res == 0) { 1165 if (!res) {
1164 BUG_ON(!bd_may_claim(bdev, whole, holder)); 1166 BUG_ON(!bd_may_claim(bdev, whole, holder));
1165 /* 1167 /*
1166 * Note that for a whole device bd_holders 1168 * Note that for a whole device bd_holders
@@ -1180,6 +1182,20 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
1180 wake_up_bit(&whole->bd_claiming, 0); 1182 wake_up_bit(&whole->bd_claiming, 0);
1181 1183
1182 spin_unlock(&bdev_lock); 1184 spin_unlock(&bdev_lock);
1185
1186 /*
1187 * Block event polling for write claims. Any write
1188 * holder makes the write_holder state stick until all
1189 * are released. This is good enough and tracking
1190 * individual writeable reference is too fragile given
1191 * the way @mode is used in blkdev_get/put().
1192 */
1193 if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
1194 bdev->bd_write_holder = true;
1195 disk_block_events(bdev->bd_disk);
1196 }
1197
1198 mutex_unlock(&bdev->bd_mutex);
1183 bdput(whole); 1199 bdput(whole);
1184 } 1200 }
1185 1201
@@ -1353,12 +1369,23 @@ int blkdev_put(struct block_device *bdev, fmode_t mode)
1353 1369
1354 spin_unlock(&bdev_lock); 1370 spin_unlock(&bdev_lock);
1355 1371
1356 /* if this was the last claim, holder link should go too */ 1372 /*
1357 if (bdev_free) 1373 * If this was the last claim, remove holder link and
1374 * unblock evpoll if it was a write holder.
1375 */
1376 if (bdev_free) {
1358 bd_unlink_disk_holder(bdev); 1377 bd_unlink_disk_holder(bdev);
1378 if (bdev->bd_write_holder) {
1379 disk_unblock_events(bdev->bd_disk);
1380 bdev->bd_write_holder = false;
1381 } else
1382 disk_check_events(bdev->bd_disk);
1383 }
1359 1384
1360 mutex_unlock(&bdev->bd_mutex); 1385 mutex_unlock(&bdev->bd_mutex);
1361 } 1386 } else
1387 disk_check_events(bdev->bd_disk);
1388
1362 return __blkdev_put(bdev, mode, 0); 1389 return __blkdev_put(bdev, mode, 0);
1363} 1390}
1364EXPORT_SYMBOL(blkdev_put); 1391EXPORT_SYMBOL(blkdev_put);