diff options
author | Tejun Heo <tj@kernel.org> | 2010-12-08 14:57:37 -0500 |
---|---|---|
committer | Jens Axboe <jaxboe@fusionio.com> | 2010-12-16 11:53:38 -0500 |
commit | 77ea887e433ad8389d416826936c110fa7910f80 (patch) | |
tree | ac9d32aabcebf5a465acae2066b12c9335b5ca6f /fs/block_dev.c | |
parent | d2bf1b6723ed0eab378363649d15b7893bf14e91 (diff) |
implement in-kernel gendisk events handling
Currently, media presence polling for removable block devices is done
from userland. There are several issues with this.
* Polling is done by periodically opening the device. For SCSI
devices, the command sequence generated by such action involves a
few different commands including TEST_UNIT_READY. This behavior,
while perfectly legal, is different from Windows which only issues
a single command, GET_EVENT_STATUS_NOTIFICATION. Unfortunately, some
ATAPI devices lock up after being periodically queried with such
command sequences.
* There is no reliable and unintrusive way for a userland program to
tell whether the target device is safe for media presence polling.
For example, polling for media presence during an on-going burning
session can make it fail. The polling program can avoid this by
opening the device with O_EXCL but then it risks making a valid
exclusive user of the device fail w/ -EBUSY.
* Userland polling is unnecessarily heavy and in-kernel implementation
is lighter and better coordinated (workqueue, timer slack).
This patch implements a framework for in-kernel disk event handling,
which includes media presence polling.
* bdops->check_events() is added, which supersedes ->media_changed().
It should check whether there's any pending event and return if so.
Currently, two events are defined - DISK_EVENT_MEDIA_CHANGE and
DISK_EVENT_EJECT_REQUEST. ->check_events() is guaranteed not to be
called in parallel.
* gendisk->events and ->async_events are added. These should be
initialized by block driver before passing the device to add_disk().
The former contains the mask of all supported events and the latter
the mask of all events which the device can report without polling.
/sys/block/*/events[_async] export these to userland.
* Kernel parameter block.events_dfl_poll_msecs controls the system
polling interval (default is 0 which means disable) and
/sys/block/*/events_poll_msecs controls polling intervals for
individual devices (default is -1 meaning use system setting). Note
that if a device can report all supported events asynchronously and
its polling interval isn't explicitly set, the device won't be
polled regardless of the system polling interval.
* If a device is opened exclusively with write access, event checking
is automatically disabled until all write exclusive accesses are
released.
* There are event 'clearing' events. For example, both of currently
defined events are cleared after the device has been successfully
opened. This information is passed to ->check_events() callback
using @clearing argument as a hint.
* Event checking is always performed from system_nrt_wq and timer
slack is set to 25% for polling.
* Nothing changes for drivers which implement ->media_changed() but
not ->check_events(). Going forward, all drivers will be converted
to ->check_events() and ->media_changed() will be dropped.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
Diffstat (limited to 'fs/block_dev.c')
-rw-r--r-- | fs/block_dev.c | 41 |
1 files changed, 34 insertions, 7 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c index c1c1b8c3fb99..6017389711ee 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c | |||
@@ -948,10 +948,11 @@ int check_disk_change(struct block_device *bdev) | |||
948 | { | 948 | { |
949 | struct gendisk *disk = bdev->bd_disk; | 949 | struct gendisk *disk = bdev->bd_disk; |
950 | const struct block_device_operations *bdops = disk->fops; | 950 | const struct block_device_operations *bdops = disk->fops; |
951 | unsigned int events; | ||
951 | 952 | ||
952 | if (!bdops->media_changed) | 953 | events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | |
953 | return 0; | 954 | DISK_EVENT_EJECT_REQUEST); |
954 | if (!bdops->media_changed(bdev->bd_disk)) | 955 | if (!(events & DISK_EVENT_MEDIA_CHANGE)) |
955 | return 0; | 956 | return 0; |
956 | 957 | ||
957 | flush_disk(bdev); | 958 | flush_disk(bdev); |
@@ -1158,9 +1159,10 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) | |||
1158 | 1159 | ||
1159 | if (whole) { | 1160 | if (whole) { |
1160 | /* finish claiming */ | 1161 | /* finish claiming */ |
1162 | mutex_lock(&bdev->bd_mutex); | ||
1161 | spin_lock(&bdev_lock); | 1163 | spin_lock(&bdev_lock); |
1162 | 1164 | ||
1163 | if (res == 0) { | 1165 | if (!res) { |
1164 | BUG_ON(!bd_may_claim(bdev, whole, holder)); | 1166 | BUG_ON(!bd_may_claim(bdev, whole, holder)); |
1165 | /* | 1167 | /* |
1166 | * Note that for a whole device bd_holders | 1168 | * Note that for a whole device bd_holders |
@@ -1180,6 +1182,20 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) | |||
1180 | wake_up_bit(&whole->bd_claiming, 0); | 1182 | wake_up_bit(&whole->bd_claiming, 0); |
1181 | 1183 | ||
1182 | spin_unlock(&bdev_lock); | 1184 | spin_unlock(&bdev_lock); |
1185 | |||
1186 | /* | ||
1187 | * Block event polling for write claims. Any write | ||
1188 | * holder makes the write_holder state stick until all | ||
1189 | * are released. This is good enough and tracking | ||
1190 | * individual writeable reference is too fragile given | ||
1191 | * the way @mode is used in blkdev_get/put(). | ||
1192 | */ | ||
1193 | if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) { | ||
1194 | bdev->bd_write_holder = true; | ||
1195 | disk_block_events(bdev->bd_disk); | ||
1196 | } | ||
1197 | |||
1198 | mutex_unlock(&bdev->bd_mutex); | ||
1183 | bdput(whole); | 1199 | bdput(whole); |
1184 | } | 1200 | } |
1185 | 1201 | ||
@@ -1353,12 +1369,23 @@ int blkdev_put(struct block_device *bdev, fmode_t mode) | |||
1353 | 1369 | ||
1354 | spin_unlock(&bdev_lock); | 1370 | spin_unlock(&bdev_lock); |
1355 | 1371 | ||
1356 | /* if this was the last claim, holder link should go too */ | 1372 | /* |
1357 | if (bdev_free) | 1373 | * If this was the last claim, remove holder link and |
1374 | * unblock evpoll if it was a write holder. | ||
1375 | */ | ||
1376 | if (bdev_free) { | ||
1358 | bd_unlink_disk_holder(bdev); | 1377 | bd_unlink_disk_holder(bdev); |
1378 | if (bdev->bd_write_holder) { | ||
1379 | disk_unblock_events(bdev->bd_disk); | ||
1380 | bdev->bd_write_holder = false; | ||
1381 | } else | ||
1382 | disk_check_events(bdev->bd_disk); | ||
1383 | } | ||
1359 | 1384 | ||
1360 | mutex_unlock(&bdev->bd_mutex); | 1385 | mutex_unlock(&bdev->bd_mutex); |
1361 | } | 1386 | } else |
1387 | disk_check_events(bdev->bd_disk); | ||
1388 | |||
1362 | return __blkdev_put(bdev, mode, 0); | 1389 | return __blkdev_put(bdev, mode, 0); |
1363 | } | 1390 | } |
1364 | EXPORT_SYMBOL(blkdev_put); | 1391 | EXPORT_SYMBOL(blkdev_put); |