aboutsummaryrefslogtreecommitdiffstats
path: root/block
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2010-12-08 14:57:37 -0500
committerJens Axboe <jaxboe@fusionio.com>2010-12-16 11:53:38 -0500
commit77ea887e433ad8389d416826936c110fa7910f80 (patch)
treeac9d32aabcebf5a465acae2066b12c9335b5ca6f /block
parentd2bf1b6723ed0eab378363649d15b7893bf14e91 (diff)
implement in-kernel gendisk events handling
Currently, media presence polling for removable block devices is done from userland. There are several issues with this. * Polling is done by periodically opening the device. For SCSI devices, the command sequence generated by such action involves a few different commands including TEST_UNIT_READY. This behavior, while perfectly legal, is different from Windows which only issues a single command, GET_EVENT_STATUS_NOTIFICATION. Unfortunately, some ATAPI devices lock up after being periodically queried with such command sequences. * There is no reliable and unintrusive way for a userland program to tell whether the target device is safe for media presence polling. For example, polling for media presence during an on-going burning session can make it fail. The polling program can avoid this by opening the device with O_EXCL but then it risks making a valid exclusive user of the device fail w/ -EBUSY. * Userland polling is unnecessarily heavy and an in-kernel implementation is lighter and better coordinated (workqueue, timer slack). This patch implements a framework for in-kernel disk event handling, which includes media presence polling. * bdops->check_events() is added, which supersedes ->media_changed(). It should check whether there's any pending event and return if so. Currently, two events are defined - DISK_EVENT_MEDIA_CHANGE and DISK_EVENT_EJECT_REQUEST. ->check_events() is guaranteed not to be called in parallel. * gendisk->events and ->async_events are added. These should be initialized by the block driver before passing the device to add_disk(). The former contains the mask of all supported events and the latter the mask of all events which the device can report without polling. /sys/block/*/events[_async] export these to userland. * The kernel parameter block.events_dfl_poll_msecs controls the system polling interval (default is 0 which means disable) and /sys/block/*/events_poll_msecs controls polling intervals for individual devices (default is -1 meaning use the system setting). 
Note that if a device can report all supported events asynchronously and its polling interval isn't explicitly set, the device won't be polled regardless of the system polling interval. * If a device is opened exclusively with write access, event checking is automatically disabled until all write exclusive accesses are released. * There are event 'clearing' events. For example, both of the currently defined events are cleared after the device has been successfully opened. This information is passed to the ->check_events() callback using the @clearing argument as a hint. * Event checking is always performed from system_nrt_wq and timer slack is set to 25% for polling. * Nothing changes for drivers which implement ->media_changed() but not ->check_events(). Going forward, all drivers will be converted to ->check_events() and ->media_changed() will be dropped. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Kay Sievers <kay.sievers@vrfy.org> Cc: Jan Kara <jack@suse.cz> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
Diffstat (limited to 'block')
-rw-r--r--block/genhd.c429
1 file changed, 429 insertions, 0 deletions
diff --git a/block/genhd.c b/block/genhd.c
index 2e5e4c0a1133..5465a824d489 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -18,6 +18,7 @@
18#include <linux/buffer_head.h> 18#include <linux/buffer_head.h>
19#include <linux/mutex.h> 19#include <linux/mutex.h>
20#include <linux/idr.h> 20#include <linux/idr.h>
21#include <linux/log2.h>
21 22
22#include "blk.h" 23#include "blk.h"
23 24
@@ -35,6 +36,10 @@ static DEFINE_IDR(ext_devt_idr);
35 36
36static struct device_type disk_type; 37static struct device_type disk_type;
37 38
39static void disk_add_events(struct gendisk *disk);
40static void disk_del_events(struct gendisk *disk);
41static void disk_release_events(struct gendisk *disk);
42
38/** 43/**
39 * disk_get_part - get partition 44 * disk_get_part - get partition
40 * @disk: disk to look partition from 45 * @disk: disk to look partition from
@@ -609,6 +614,8 @@ void add_disk(struct gendisk *disk)
609 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, 614 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
610 "bdi"); 615 "bdi");
611 WARN_ON(retval); 616 WARN_ON(retval);
617
618 disk_add_events(disk);
612} 619}
613EXPORT_SYMBOL(add_disk); 620EXPORT_SYMBOL(add_disk);
614 621
@@ -617,6 +624,8 @@ void del_gendisk(struct gendisk *disk)
617 struct disk_part_iter piter; 624 struct disk_part_iter piter;
618 struct hd_struct *part; 625 struct hd_struct *part;
619 626
627 disk_del_events(disk);
628
620 /* invalidate stuff */ 629 /* invalidate stuff */
621 disk_part_iter_init(&piter, disk, 630 disk_part_iter_init(&piter, disk,
622 DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); 631 DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
@@ -1089,6 +1098,7 @@ static void disk_release(struct device *dev)
1089{ 1098{
1090 struct gendisk *disk = dev_to_disk(dev); 1099 struct gendisk *disk = dev_to_disk(dev);
1091 1100
1101 disk_release_events(disk);
1092 kfree(disk->random); 1102 kfree(disk->random);
1093 disk_replace_part_tbl(disk, NULL); 1103 disk_replace_part_tbl(disk, NULL);
1094 free_part_stats(&disk->part0); 1104 free_part_stats(&disk->part0);
@@ -1350,3 +1360,422 @@ int invalidate_partition(struct gendisk *disk, int partno)
1350} 1360}
1351 1361
1352EXPORT_SYMBOL(invalidate_partition); 1362EXPORT_SYMBOL(invalidate_partition);
1363
/*
 * Disk events - monitor disk events like media change and eject request.
 */
struct disk_events {
	struct list_head node;		/* all disk_event's */
	struct gendisk *disk;		/* the associated disk */
	spinlock_t lock;		/* protects block, pending and clearing */

	int block;			/* event blocking depth, >0 means blocked */
	unsigned int pending;		/* events already sent out */
	unsigned int clearing;		/* events being cleared */

	long poll_msecs;		/* interval, -1 for default */
	struct delayed_work dwork;	/* periodic/immediate event check work */
};
1379
/* sysfs names for the supported events, indexed by event bit position */
static const char *disk_events_strs[] = {
	[ilog2(DISK_EVENT_MEDIA_CHANGE)]	= "media_change",
	[ilog2(DISK_EVENT_EJECT_REQUEST)]	= "eject_request",
};

/* uevent environment strings, indexed by event bit position */
static char *disk_uevents[] = {
	[ilog2(DISK_EVENT_MEDIA_CHANGE)]	= "DISK_MEDIA_CHANGE=1",
	[ilog2(DISK_EVENT_EJECT_REQUEST)]	= "DISK_EJECT_REQUEST=1",
};

/* list of all disk_events, protected by disk_events_mutex */
static DEFINE_MUTEX(disk_events_mutex);
static LIST_HEAD(disk_events);

/* disable in-kernel polling by default */
static unsigned long disk_events_dfl_poll_msecs = 0;
1396
1397static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
1398{
1399 struct disk_events *ev = disk->ev;
1400 long intv_msecs = 0;
1401
1402 /*
1403 * If device-specific poll interval is set, always use it. If
1404 * the default is being used, poll iff there are events which
1405 * can't be monitored asynchronously.
1406 */
1407 if (ev->poll_msecs >= 0)
1408 intv_msecs = ev->poll_msecs;
1409 else if (disk->events & ~disk->async_events)
1410 intv_msecs = disk_events_dfl_poll_msecs;
1411
1412 return msecs_to_jiffies(intv_msecs);
1413}
1414
1415static void __disk_block_events(struct gendisk *disk, bool sync)
1416{
1417 struct disk_events *ev = disk->ev;
1418 unsigned long flags;
1419 bool cancel;
1420
1421 spin_lock_irqsave(&ev->lock, flags);
1422 cancel = !ev->block++;
1423 spin_unlock_irqrestore(&ev->lock, flags);
1424
1425 if (cancel) {
1426 if (sync)
1427 cancel_delayed_work_sync(&disk->ev->dwork);
1428 else
1429 cancel_delayed_work(&disk->ev->dwork);
1430 }
1431}
1432
1433static void __disk_unblock_events(struct gendisk *disk, bool check_now)
1434{
1435 struct disk_events *ev = disk->ev;
1436 unsigned long intv;
1437 unsigned long flags;
1438
1439 spin_lock_irqsave(&ev->lock, flags);
1440
1441 if (WARN_ON_ONCE(ev->block <= 0))
1442 goto out_unlock;
1443
1444 if (--ev->block)
1445 goto out_unlock;
1446
1447 /*
1448 * Not exactly a latency critical operation, set poll timer
1449 * slack to 25% and kick event check.
1450 */
1451 intv = disk_events_poll_jiffies(disk);
1452 set_timer_slack(&ev->dwork.timer, intv / 4);
1453 if (check_now)
1454 queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
1455 else if (intv)
1456 queue_delayed_work(system_nrt_wq, &ev->dwork, intv);
1457out_unlock:
1458 spin_unlock_irqrestore(&ev->lock, flags);
1459}
1460
1461/**
1462 * disk_block_events - block and flush disk event checking
1463 * @disk: disk to block events for
1464 *
1465 * On return from this function, it is guaranteed that event checking
1466 * isn't in progress and won't happen until unblocked by
1467 * disk_unblock_events(). Events blocking is counted and the actual
1468 * unblocking happens after the matching number of unblocks are done.
1469 *
1470 * Note that this intentionally does not block event checking from
1471 * disk_clear_events().
1472 *
1473 * CONTEXT:
1474 * Might sleep.
1475 */
1476void disk_block_events(struct gendisk *disk)
1477{
1478 if (disk->ev)
1479 __disk_block_events(disk, true);
1480}
1481
1482/**
1483 * disk_unblock_events - unblock disk event checking
1484 * @disk: disk to unblock events for
1485 *
1486 * Undo disk_block_events(). When the block count reaches zero, it
1487 * starts events polling if configured.
1488 *
1489 * CONTEXT:
1490 * Don't care. Safe to call from irq context.
1491 */
1492void disk_unblock_events(struct gendisk *disk)
1493{
1494 if (disk->ev)
1495 __disk_unblock_events(disk, true);
1496}
1497
1498/**
1499 * disk_check_events - schedule immediate event checking
1500 * @disk: disk to check events for
1501 *
1502 * Schedule immediate event checking on @disk if not blocked.
1503 *
1504 * CONTEXT:
1505 * Don't care. Safe to call from irq context.
1506 */
1507void disk_check_events(struct gendisk *disk)
1508{
1509 if (disk->ev) {
1510 __disk_block_events(disk, false);
1511 __disk_unblock_events(disk, true);
1512 }
1513}
1514EXPORT_SYMBOL_GPL(disk_check_events);
1515
1516/**
1517 * disk_clear_events - synchronously check, clear and return pending events
1518 * @disk: disk to fetch and clear events from
1519 * @mask: mask of events to be fetched and clearted
1520 *
1521 * Disk events are synchronously checked and pending events in @mask
1522 * are cleared and returned. This ignores the block count.
1523 *
1524 * CONTEXT:
1525 * Might sleep.
1526 */
1527unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
1528{
1529 const struct block_device_operations *bdops = disk->fops;
1530 struct disk_events *ev = disk->ev;
1531 unsigned int pending;
1532
1533 if (!ev) {
1534 /* for drivers still using the old ->media_changed method */
1535 if ((mask & DISK_EVENT_MEDIA_CHANGE) &&
1536 bdops->media_changed && bdops->media_changed(disk))
1537 return DISK_EVENT_MEDIA_CHANGE;
1538 return 0;
1539 }
1540
1541 /* tell the workfn about the events being cleared */
1542 spin_lock_irq(&ev->lock);
1543 ev->clearing |= mask;
1544 spin_unlock_irq(&ev->lock);
1545
1546 /* uncondtionally schedule event check and wait for it to finish */
1547 __disk_block_events(disk, true);
1548 queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
1549 flush_delayed_work(&ev->dwork);
1550 __disk_unblock_events(disk, false);
1551
1552 /* then, fetch and clear pending events */
1553 spin_lock_irq(&ev->lock);
1554 WARN_ON_ONCE(ev->clearing & mask); /* cleared by workfn */
1555 pending = ev->pending & mask;
1556 ev->pending &= ~mask;
1557 spin_unlock_irq(&ev->lock);
1558
1559 return pending;
1560}
1561
/*
 * Event checking work function: ask the driver for newly pending
 * events, accumulate them into ev->pending, re-arm the poll timer if
 * polling is enabled and events aren't blocked, and finally report any
 * previously unreported events to userland via a KOBJ_CHANGE uevent.
 */
static void disk_events_workfn(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct disk_events *ev = container_of(dwork, struct disk_events, dwork);
	struct gendisk *disk = ev->disk;
	char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
	/* snapshot of events being cleared, passed to the driver as a hint */
	unsigned int clearing = ev->clearing;
	unsigned int events;
	unsigned long intv;
	int nr_events = 0, i;

	/* check events */
	events = disk->fops->check_events(disk, clearing);

	/* accumulate pending events and schedule next poll if necessary */
	spin_lock_irq(&ev->lock);

	events &= ~ev->pending;		/* report each event only once */
	ev->pending |= events;
	ev->clearing &= ~clearing;	/* ack only the snapshot handled above */

	intv = disk_events_poll_jiffies(disk);
	if (!ev->block && intv)
		queue_delayed_work(system_nrt_wq, &ev->dwork, intv);

	spin_unlock_irq(&ev->lock);

	/* tell userland about new events */
	for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
		if (events & (1 << i))
			envp[nr_events++] = disk_uevents[i];

	if (nr_events)
		kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
}
1597
1598/*
1599 * A disk events enabled device has the following sysfs nodes under
1600 * its /sys/block/X/ directory.
1601 *
1602 * events : list of all supported events
1603 * events_async : list of events which can be detected w/o polling
1604 * events_poll_msecs : polling interval, 0: disable, -1: system default
1605 */
1606static ssize_t __disk_events_show(unsigned int events, char *buf)
1607{
1608 const char *delim = "";
1609 ssize_t pos = 0;
1610 int i;
1611
1612 for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++)
1613 if (events & (1 << i)) {
1614 pos += sprintf(buf + pos, "%s%s",
1615 delim, disk_events_strs[i]);
1616 delim = " ";
1617 }
1618 if (pos)
1619 pos += sprintf(buf + pos, "\n");
1620 return pos;
1621}
1622
1623static ssize_t disk_events_show(struct device *dev,
1624 struct device_attribute *attr, char *buf)
1625{
1626 struct gendisk *disk = dev_to_disk(dev);
1627
1628 return __disk_events_show(disk->events, buf);
1629}
1630
1631static ssize_t disk_events_async_show(struct device *dev,
1632 struct device_attribute *attr, char *buf)
1633{
1634 struct gendisk *disk = dev_to_disk(dev);
1635
1636 return __disk_events_show(disk->async_events, buf);
1637}
1638
1639static ssize_t disk_events_poll_msecs_show(struct device *dev,
1640 struct device_attribute *attr,
1641 char *buf)
1642{
1643 struct gendisk *disk = dev_to_disk(dev);
1644
1645 return sprintf(buf, "%ld\n", disk->ev->poll_msecs);
1646}
1647
1648static ssize_t disk_events_poll_msecs_store(struct device *dev,
1649 struct device_attribute *attr,
1650 const char *buf, size_t count)
1651{
1652 struct gendisk *disk = dev_to_disk(dev);
1653 long intv;
1654
1655 if (!count || !sscanf(buf, "%ld", &intv))
1656 return -EINVAL;
1657
1658 if (intv < 0 && intv != -1)
1659 return -EINVAL;
1660
1661 __disk_block_events(disk, true);
1662 disk->ev->poll_msecs = intv;
1663 __disk_unblock_events(disk, true);
1664
1665 return count;
1666}
1667
/* events and events_async are read-only; events_poll_msecs is root-writable */
static const DEVICE_ATTR(events, S_IRUGO, disk_events_show, NULL);
static const DEVICE_ATTR(events_async, S_IRUGO, disk_events_async_show, NULL);
static const DEVICE_ATTR(events_poll_msecs, S_IRUGO|S_IWUSR,
			 disk_events_poll_msecs_show,
			 disk_events_poll_msecs_store);

/* NULL-terminated list for sysfs_create_files()/sysfs_remove_files() */
static const struct attribute *disk_events_attrs[] = {
	&dev_attr_events.attr,
	&dev_attr_events_async.attr,
	&dev_attr_events_poll_msecs.attr,
	NULL,
};
1680
1681/*
1682 * The default polling interval can be specified by the kernel
1683 * parameter block.events_dfl_poll_msecs which defaults to 0
1684 * (disable). This can also be modified runtime by writing to
1685 * /sys/module/block/events_dfl_poll_msecs.
1686 */
1687static int disk_events_set_dfl_poll_msecs(const char *val,
1688 const struct kernel_param *kp)
1689{
1690 struct disk_events *ev;
1691 int ret;
1692
1693 ret = param_set_ulong(val, kp);
1694 if (ret < 0)
1695 return ret;
1696
1697 mutex_lock(&disk_events_mutex);
1698
1699 list_for_each_entry(ev, &disk_events, node)
1700 disk_check_events(ev->disk);
1701
1702 mutex_unlock(&disk_events_mutex);
1703
1704 return 0;
1705}
1706
/* custom setter so changing the default re-kicks all disks; plain getter */
static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = {
	.set	= disk_events_set_dfl_poll_msecs,
	.get	= param_get_ulong,
};

/* expose the parameter as block.events_dfl_poll_msecs */
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX	"block."

module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
		&disk_events_dfl_poll_msecs, 0644);
1717
/*
 * disk_{add|del|release}_events - initialize and destroy disk_events.
 */
static void disk_add_events(struct gendisk *disk)
{
	struct disk_events *ev;

	/* skip disks that support no events or lack the check callback */
	if (!disk->fops->check_events || !(disk->events | disk->async_events))
		return;

	/* event support is best-effort: on failure the disk works without it */
	ev = kzalloc(sizeof(*ev), GFP_KERNEL);
	if (!ev) {
		pr_warn("%s: failed to initialize events\n", disk->disk_name);
		return;
	}

	if (sysfs_create_files(&disk_to_dev(disk)->kobj,
			       disk_events_attrs) < 0) {
		pr_warn("%s: failed to create sysfs files for events\n",
			disk->disk_name);
		kfree(ev);
		return;
	}

	disk->ev = ev;

	INIT_LIST_HEAD(&ev->node);
	ev->disk = disk;
	spin_lock_init(&ev->lock);
	ev->block = 1;		/* start blocked; unblocked once below */
	ev->poll_msecs = -1;	/* -1: follow the system default interval */
	INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);

	mutex_lock(&disk_events_mutex);
	list_add_tail(&ev->node, &disk_events);
	mutex_unlock(&disk_events_mutex);

	/*
	 * Block count is initialized to 1 and the following initial
	 * unblock kicks it into action.
	 */
	__disk_unblock_events(disk, true);
}
1761
/*
 * Shut down event handling for @disk before it is deleted: stop event
 * checking, unlink from the global list and remove the sysfs files.
 * The disk_events struct itself is freed by disk_release_events().
 */
static void disk_del_events(struct gendisk *disk)
{
	if (!disk->ev)
		return;

	/* block events; intentionally never unblocked again */
	__disk_block_events(disk, true);

	mutex_lock(&disk_events_mutex);
	list_del_init(&disk->ev->node);
	mutex_unlock(&disk_events_mutex);

	sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs);
}
1775
/* free @disk's event state; called from disk_release() (kfree(NULL) is ok) */
static void disk_release_events(struct gendisk *disk)
{
	/* the block count should be 1 from disk_del_events() */
	WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
	kfree(disk->ev);
}