diff options
Diffstat (limited to 'fs/inode.c')
-rw-r--r-- | fs/inode.c | 198 |
1 files changed, 170 insertions, 28 deletions
diff --git a/fs/inode.c b/fs/inode.c index 9f4f5fecc09..c99163b1b31 100644 --- a/fs/inode.c +++ b/fs/inode.c | |||
@@ -135,8 +135,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode) | |||
135 | inode->i_fop = &empty_fops; | 135 | inode->i_fop = &empty_fops; |
136 | inode->__i_nlink = 1; | 136 | inode->__i_nlink = 1; |
137 | inode->i_opflags = 0; | 137 | inode->i_opflags = 0; |
138 | inode->i_uid = 0; | 138 | i_uid_write(inode, 0); |
139 | inode->i_gid = 0; | 139 | i_gid_write(inode, 0); |
140 | atomic_set(&inode->i_writecount, 0); | 140 | atomic_set(&inode->i_writecount, 0); |
141 | inode->i_size = 0; | 141 | inode->i_size = 0; |
142 | inode->i_blocks = 0; | 142 | inode->i_blocks = 0; |
@@ -486,7 +486,7 @@ void __remove_inode_hash(struct inode *inode) | |||
486 | } | 486 | } |
487 | EXPORT_SYMBOL(__remove_inode_hash); | 487 | EXPORT_SYMBOL(__remove_inode_hash); |
488 | 488 | ||
489 | void end_writeback(struct inode *inode) | 489 | void clear_inode(struct inode *inode) |
490 | { | 490 | { |
491 | might_sleep(); | 491 | might_sleep(); |
492 | /* | 492 | /* |
@@ -500,11 +500,10 @@ void end_writeback(struct inode *inode) | |||
500 | BUG_ON(!list_empty(&inode->i_data.private_list)); | 500 | BUG_ON(!list_empty(&inode->i_data.private_list)); |
501 | BUG_ON(!(inode->i_state & I_FREEING)); | 501 | BUG_ON(!(inode->i_state & I_FREEING)); |
502 | BUG_ON(inode->i_state & I_CLEAR); | 502 | BUG_ON(inode->i_state & I_CLEAR); |
503 | inode_sync_wait(inode); | ||
504 | /* don't need i_lock here, no concurrent mods to i_state */ | 503 | /* don't need i_lock here, no concurrent mods to i_state */ |
505 | inode->i_state = I_FREEING | I_CLEAR; | 504 | inode->i_state = I_FREEING | I_CLEAR; |
506 | } | 505 | } |
507 | EXPORT_SYMBOL(end_writeback); | 506 | EXPORT_SYMBOL(clear_inode); |
508 | 507 | ||
509 | /* | 508 | /* |
510 | * Free the inode passed in, removing it from the lists it is still connected | 509 | * Free the inode passed in, removing it from the lists it is still connected |
@@ -531,12 +530,20 @@ static void evict(struct inode *inode) | |||
531 | 530 | ||
532 | inode_sb_list_del(inode); | 531 | inode_sb_list_del(inode); |
533 | 532 | ||
533 | /* | ||
534 | * Wait for flusher thread to be done with the inode so that filesystem | ||
535 | * does not start destroying it while writeback is still running. Since | ||
536 | * the inode has I_FREEING set, flusher thread won't start new work on | ||
537 | * the inode. We just have to wait for running writeback to finish. | ||
538 | */ | ||
539 | inode_wait_for_writeback(inode); | ||
540 | |||
534 | if (op->evict_inode) { | 541 | if (op->evict_inode) { |
535 | op->evict_inode(inode); | 542 | op->evict_inode(inode); |
536 | } else { | 543 | } else { |
537 | if (inode->i_data.nrpages) | 544 | if (inode->i_data.nrpages) |
538 | truncate_inode_pages(&inode->i_data, 0); | 545 | truncate_inode_pages(&inode->i_data, 0); |
539 | end_writeback(inode); | 546 | clear_inode(inode); |
540 | } | 547 | } |
541 | if (S_ISBLK(inode->i_mode) && inode->i_bdev) | 548 | if (S_ISBLK(inode->i_mode) && inode->i_bdev) |
542 | bd_forget(inode); | 549 | bd_forget(inode); |
@@ -1480,10 +1487,30 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, | |||
1480 | return 0; | 1487 | return 0; |
1481 | } | 1488 | } |
1482 | 1489 | ||
1490 | /* | ||
1491 | * This does the actual work of updating an inode's time or version. Must have | |
1492 | * called mnt_want_write() before calling this. | |
1493 | */ | ||
1494 | static int update_time(struct inode *inode, struct timespec *time, int flags) | ||
1495 | { | ||
1496 | if (inode->i_op->update_time) | ||
1497 | return inode->i_op->update_time(inode, time, flags); | ||
1498 | |||
1499 | if (flags & S_ATIME) | ||
1500 | inode->i_atime = *time; | ||
1501 | if (flags & S_VERSION) | ||
1502 | inode_inc_iversion(inode); | ||
1503 | if (flags & S_CTIME) | ||
1504 | inode->i_ctime = *time; | ||
1505 | if (flags & S_MTIME) | ||
1506 | inode->i_mtime = *time; | ||
1507 | mark_inode_dirty_sync(inode); | ||
1508 | return 0; | ||
1509 | } | ||
1510 | |||
1483 | /** | 1511 | /** |
1484 | * touch_atime - update the access time | 1512 | * touch_atime - update the access time |
1485 | * @mnt: mount the inode is accessed on | 1513 | * @path: the &struct path to update |
1486 | * @dentry: dentry accessed | ||
1487 | * | 1514 | * |
1488 | * Update the accessed time on an inode and mark it for writeback. | 1515 | * Update the accessed time on an inode and mark it for writeback. |
1489 | * This function automatically handles read only file systems and media, | 1516 | * This function automatically handles read only file systems and media, |
@@ -1518,12 +1545,83 @@ void touch_atime(struct path *path) | |||
1518 | if (mnt_want_write(mnt)) | 1545 | if (mnt_want_write(mnt)) |
1519 | return; | 1546 | return; |
1520 | 1547 | ||
1521 | inode->i_atime = now; | 1548 | /* |
1522 | mark_inode_dirty_sync(inode); | 1549 | * File systems can error out when updating inodes if they need to |
1550 | * allocate new space to modify an inode (such is the case for | ||
1551 | * Btrfs), but since we touch atime while walking down the path we | ||
1552 | * really don't care if we failed to update the atime of the file, | ||
1553 | * so just ignore the return value. | ||
1554 | */ | ||
1555 | update_time(inode, &now, S_ATIME); | ||
1523 | mnt_drop_write(mnt); | 1556 | mnt_drop_write(mnt); |
1524 | } | 1557 | } |
1525 | EXPORT_SYMBOL(touch_atime); | 1558 | EXPORT_SYMBOL(touch_atime); |
1526 | 1559 | ||
1560 | /* | ||
1561 | * The logic we want is | ||
1562 | * | ||
1563 | * if suid or (sgid and xgrp) | ||
1564 | * remove privs | ||
1565 | */ | ||
1566 | int should_remove_suid(struct dentry *dentry) | ||
1567 | { | ||
1568 | umode_t mode = dentry->d_inode->i_mode; | ||
1569 | int kill = 0; | ||
1570 | |||
1571 | /* suid always must be killed */ | ||
1572 | if (unlikely(mode & S_ISUID)) | ||
1573 | kill = ATTR_KILL_SUID; | ||
1574 | |||
1575 | /* | ||
1576 | * sgid without any exec bits is just a mandatory locking mark; leave | ||
1577 | * it alone. If some exec bits are set, it's a real sgid; kill it. | ||
1578 | */ | ||
1579 | if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) | ||
1580 | kill |= ATTR_KILL_SGID; | ||
1581 | |||
1582 | if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) | ||
1583 | return kill; | ||
1584 | |||
1585 | return 0; | ||
1586 | } | ||
1587 | EXPORT_SYMBOL(should_remove_suid); | ||
1588 | |||
1589 | static int __remove_suid(struct dentry *dentry, int kill) | ||
1590 | { | ||
1591 | struct iattr newattrs; | ||
1592 | |||
1593 | newattrs.ia_valid = ATTR_FORCE | kill; | ||
1594 | return notify_change(dentry, &newattrs); | ||
1595 | } | ||
1596 | |||
1597 | int file_remove_suid(struct file *file) | ||
1598 | { | ||
1599 | struct dentry *dentry = file->f_path.dentry; | ||
1600 | struct inode *inode = dentry->d_inode; | ||
1601 | int killsuid; | ||
1602 | int killpriv; | ||
1603 | int error = 0; | ||
1604 | |||
1605 | /* Fast path for nothing security related */ | ||
1606 | if (IS_NOSEC(inode)) | ||
1607 | return 0; | ||
1608 | |||
1609 | killsuid = should_remove_suid(dentry); | ||
1610 | killpriv = security_inode_need_killpriv(dentry); | ||
1611 | |||
1612 | if (killpriv < 0) | ||
1613 | return killpriv; | ||
1614 | if (killpriv) | ||
1615 | error = security_inode_killpriv(dentry); | ||
1616 | if (!error && killsuid) | ||
1617 | error = __remove_suid(dentry, killsuid); | ||
1618 | if (!error && (inode->i_sb->s_flags & MS_NOSEC)) | ||
1619 | inode->i_flags |= S_NOSEC; | ||
1620 | |||
1621 | return error; | ||
1622 | } | ||
1623 | EXPORT_SYMBOL(file_remove_suid); | ||
1624 | |||
1527 | /** | 1625 | /** |
1528 | * file_update_time - update mtime and ctime | 1626 | * file_update_time - update mtime and ctime |
1529 | * @file: file accessed | 1627 | * @file: file accessed |
@@ -1533,18 +1631,20 @@ EXPORT_SYMBOL(touch_atime); | |||
1533 | * usage in the file write path of filesystems, and filesystems may | 1631 | * usage in the file write path of filesystems, and filesystems may |
1534 | * choose to explicitly ignore update via this function with the | 1632 | * choose to explicitly ignore update via this function with the |
1535 | * S_NOCMTIME inode flag, e.g. for network filesystem where these | 1633 | * S_NOCMTIME inode flag, e.g. for network filesystem where these |
1536 | * timestamps are handled by the server. | 1634 | * timestamps are handled by the server. This can return an error for |
1635 | * file systems that need to allocate space in order to update an inode. | |
1537 | */ | 1636 | */ |
1538 | 1637 | ||
1539 | void file_update_time(struct file *file) | 1638 | int file_update_time(struct file *file) |
1540 | { | 1639 | { |
1541 | struct inode *inode = file->f_path.dentry->d_inode; | 1640 | struct inode *inode = file->f_path.dentry->d_inode; |
1542 | struct timespec now; | 1641 | struct timespec now; |
1543 | enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0; | 1642 | int sync_it = 0; |
1643 | int ret; | ||
1544 | 1644 | ||
1545 | /* First try to exhaust all avenues to not sync */ | 1645 | /* First try to exhaust all avenues to not sync */ |
1546 | if (IS_NOCMTIME(inode)) | 1646 | if (IS_NOCMTIME(inode)) |
1547 | return; | 1647 | return 0; |
1548 | 1648 | ||
1549 | now = current_fs_time(inode->i_sb); | 1649 | now = current_fs_time(inode->i_sb); |
1550 | if (!timespec_equal(&inode->i_mtime, &now)) | 1650 | if (!timespec_equal(&inode->i_mtime, &now)) |
@@ -1557,21 +1657,16 @@ void file_update_time(struct file *file) | |||
1557 | sync_it |= S_VERSION; | 1657 | sync_it |= S_VERSION; |
1558 | 1658 | ||
1559 | if (!sync_it) | 1659 | if (!sync_it) |
1560 | return; | 1660 | return 0; |
1561 | 1661 | ||
1562 | /* Finally allowed to write? Takes lock. */ | 1662 | /* Finally allowed to write? Takes lock. */ |
1563 | if (mnt_want_write_file(file)) | 1663 | if (mnt_want_write_file(file)) |
1564 | return; | 1664 | return 0; |
1565 | 1665 | ||
1566 | /* Only change inode inside the lock region */ | 1666 | ret = update_time(inode, &now, sync_it); |
1567 | if (sync_it & S_VERSION) | ||
1568 | inode_inc_iversion(inode); | ||
1569 | if (sync_it & S_CTIME) | ||
1570 | inode->i_ctime = now; | ||
1571 | if (sync_it & S_MTIME) | ||
1572 | inode->i_mtime = now; | ||
1573 | mark_inode_dirty_sync(inode); | ||
1574 | mnt_drop_write_file(file); | 1667 | mnt_drop_write_file(file); |
1668 | |||
1669 | return ret; | ||
1575 | } | 1670 | } |
1576 | EXPORT_SYMBOL(file_update_time); | 1671 | EXPORT_SYMBOL(file_update_time); |
1577 | 1672 | ||
@@ -1647,6 +1742,7 @@ void __init inode_init_early(void) | |||
1647 | HASH_EARLY, | 1742 | HASH_EARLY, |
1648 | &i_hash_shift, | 1743 | &i_hash_shift, |
1649 | &i_hash_mask, | 1744 | &i_hash_mask, |
1745 | 0, | ||
1650 | 0); | 1746 | 0); |
1651 | 1747 | ||
1652 | for (loop = 0; loop < (1U << i_hash_shift); loop++) | 1748 | for (loop = 0; loop < (1U << i_hash_shift); loop++) |
@@ -1677,6 +1773,7 @@ void __init inode_init(void) | |||
1677 | 0, | 1773 | 0, |
1678 | &i_hash_shift, | 1774 | &i_hash_shift, |
1679 | &i_hash_mask, | 1775 | &i_hash_mask, |
1776 | 0, | ||
1680 | 0); | 1777 | 0); |
1681 | 1778 | ||
1682 | for (loop = 0; loop < (1U << i_hash_shift); loop++) | 1779 | for (loop = 0; loop < (1U << i_hash_shift); loop++) |
@@ -1732,12 +1829,57 @@ EXPORT_SYMBOL(inode_init_owner); | |||
1732 | */ | 1829 | */ |
1733 | bool inode_owner_or_capable(const struct inode *inode) | 1830 | bool inode_owner_or_capable(const struct inode *inode) |
1734 | { | 1831 | { |
1735 | struct user_namespace *ns = inode_userns(inode); | 1832 | if (uid_eq(current_fsuid(), inode->i_uid)) |
1736 | |||
1737 | if (current_user_ns() == ns && current_fsuid() == inode->i_uid) | ||
1738 | return true; | 1833 | return true; |
1739 | if (ns_capable(ns, CAP_FOWNER)) | 1834 | if (inode_capable(inode, CAP_FOWNER)) |
1740 | return true; | 1835 | return true; |
1741 | return false; | 1836 | return false; |
1742 | } | 1837 | } |
1743 | EXPORT_SYMBOL(inode_owner_or_capable); | 1838 | EXPORT_SYMBOL(inode_owner_or_capable); |
1839 | |||
1840 | /* | ||
1841 | * Direct i/o helper functions | ||
1842 | */ | ||
1843 | static void __inode_dio_wait(struct inode *inode) | ||
1844 | { | ||
1845 | wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP); | ||
1846 | DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP); | ||
1847 | |||
1848 | do { | ||
1849 | prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE); | ||
1850 | if (atomic_read(&inode->i_dio_count)) | ||
1851 | schedule(); | ||
1852 | } while (atomic_read(&inode->i_dio_count)); | ||
1853 | finish_wait(wq, &q.wait); | ||
1854 | } | ||
1855 | |||
1856 | /** | ||
1857 | * inode_dio_wait - wait for outstanding DIO requests to finish | ||
1858 | * @inode: inode to wait for | ||
1859 | * | ||
1860 | * Waits for all pending direct I/O requests to finish so that we can | ||
1861 | * proceed with a truncate or equivalent operation. | ||
1862 | * | ||
1863 | * Must be called under a lock that serializes taking new references | ||
1864 | * to i_dio_count, usually by inode->i_mutex. | ||
1865 | */ | ||
1866 | void inode_dio_wait(struct inode *inode) | ||
1867 | { | ||
1868 | if (atomic_read(&inode->i_dio_count)) | ||
1869 | __inode_dio_wait(inode); | ||
1870 | } | ||
1871 | EXPORT_SYMBOL(inode_dio_wait); | ||
1872 | |||
1873 | /* | ||
1874 | * inode_dio_done - signal finish of a direct I/O request | |
1875 | * @inode: inode the direct I/O happens on | ||
1876 | * | ||
1877 | * This is called once we've finished processing a direct I/O request, | ||
1878 | * and is used to wake up callers waiting for direct I/O to be quiesced. | ||
1879 | */ | ||
1880 | void inode_dio_done(struct inode *inode) | ||
1881 | { | ||
1882 | if (atomic_dec_and_test(&inode->i_dio_count)) | ||
1883 | wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); | ||
1884 | } | ||
1885 | EXPORT_SYMBOL(inode_dio_done); | ||