-rw-r--r--  Documentation/filesystems/ntfs.txt |   42
-rw-r--r--  fs/ntfs/ChangeLog                  |   85
-rw-r--r--  fs/ntfs/Makefile                   |    2
-rw-r--r--  fs/ntfs/aops.c                     |  832
-rw-r--r--  fs/ntfs/attrib.c                   |  983
-rw-r--r--  fs/ntfs/attrib.h                   |   10
-rw-r--r--  fs/ntfs/file.c                     | 2255
-rw-r--r--  fs/ntfs/inode.c                    |  514
-rw-r--r--  fs/ntfs/layout.h                   |   31
-rw-r--r--  fs/ntfs/lcnalloc.c                 |   56
-rw-r--r--  fs/ntfs/lcnalloc.h                 |   43
-rw-r--r--  fs/ntfs/malloc.h                   |    3
-rw-r--r--  fs/ntfs/mft.c                      |   26
-rw-r--r--  fs/ntfs/super.c                    |    2
14 files changed, 3841 insertions(+), 1043 deletions(-)
diff --git a/Documentation/filesystems/ntfs.txt b/Documentation/filesystems/ntfs.txt
index a5fbc8e897fa..614de3124901 100644
--- a/Documentation/filesystems/ntfs.txt
+++ b/Documentation/filesystems/ntfs.txt
@@ -50,9 +50,14 @@ userspace utilities, etc.
 Features
 ========
 
-- This is a complete rewrite of the NTFS driver that used to be in the kernel.
-  This new driver implements NTFS read support and is functionally equivalent
-  to the old ntfs driver.
+- This is a complete rewrite of the NTFS driver that used to be in the 2.4 and
+  earlier kernels. This new driver implements NTFS read support and is
+  functionally equivalent to the old ntfs driver and it also implements limited
+  write support. The biggest limitation at present is that files/directories
+  cannot be created or deleted. See below for the list of write features that
+  are so far supported. Another limitation is that writing to compressed files
+  is not implemented at all. Also, neither read nor write access to encrypted
+  files is so far implemented.
 - The new driver has full support for sparse files on NTFS 3.x volumes which
   the old driver isn't happy with.
 - The new driver supports execution of binaries due to mmap() now being
@@ -78,7 +83,20 @@ Features
 - The new driver supports fsync(2), fdatasync(2), and msync(2).
 - The new driver supports readv(2) and writev(2).
 - The new driver supports access time updates (including mtime and ctime).
-
+- The new driver supports truncate(2) and open(2) with O_TRUNC. But at present
+  only very limited support for highly fragmented files, i.e. ones which have
+  their data attribute split across multiple extents, is included. Another
+  limitation is that at present truncate(2) will never create sparse files,
+  since to mark a file sparse we need to modify the directory entry for the
+  file and we do not implement directory modifications yet.
+- The new driver supports write(2) which can both overwrite existing data and
+  extend the file size so that you can write beyond the existing data. Also,
+  writing into sparse regions is supported and the holes are filled in with
+  clusters. But at present only limited support for highly fragmented files,
+  i.e. ones which have their data attribute split across multiple extents, is
+  included. Another limitation is that write(2) will never create sparse
+  files, since to mark a file sparse we need to modify the directory entry for
+  the file and we do not implement directory modifications yet.
 
 Supported mount options
 =======================
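
As a concrete illustration of the two new bullets above, a minimal userspace test along the following lines exercises both the extending write(2) path and the truncate(2) path. This is illustrative only and not part of the patch; note that the target file must already exist, since file creation is not yet supported by the driver.

	/* Illustrative test of the new NTFS write/truncate support.
	 * Assumes /mnt/ntfs is an NTFS volume mounted read-write with the
	 * new driver and that test.dat already exists on it. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/types.h>
	#include <unistd.h>

	int main(void)
	{
		const char buf[] = "data written beyond the old file size";
		int fd = open("/mnt/ntfs/test.dat", O_RDWR);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* Seeking past EOF and writing extends the file; the gap is
		 * backed by real clusters as sparse files are never created. */
		if (lseek(fd, 1024 * 1024, SEEK_SET) == (off_t)-1 ||
				write(fd, buf, sizeof(buf) - 1) == -1)
			perror("extending write");
		/* truncate(2) can now both shrink and grow a file. */
		if (ftruncate(fd, 4096) == -1 || ftruncate(fd, 65536) == -1)
			perror("ftruncate");
		return close(fd) ? 1 : 0;
	}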
@@ -439,6 +457,22 @@ ChangeLog
 
 Note, a technical ChangeLog aimed at kernel hackers is in fs/ntfs/ChangeLog.
 
+2.1.25:
+	- Write support is now extended with write(2) being able to both
+	  overwrite existing file data and to extend files. Also, if a write
+	  to a sparse region occurs, write(2) will fill in the hole. Note,
+	  mmap(2) based writes still do not support writing into holes or
+	  writing beyond the initialized size.
+	- Write support has a new feature and that is that truncate(2) and
+	  open(2) with O_TRUNC are now implemented thus files can be both made
+	  smaller and larger.
+	- Note: Both write(2) and truncate(2)/open(2) with O_TRUNC still have
+	  limitations in that they
+	  - only provide limited support for highly fragmented files.
+	  - only work on regular, i.e. uncompressed and unencrypted files.
+	  - never create sparse files although this will change once directory
+	    operations are implemented.
+	- Lots of bug fixes and enhancements across the board.
 2.1.24:
 	- Support journals ($LogFile) which have been modified by chkdsk. This
 	  means users can boot into Windows after we marked the volume dirty.
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index de58579a1d0e..50a7749cfca1 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -1,18 +1,15 @@
 ToDo/Notes:
 	- Find and fix bugs.
-	- In between ntfs_prepare/commit_write, need exclusion between
-	  simultaneous file extensions. This is given to us by holding i_sem
-	  on the inode. The only places in the kernel when a file is resized
-	  are prepare/commit write and truncate for both of which i_sem is
-	  held. Just have to be careful in readpage/writepage and all other
-	  helpers not running under i_sem that we play nice...
-	  Also need to be careful with initialized_size extention in
-	  ntfs_prepare_write. Basically, just be _very_ careful in this code...
-	  UPDATE: The only things that need to be checked are read/writepage
-	  which do not hold i_sem. Note writepage cannot change i_size but it
-	  needs to cope with a concurrent i_size change, just like readpage.
-	  Also both need to cope with concurrent changes to the other sizes,
-	  i.e. initialized/allocated/compressed size, as well.
+	- The only places in the kernel where a file is resized are
+	  ntfs_file_write*() and ntfs_truncate() for both of which i_sem is
+	  held. Just have to be careful in read-/writepage and other helpers
+	  not running under i_sem that we play nice... Also need to be careful
+	  with initialized_size extension in ntfs_file_write*() and writepage.
+	  UPDATE: The only things that need to be checked are the compressed
+	  write and the other attribute resize/write cases like index
+	  attributes, etc. For now none of these are implemented so are safe.
+	- Implement filling in of holes in aops.c::ntfs_writepage() and its
+	  helpers.
 	- Implement mft.c::sync_mft_mirror_umount(). We currently will just
 	  leave the volume dirty on umount if the final iput(vol->mft_ino)
 	  causes a write of any mirrored mft records due to the mft mirror
@@ -22,6 +19,68 @@ ToDo/Notes:
 	- Enable the code for setting the NT4 compatibility flag when we start
 	  making NTFS 1.2 specific modifications.
 
+2.1.25 - (Almost) fully implement write(2) and truncate(2).
+
+	- Change ntfs_map_runlist_nolock(), ntfs_attr_find_vcn_nolock() and
+	  {__,}ntfs_cluster_free() to also take an optional attribute search
+	  context as argument. This allows calling these functions with the
+	  mft record mapped. Update all callers.
+	- Fix potential deadlock in ntfs_mft_data_extend_allocation_nolock()
+	  error handling by passing in the active search context when calling
+	  ntfs_cluster_free().
+	- Change ntfs_cluster_alloc() to take an extra boolean parameter
+	  specifying whether the clusters are being allocated to extend an
+	  attribute or to fill a hole.
+	- Change ntfs_attr_make_non_resident() to call ntfs_cluster_alloc()
+	  with @is_extension set to TRUE and remove the runlist terminator
+	  fixup code as this is now done by ntfs_cluster_alloc().
+	- Change ntfs_attr_make_non_resident() to take the attribute value size
+	  as an extra parameter. This is needed since we need to know the size
+	  before we can map the mft record and our callers always know it. The
+	  reason we cannot simply read the size from the vfs inode i_size is
+	  that this is not necessarily uptodate. This happens when
+	  ntfs_attr_make_non_resident() is called in the ->truncate call path.
+	- Fix ntfs_attr_make_non_resident() to update the vfs inode i_blocks
+	  which is zero for a resident attribute but should no longer be zero
+	  once the attribute is non-resident as it then has real clusters
+	  allocated.
+	- Add fs/ntfs/attrib.[hc]::ntfs_attr_extend_allocation(), a function to
+	  extend the allocation of an attribute. Optionally, the data size,
+	  but not the initialized size, can be extended, too.
+	- Implement fs/ntfs/inode.[hc]::ntfs_truncate(). It only supports
+	  uncompressed and unencrypted files and it never creates sparse files
+	  at least for the moment (making a file sparse requires us to modify
+	  its directory entries and we do not support directory operations at
+	  the moment). Also, support for highly fragmented files, i.e. ones
+	  whose data attribute is split across multiple extents, is severely
+	  limited. When such a case is encountered, EOPNOTSUPP is returned.
+	- Enable ATTR_SIZE attribute changes in ntfs_setattr(). This completes
+	  the initial implementation of file truncation. Now both open(2)ing
+	  a file with the O_TRUNC flag and the {,f}truncate(2) system calls
+	  will resize a file appropriately. The limitations are that only
+	  uncompressed and unencrypted files are supported. Also, there is
+	  only very limited support for highly fragmented files (the ones whose
+	  $DATA attribute is split into multiple attribute extents).
+	- In attrib.c::ntfs_attr_set() call balance_dirty_pages_ratelimited()
+	  and cond_resched() in the main loop as we could be dirtying a lot of
+	  pages and this ensures we play nice with the VM and the system as a
+	  whole.
+	- Implement file operations ->write, ->aio_write, ->writev for regular
+	  files. This replaces the old use of generic_file_write(), et al and
+	  the address space operations ->prepare_write and ->commit_write.
+	  This means that both sparse and non-sparse (unencrypted and
+	  uncompressed) files can now be extended using the normal write(2)
+	  code path. There are two limitations at present and these are that
+	  we never create sparse files and that we only have limited support
+	  for highly fragmented files, i.e. ones whose data attribute is split
+	  across multiple extents. When such a case is encountered,
+	  EOPNOTSUPP is returned.
+	- $EA attributes can be both resident and non-resident.
+	- Use %z for size_t to fix compilation warnings. (Andrew Morton)
+	- Fix compilation warnings with gcc-4.0.2 on SUSE 10.0.
+	- Document extended attribute ($EA) NEED_EA flag. (Based on libntfs
+	  patch by Yura Pakhuchiy.)
+
 2.1.24 - Lots of bug fixes and support more clean journal states.
 
 	- Support journals ($LogFile) which have been modified by chkdsk. This
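
The ntfs_attr_set() entry above refers to a standard kernel idiom for loops that dirty large numbers of page cache pages. A condensed sketch of that idiom follows (2.6-era APIs, simplified; this is not the actual ntfs_attr_set() body):

	#include <linux/highmem.h>
	#include <linux/pagemap.h>
	#include <linux/sched.h>
	#include <linux/writeback.h>

	/* Fill the pages [start, end) of @mapping with @val, playing nice
	 * with the VM while potentially dirtying a lot of pages. */
	static int fill_pages(struct address_space *mapping, pgoff_t start,
			pgoff_t end, u8 val)
	{
		pgoff_t idx;

		for (idx = start; idx < end; idx++) {
			struct page *page = read_cache_page(mapping, idx,
					(filler_t*)mapping->a_ops->readpage,
					NULL);

			if (IS_ERR(page))
				return PTR_ERR(page);
			memset(kmap(page), val, PAGE_CACHE_SIZE);
			kunmap(page);
			flush_dcache_page(page);
			set_page_dirty(page);
			page_cache_release(page);
			/* Throttle writeback and let other tasks run. */
			balance_dirty_pages_ratelimited(mapping);
			cond_resched();
		}
		return 0;
	}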
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 894b2b876d35..d0d45d1c853a 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
 	     index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
 	     unistr.o upcase.o
 
-EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.24\"
+EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.25\"
 
 ifeq ($(CONFIG_NTFS_DEBUG),y)
 EXTRA_CFLAGS += -DDEBUG
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 5e80c07c6a4d..1c0a4315876a 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1391,8 +1391,7 @@ retry_writepage:
 		if (NInoEncrypted(ni)) {
 			unlock_page(page);
 			BUG_ON(ni->type != AT_DATA);
-			ntfs_debug("Denying write access to encrypted "
-					"file.");
+			ntfs_debug("Denying write access to encrypted file.");
 			return -EACCES;
 		}
 		/* Compressed data streams are handled in compress.c. */
@@ -1508,8 +1507,8 @@ retry_writepage:
 	/* Zero out of bounds area in the page cache page. */
 	memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
 	kunmap_atomic(kaddr, KM_USER0);
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
 	flush_dcache_page(page);
+	flush_dcache_mft_record_page(ctx->ntfs_ino);
 	/* We are done with the page. */
 	end_page_writeback(page);
 	/* Finally, mark the mft record dirty, so it gets written back. */
@@ -1542,830 +1541,6 @@ err_out:
 	return err;
 }
 
-/**
- * ntfs_prepare_nonresident_write -
- *
- */
-static int ntfs_prepare_nonresident_write(struct page *page,
-		unsigned from, unsigned to)
-{
-	VCN vcn;
-	LCN lcn;
-	s64 initialized_size;
-	loff_t i_size;
-	sector_t block, ablock, iblock;
-	struct inode *vi;
-	ntfs_inode *ni;
-	ntfs_volume *vol;
-	runlist_element *rl;
-	struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
-	unsigned long flags;
-	unsigned int vcn_ofs, block_start, block_end, blocksize;
-	int err;
-	BOOL is_retry;
-	unsigned char blocksize_bits;
-
-	vi = page->mapping->host;
-	ni = NTFS_I(vi);
-	vol = ni->vol;
-
-	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
-			"0x%lx, from = %u, to = %u.", ni->mft_no, ni->type,
-			page->index, from, to);
-
-	BUG_ON(!NInoNonResident(ni));
-
-	blocksize_bits = vi->i_blkbits;
-	blocksize = 1 << blocksize_bits;
-
-	/*
-	 * create_empty_buffers() will create uptodate/dirty buffers if the
-	 * page is uptodate/dirty.
-	 */
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, blocksize, 0);
-	bh = head = page_buffers(page);
-	if (unlikely(!bh))
-		return -ENOMEM;
-
-	/* The first block in the page. */
-	block = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
-
-	read_lock_irqsave(&ni->size_lock, flags);
-	/*
-	 * The first out of bounds block for the allocated size. No need to
-	 * round up as allocated_size is in multiples of cluster size and the
-	 * minimum cluster size is 512 bytes, which is equal to the smallest
-	 * blocksize.
-	 */
-	ablock = ni->allocated_size >> blocksize_bits;
-	i_size = i_size_read(vi);
-	initialized_size = ni->initialized_size;
-	read_unlock_irqrestore(&ni->size_lock, flags);
-
-	/* The last (fully or partially) initialized block. */
-	iblock = initialized_size >> blocksize_bits;
-
-	/* Loop through all the buffers in the page. */
-	block_start = 0;
-	rl = NULL;
-	err = 0;
-	do {
-		block_end = block_start + blocksize;
-		/*
-		 * If buffer @bh is outside the write, just mark it uptodate
-		 * if the page is uptodate and continue with the next buffer.
-		 */
-		if (block_end <= from || block_start >= to) {
-			if (PageUptodate(page)) {
-				if (!buffer_uptodate(bh))
-					set_buffer_uptodate(bh);
-			}
-			continue;
-		}
-		/*
-		 * @bh is at least partially being written to.
-		 * Make sure it is not marked as new.
-		 */
-		//if (buffer_new(bh))
-		//	clear_buffer_new(bh);
-
-		if (block >= ablock) {
-			// TODO: block is above allocated_size, need to
-			// allocate it. Best done in one go to accommodate not
-			// only block but all above blocks up to and including:
-			// ((page->index << PAGE_CACHE_SHIFT) + to + blocksize
-			// - 1) >> blobksize_bits. Obviously will need to round
-			// up to next cluster boundary, too. This should be
-			// done with a helper function, so it can be reused.
-			ntfs_error(vol->sb, "Writing beyond allocated size "
-					"is not supported yet. Sorry.");
-			err = -EOPNOTSUPP;
-			goto err_out;
-			// Need to update ablock.
-			// Need to set_buffer_new() on all block bhs that are
-			// newly allocated.
-		}
-		/*
-		 * Now we have enough allocated size to fulfill the whole
-		 * request, i.e. block < ablock is true.
-		 */
-		if (unlikely((block >= iblock) &&
-				(initialized_size < i_size))) {
-			/*
-			 * If this page is fully outside initialized size, zero
-			 * out all pages between the current initialized size
-			 * and the current page. Just use ntfs_readpage() to do
-			 * the zeroing transparently.
-			 */
-			if (block > iblock) {
-				// TODO:
-				// For each page do:
-				// - read_cache_page()
-				// Again for each page do:
-				// - wait_on_page_locked()
-				// - Check (PageUptodate(page) &&
-				//		!PageError(page))
-				// Update initialized size in the attribute and
-				// in the inode.
-				// Again, for each page do:
-				//	__set_page_dirty_buffers();
-				// page_cache_release()
-				// We don't need to wait on the writes.
-				// Update iblock.
-			}
-			/*
-			 * The current page straddles initialized size. Zero
-			 * all non-uptodate buffers and set them uptodate (and
-			 * dirty?). Note, there aren't any non-uptodate buffers
-			 * if the page is uptodate.
-			 * FIXME: For an uptodate page, the buffers may need to
-			 * be written out because they were not initialized on
-			 * disk before.
-			 */
-			if (!PageUptodate(page)) {
-				// TODO:
-				// Zero any non-uptodate buffers up to i_size.
-				// Set them uptodate and dirty.
-			}
-			// TODO:
-			// Update initialized size in the attribute and in the
-			// inode (up to i_size).
-			// Update iblock.
-			// FIXME: This is inefficient. Try to batch the two
-			// size changes to happen in one go.
-			ntfs_error(vol->sb, "Writing beyond initialized size "
-					"is not supported yet. Sorry.");
-			err = -EOPNOTSUPP;
-			goto err_out;
-			// Do NOT set_buffer_new() BUT DO clear buffer range
-			// outside write request range.
-			// set_buffer_uptodate() on complete buffers as well as
-			// set_buffer_dirty().
-		}
-
-		/* Need to map unmapped buffers. */
-		if (!buffer_mapped(bh)) {
-			/* Unmapped buffer. Need to map it. */
-			bh->b_bdev = vol->sb->s_bdev;
-
-			/* Convert block into corresponding vcn and offset. */
-			vcn = (VCN)block << blocksize_bits >>
-					vol->cluster_size_bits;
-			vcn_ofs = ((VCN)block << blocksize_bits) &
-					vol->cluster_size_mask;
-
-			is_retry = FALSE;
-			if (!rl) {
-lock_retry_remap:
-				down_read(&ni->runlist.lock);
-				rl = ni->runlist.rl;
-			}
-			if (likely(rl != NULL)) {
-				/* Seek to element containing target vcn. */
-				while (rl->length && rl[1].vcn <= vcn)
-					rl++;
-				lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
-			} else
-				lcn = LCN_RL_NOT_MAPPED;
-			if (unlikely(lcn < 0)) {
-				/*
-				 * We extended the attribute allocation above.
-				 * If we hit an ENOENT here it means that the
-				 * allocation was insufficient which is a bug.
-				 */
-				BUG_ON(lcn == LCN_ENOENT);
-
-				/* It is a hole, need to instantiate it. */
-				if (lcn == LCN_HOLE) {
-					// TODO: Instantiate the hole.
-					// clear_buffer_new(bh);
-					// unmap_underlying_metadata(bh->b_bdev,
-					//		bh->b_blocknr);
-					// For non-uptodate buffers, need to
-					// zero out the region outside the
-					// request in this bh or all bhs,
-					// depending on what we implemented
-					// above.
-					// Need to flush_dcache_page().
-					// Or could use set_buffer_new()
-					// instead?
-					ntfs_error(vol->sb, "Writing into "
-							"sparse regions is "
-							"not supported yet. "
-							"Sorry.");
-					err = -EOPNOTSUPP;
-					if (!rl)
-						up_read(&ni->runlist.lock);
-					goto err_out;
-				} else if (!is_retry &&
-						lcn == LCN_RL_NOT_MAPPED) {
-					is_retry = TRUE;
-					/*
-					 * Attempt to map runlist, dropping
-					 * lock for the duration.
-					 */
-					up_read(&ni->runlist.lock);
-					err = ntfs_map_runlist(ni, vcn);
-					if (likely(!err))
-						goto lock_retry_remap;
-					rl = NULL;
-				} else if (!rl)
-					up_read(&ni->runlist.lock);
-				/*
-				 * Failed to map the buffer, even after
-				 * retrying.
-				 */
-				if (!err)
-					err = -EIO;
-				bh->b_blocknr = -1;
-				ntfs_error(vol->sb, "Failed to write to inode "
-						"0x%lx, attribute type 0x%x, "
-						"vcn 0x%llx, offset 0x%x "
-						"because its location on disk "
-						"could not be determined%s "
-						"(error code %i).",
-						ni->mft_no, ni->type,
-						(unsigned long long)vcn,
-						vcn_ofs, is_retry ? " even "
-						"after retrying" : "", err);
-				goto err_out;
-			}
-			/* We now have a successful remap, i.e. lcn >= 0. */
-
-			/* Setup buffer head to correct block. */
-			bh->b_blocknr = ((lcn << vol->cluster_size_bits)
-					+ vcn_ofs) >> blocksize_bits;
-			set_buffer_mapped(bh);
-
-			// FIXME: Something analogous to this is needed for
-			// each newly allocated block, i.e. BH_New.
-			// FIXME: Might need to take this out of the
-			// if (!buffer_mapped(bh)) {}, depending on how we
-			// implement things during the allocated_size and
-			// initialized_size extension code above.
-			if (buffer_new(bh)) {
-				clear_buffer_new(bh);
-				unmap_underlying_metadata(bh->b_bdev,
-						bh->b_blocknr);
-				if (PageUptodate(page)) {
-					set_buffer_uptodate(bh);
-					continue;
-				}
-				/*
-				 * Page is _not_ uptodate, zero surrounding
-				 * region. NOTE: This is how we decide if to
-				 * zero or not!
-				 */
-				if (block_end > to || block_start < from) {
-					void *kaddr;
-
-					kaddr = kmap_atomic(page, KM_USER0);
-					if (block_end > to)
-						memset(kaddr + to, 0,
-								block_end - to);
-					if (block_start < from)
-						memset(kaddr + block_start, 0,
-								from -
-								block_start);
-					flush_dcache_page(page);
-					kunmap_atomic(kaddr, KM_USER0);
-				}
-				continue;
-			}
-		}
-		/* @bh is mapped, set it uptodate if the page is uptodate. */
-		if (PageUptodate(page)) {
-			if (!buffer_uptodate(bh))
-				set_buffer_uptodate(bh);
-			continue;
-		}
-		/*
-		 * The page is not uptodate. The buffer is mapped. If it is not
-		 * uptodate, and it is only partially being written to, we need
-		 * to read the buffer in before the write, i.e. right now.
-		 */
-		if (!buffer_uptodate(bh) &&
-				(block_start < from || block_end > to)) {
-			ll_rw_block(READ, 1, &bh);
-			*wait_bh++ = bh;
-		}
-	} while (block++, block_start = block_end,
-			(bh = bh->b_this_page) != head);
-
-	/* Release the lock if we took it. */
-	if (rl) {
-		up_read(&ni->runlist.lock);
-		rl = NULL;
-	}
-
-	/* If we issued read requests, let them complete. */
-	while (wait_bh > wait) {
-		wait_on_buffer(*--wait_bh);
-		if (!buffer_uptodate(*wait_bh))
-			return -EIO;
-	}
-
-	ntfs_debug("Done.");
-	return 0;
-err_out:
-	/*
-	 * Zero out any newly allocated blocks to avoid exposing stale data.
-	 * If BH_New is set, we know that the block was newly allocated in the
-	 * above loop.
-	 * FIXME: What about initialized_size increments? Have we done all the
-	 * required zeroing above? If not this error handling is broken, and
-	 * in particular the if (block_end <= from) check is completely bogus.
-	 */
-	bh = head;
-	block_start = 0;
-	is_retry = FALSE;
-	do {
-		block_end = block_start + blocksize;
-		if (block_end <= from)
-			continue;
-		if (block_start >= to)
-			break;
-		if (buffer_new(bh)) {
-			void *kaddr;
-
-			clear_buffer_new(bh);
-			kaddr = kmap_atomic(page, KM_USER0);
-			memset(kaddr + block_start, 0, bh->b_size);
-			kunmap_atomic(kaddr, KM_USER0);
-			set_buffer_uptodate(bh);
-			mark_buffer_dirty(bh);
-			is_retry = TRUE;
-		}
-	} while (block_start = block_end, (bh = bh->b_this_page) != head);
-	if (is_retry)
-		flush_dcache_page(page);
-	if (rl)
-		up_read(&ni->runlist.lock);
-	return err;
-}
-
-/**
- * ntfs_prepare_write - prepare a page for receiving data
- *
- * This is called from generic_file_write() with i_sem held on the inode
- * (@page->mapping->host). The @page is locked but not kmap()ped. The source
- * data has not yet been copied into the @page.
- *
- * Need to extend the attribute/fill in holes if necessary, create blocks and
- * make partially overwritten blocks uptodate,
- *
- * i_size is not to be modified yet.
- *
- * Return 0 on success or -errno on error.
- *
- * Should be using block_prepare_write() [support for sparse files] or
- * cont_prepare_write() [no support for sparse files]. Cannot do that due to
- * ntfs specifics but can look at them for implementation guidance.
- *
- * Note: In the range, @from is inclusive and @to is exclusive, i.e. @from is
- * the first byte in the page that will be written to and @to is the first byte
- * after the last byte that will be written to.
- */
-static int ntfs_prepare_write(struct file *file, struct page *page,
-		unsigned from, unsigned to)
-{
-	s64 new_size;
-	loff_t i_size;
-	struct inode *vi = page->mapping->host;
-	ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi);
-	ntfs_volume *vol = ni->vol;
-	ntfs_attr_search_ctx *ctx = NULL;
-	MFT_RECORD *m = NULL;
-	ATTR_RECORD *a;
-	u8 *kaddr;
-	u32 attr_len;
-	int err;
-
-	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
-			"0x%lx, from = %u, to = %u.", vi->i_ino, ni->type,
-			page->index, from, to);
-	BUG_ON(!PageLocked(page));
-	BUG_ON(from > PAGE_CACHE_SIZE);
-	BUG_ON(to > PAGE_CACHE_SIZE);
-	BUG_ON(from > to);
-	BUG_ON(NInoMstProtected(ni));
-	/*
-	 * If a previous ntfs_truncate() failed, repeat it and abort if it
-	 * fails again.
-	 */
-	if (unlikely(NInoTruncateFailed(ni))) {
-		down_write(&vi->i_alloc_sem);
-		err = ntfs_truncate(vi);
-		up_write(&vi->i_alloc_sem);
-		if (err || NInoTruncateFailed(ni)) {
-			if (!err)
-				err = -EIO;
-			goto err_out;
-		}
-	}
-	/* If the attribute is not resident, deal with it elsewhere. */
-	if (NInoNonResident(ni)) {
-		/*
-		 * Only unnamed $DATA attributes can be compressed, encrypted,
-		 * and/or sparse.
-		 */
-		if (ni->type == AT_DATA && !ni->name_len) {
-			/* If file is encrypted, deny access, just like NT4. */
-			if (NInoEncrypted(ni)) {
-				ntfs_debug("Denying write access to encrypted "
-						"file.");
-				return -EACCES;
-			}
-			/* Compressed data streams are handled in compress.c. */
-			if (NInoCompressed(ni)) {
-				// TODO: Implement and replace this check with
-				// return ntfs_write_compressed_block(page);
-				ntfs_error(vi->i_sb, "Writing to compressed "
-						"files is not supported yet. "
-						"Sorry.");
-				return -EOPNOTSUPP;
-			}
-			// TODO: Implement and remove this check.
-			if (NInoSparse(ni)) {
-				ntfs_error(vi->i_sb, "Writing to sparse files "
-						"is not supported yet. Sorry.");
-				return -EOPNOTSUPP;
-			}
-		}
-		/* Normal data stream. */
-		return ntfs_prepare_nonresident_write(page, from, to);
-	}
-	/*
-	 * Attribute is resident, implying it is not compressed, encrypted, or
-	 * sparse.
-	 */
-	BUG_ON(page_has_buffers(page));
-	new_size = ((s64)page->index << PAGE_CACHE_SHIFT) + to;
-	/* If we do not need to resize the attribute allocation we are done. */
-	if (new_size <= i_size_read(vi))
-		goto done;
-	/* Map, pin, and lock the (base) mft record. */
-	if (!NInoAttr(ni))
-		base_ni = ni;
-	else
-		base_ni = ni->ext.base_ntfs_ino;
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		m = NULL;
-		ctx = NULL;
-		goto err_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT)
-			err = -EIO;
-		goto err_out;
-	}
-	m = ctx->mrec;
-	a = ctx->attr;
-	/* The total length of the attribute value. */
-	attr_len = le32_to_cpu(a->data.resident.value_length);
-	/* Fix an eventual previous failure of ntfs_commit_write(). */
-	i_size = i_size_read(vi);
-	if (unlikely(attr_len > i_size)) {
-		attr_len = i_size;
-		a->data.resident.value_length = cpu_to_le32(attr_len);
-	}
-	/* If we do not need to resize the attribute allocation we are done. */
-	if (new_size <= attr_len)
-		goto done_unm;
-	/* Check if new size is allowed in $AttrDef. */
-	err = ntfs_attr_size_bounds_check(vol, ni->type, new_size);
-	if (unlikely(err)) {
-		if (err == -ERANGE) {
-			ntfs_error(vol->sb, "Write would cause the inode "
-					"0x%lx to exceed the maximum size for "
-					"its attribute type (0x%x). Aborting "
-					"write.", vi->i_ino,
-					le32_to_cpu(ni->type));
-		} else {
-			ntfs_error(vol->sb, "Inode 0x%lx has unknown "
-					"attribute type 0x%x. Aborting "
-					"write.", vi->i_ino,
-					le32_to_cpu(ni->type));
-			err = -EIO;
-		}
-		goto err_out2;
-	}
-	/*
-	 * Extend the attribute record to be able to store the new attribute
-	 * size.
-	 */
-	if (new_size >= vol->mft_record_size || ntfs_attr_record_resize(m, a,
-			le16_to_cpu(a->data.resident.value_offset) +
-			new_size)) {
-		/* Not enough space in the mft record. */
-		ntfs_error(vol->sb, "Not enough space in the mft record for "
-				"the resized attribute value. This is not "
-				"supported yet. Aborting write.");
-		err = -EOPNOTSUPP;
-		goto err_out2;
-	}
-	/*
-	 * We have enough space in the mft record to fit the write. This
-	 * implies the attribute is smaller than the mft record and hence the
-	 * attribute must be in a single page and hence page->index must be 0.
-	 */
-	BUG_ON(page->index);
-	/*
-	 * If the beginning of the write is past the old size, enlarge the
-	 * attribute value up to the beginning of the write and fill it with
-	 * zeroes.
-	 */
-	if (from > attr_len) {
-		memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) +
-				attr_len, 0, from - attr_len);
-		a->data.resident.value_length = cpu_to_le32(from);
-		/* Zero the corresponding area in the page as well. */
-		if (PageUptodate(page)) {
-			kaddr = kmap_atomic(page, KM_USER0);
-			memset(kaddr + attr_len, 0, from - attr_len);
-			kunmap_atomic(kaddr, KM_USER0);
-			flush_dcache_page(page);
-		}
-	}
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-done_unm:
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
-	/*
-	 * Because resident attributes are handled by memcpy() to/from the
-	 * corresponding MFT record, and because this form of i/o is byte
-	 * aligned rather than block aligned, there is no need to bring the
-	 * page uptodate here as in the non-resident case where we need to
-	 * bring the buffers straddled by the write uptodate before
-	 * generic_file_write() does the copying from userspace.
-	 *
-	 * We thus defer the uptodate bringing of the page region outside the
-	 * region written to to ntfs_commit_write(), which makes the code
-	 * simpler and saves one atomic kmap which is good.
-	 */
-done:
-	ntfs_debug("Done.");
-	return 0;
-err_out:
-	if (err == -ENOMEM)
-		ntfs_warning(vi->i_sb, "Error allocating memory required to "
-				"prepare the write.");
-	else {
-		ntfs_error(vi->i_sb, "Resident attribute prepare write failed "
-				"with error %i.", err);
-		NVolSetErrors(vol);
-		make_bad_inode(vi);
-	}
-err_out2:
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(base_ni);
-	return err;
-}
-
-/**
- * ntfs_commit_nonresident_write -
- *
- */
-static int ntfs_commit_nonresident_write(struct page *page,
-		unsigned from, unsigned to)
-{
-	s64 pos = ((s64)page->index << PAGE_CACHE_SHIFT) + to;
-	struct inode *vi = page->mapping->host;
-	struct buffer_head *bh, *head;
-	unsigned int block_start, block_end, blocksize;
-	BOOL partial;
-
-	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
-			"0x%lx, from = %u, to = %u.", vi->i_ino,
-			NTFS_I(vi)->type, page->index, from, to);
-	blocksize = 1 << vi->i_blkbits;
-
-	// FIXME: We need a whole slew of special cases in here for compressed
-	// files for example...
-	// For now, we know ntfs_prepare_write() would have failed so we can't
-	// get here in any of the cases which we have to special case, so we
-	// are just a ripped off, unrolled generic_commit_write().
-
-	bh = head = page_buffers(page);
-	block_start = 0;
-	partial = FALSE;
-	do {
-		block_end = block_start + blocksize;
-		if (block_end <= from || block_start >= to) {
-			if (!buffer_uptodate(bh))
-				partial = TRUE;
-		} else {
-			set_buffer_uptodate(bh);
-			mark_buffer_dirty(bh);
-		}
-	} while (block_start = block_end, (bh = bh->b_this_page) != head);
-	/*
-	 * If this is a partial write which happened to make all buffers
-	 * uptodate then we can optimize away a bogus ->readpage() for the next
-	 * read(). Here we 'discover' whether the page went uptodate as a
-	 * result of this (potentially partial) write.
-	 */
-	if (!partial)
-		SetPageUptodate(page);
-	/*
-	 * Not convinced about this at all. See disparity comment above. For
-	 * now we know ntfs_prepare_write() would have failed in the write
-	 * exceeds i_size case, so this will never trigger which is fine.
-	 */
-	if (pos > i_size_read(vi)) {
-		ntfs_error(vi->i_sb, "Writing beyond the existing file size is "
-				"not supported yet. Sorry.");
-		return -EOPNOTSUPP;
-		// vi->i_size = pos;
-		// mark_inode_dirty(vi);
-	}
-	ntfs_debug("Done.");
-	return 0;
-}
-
-/**
- * ntfs_commit_write - commit the received data
- *
- * This is called from generic_file_write() with i_sem held on the inode
- * (@page->mapping->host). The @page is locked but not kmap()ped. The source
- * data has already been copied into the @page. ntfs_prepare_write() has been
- * called before the data copied and it returned success so we can take the
- * results of various BUG checks and some error handling for granted.
- *
- * Need to mark modified blocks dirty so they get written out later when
- * ntfs_writepage() is invoked by the VM.
- *
- * Return 0 on success or -errno on error.
- *
- * Should be using generic_commit_write(). This marks buffers uptodate and
- * dirty, sets the page uptodate if all buffers in the page are uptodate, and
- * updates i_size if the end of io is beyond i_size. In that case, it also
- * marks the inode dirty.
- *
- * Cannot use generic_commit_write() due to ntfs specialities but can look at
- * it for implementation guidance.
- *
- * If things have gone as outlined in ntfs_prepare_write(), then we do not
- * need to do any page content modifications here at all, except in the write
- * to resident attribute case, where we need to do the uptodate bringing here
- * which we combine with the copying into the mft record which means we save
- * one atomic kmap.
- */
-static int ntfs_commit_write(struct file *file, struct page *page,
-		unsigned from, unsigned to)
-{
-	struct inode *vi = page->mapping->host;
-	ntfs_inode *base_ni, *ni = NTFS_I(vi);
-	char *kaddr, *kattr;
-	ntfs_attr_search_ctx *ctx;
-	MFT_RECORD *m;
-	ATTR_RECORD *a;
-	u32 attr_len;
-	int err;
-
-	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
-			"0x%lx, from = %u, to = %u.", vi->i_ino, ni->type,
-			page->index, from, to);
-	/* If the attribute is not resident, deal with it elsewhere. */
-	if (NInoNonResident(ni)) {
-		/* Only unnamed $DATA attributes can be compressed/encrypted. */
-		if (ni->type == AT_DATA && !ni->name_len) {
-			/* Encrypted files need separate handling. */
-			if (NInoEncrypted(ni)) {
-				// We never get here at present!
-				BUG();
-			}
-			/* Compressed data streams are handled in compress.c. */
-			if (NInoCompressed(ni)) {
-				// TODO: Implement this!
-				// return ntfs_write_compressed_block(page);
-				// We never get here at present!
-				BUG();
-			}
-		}
-		/* Normal data stream. */
-		return ntfs_commit_nonresident_write(page, from, to);
-	}
-	/*
-	 * Attribute is resident, implying it is not compressed, encrypted, or
-	 * sparse.
-	 */
-	if (!NInoAttr(ni))
-		base_ni = ni;
-	else
-		base_ni = ni->ext.base_ntfs_ino;
-	/* Map, pin, and lock the mft record. */
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		m = NULL;
-		ctx = NULL;
-		goto err_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT)
-			err = -EIO;
-		goto err_out;
-	}
-	a = ctx->attr;
-	/* The total length of the attribute value. */
-	attr_len = le32_to_cpu(a->data.resident.value_length);
-	BUG_ON(from > attr_len);
-	kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
-	kaddr = kmap_atomic(page, KM_USER0);
-	/* Copy the received data from the page to the mft record. */
-	memcpy(kattr + from, kaddr + from, to - from);
-	/* Update the attribute length if necessary. */
-	if (to > attr_len) {
-		attr_len = to;
-		a->data.resident.value_length = cpu_to_le32(attr_len);
-	}
-	/*
-	 * If the page is not uptodate, bring the out of bounds area(s)
-	 * uptodate by copying data from the mft record to the page.
-	 */
-	if (!PageUptodate(page)) {
-		if (from > 0)
-			memcpy(kaddr, kattr, from);
-		if (to < attr_len)
-			memcpy(kaddr + to, kattr + to, attr_len - to);
-		/* Zero the region outside the end of the attribute value. */
-		if (attr_len < PAGE_CACHE_SIZE)
-			memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
-		/*
-		 * The probability of not having done any of the above is
-		 * extremely small, so we just flush unconditionally.
-		 */
-		flush_dcache_page(page);
-		SetPageUptodate(page);
-	}
-	kunmap_atomic(kaddr, KM_USER0);
-	/* Update i_size if necessary. */
-	if (i_size_read(vi) < attr_len) {
-		unsigned long flags;
-
-		write_lock_irqsave(&ni->size_lock, flags);
-		ni->allocated_size = ni->initialized_size = attr_len;
-		i_size_write(vi, attr_len);
-		write_unlock_irqrestore(&ni->size_lock, flags);
-	}
-	/* Mark the mft record dirty, so it gets written back. */
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
-	ntfs_debug("Done.");
-	return 0;
-err_out:
-	if (err == -ENOMEM) {
-		ntfs_warning(vi->i_sb, "Error allocating memory required to "
-				"commit the write.");
-		if (PageUptodate(page)) {
-			ntfs_warning(vi->i_sb, "Page is uptodate, setting "
-					"dirty so the write will be retried "
-					"later on by the VM.");
-			/*
-			 * Put the page on mapping->dirty_pages, but leave its
-			 * buffers' dirty state as-is.
-			 */
-			__set_page_dirty_nobuffers(page);
-			err = 0;
-		} else
-			ntfs_error(vi->i_sb, "Page is not uptodate. Written "
-					"data has been lost.");
-	} else {
-		ntfs_error(vi->i_sb, "Resident attribute commit write failed "
-				"with error %i.", err);
-		NVolSetErrors(ni->vol);
-		make_bad_inode(vi);
-	}
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(base_ni);
-	return err;
-}
-
 #endif /* NTFS_RW */
 
 /**
@@ -2377,9 +1552,6 @@ struct address_space_operations ntfs_aops = {
 						   disk request queue. */
 #ifdef NTFS_RW
 	.writepage	= ntfs_writepage,	/* Write dirty page to disk. */
-	.prepare_write	= ntfs_prepare_write,	/* Prepare page and buffers
-						   ready to receive data. */
-	.commit_write	= ntfs_commit_write,	/* Commit received data. */
 #endif /* NTFS_RW */
 };
 
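With ->prepare_write and ->commit_write removed from ntfs_aops, the write path for regular files is driven by the new file operations in fs/ntfs/file.c instead. Schematically, and going only by the ChangeLog above (file.c itself is not shown in this excerpt), the wiring becomes something like:

	struct file_operations ntfs_file_ops = {
		.llseek		= generic_file_llseek,
		.read		= generic_file_read,
		.write		= ntfs_file_write,	/* Was generic_file_write(). */
		.aio_write	= ntfs_file_aio_write,
		.writev		= ntfs_file_writev,
		.mmap		= generic_file_mmap,
		.open		= ntfs_file_open,
		/* ... */
	};
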
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 3f9a4ff42ee5..eda056bac256 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -21,7 +21,9 @@
  */
 
 #include <linux/buffer_head.h>
+#include <linux/sched.h>
 #include <linux/swap.h>
+#include <linux/writeback.h>
 
 #include "attrib.h"
 #include "debug.h"
@@ -36,9 +38,27 @@
  * ntfs_map_runlist_nolock - map (a part of) a runlist of an ntfs inode
  * @ni:		ntfs inode for which to map (part of) a runlist
  * @vcn:	map runlist part containing this vcn
+ * @ctx:	active attribute search context if present or NULL if not
  *
  * Map the part of a runlist containing the @vcn of the ntfs inode @ni.
  *
+ * If @ctx is specified, it is an active search context of @ni and its base mft
+ * record. This is needed when ntfs_map_runlist_nolock() encounters unmapped
+ * runlist fragments and allows their mapping. If you do not have the mft
+ * record mapped, you can specify @ctx as NULL and ntfs_map_runlist_nolock()
+ * will perform the necessary mapping and unmapping.
+ *
+ * Note, ntfs_map_runlist_nolock() saves the state of @ctx on entry and
+ * restores it before returning. Thus, @ctx will be left pointing to the same
+ * attribute on return as on entry. However, the actual pointers in @ctx may
+ * point to different memory locations on return, so you must remember to reset
+ * any cached pointers from the @ctx, i.e. after the call to
+ * ntfs_map_runlist_nolock(), you will probably want to do:
+ *	m = ctx->mrec;
+ *	a = ctx->attr;
+ * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
+ * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
+ *
  * Return 0 on success and -errno on error. There is one special error code
  * which is not an error as such. This is -ENOENT. It means that @vcn is out
  * of bounds of the runlist.
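
To make the caller contract above concrete, a hypothetical caller (names invented for illustration) that passes in its own active search context would follow this shape:

	/* Sketch of a ctx-carrying caller: map the runlist fragment
	 * containing @vcn while keeping our mft record mapped, then
	 * refresh the pointers cached from @ctx as required above. */
	static int example_map_fragment(ntfs_inode *ni, VCN vcn,
			ntfs_attr_search_ctx *ctx)
	{
		MFT_RECORD *m;
		ATTR_RECORD *a;
		int err;

		err = ntfs_map_runlist_nolock(ni, vcn, ctx);
		if (IS_ERR(ctx->mrec))	/* @ctx is no longer valid; the */
			return PTR_ERR(ctx->mrec); /* caller must reinit/put it. */
		if (err)
			return err;
		/* Same attribute, possibly different memory: re-cache. */
		m = ctx->mrec;
		a = ctx->attr;
		BUG_ON(!m || !a->non_resident);
		return 0;
	}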
@@ -46,19 +66,32 @@
  * Note the runlist can be NULL after this function returns if @vcn is zero and
  * the attribute has zero allocated size, i.e. there simply is no runlist.
  *
- * Locking: - The runlist must be locked for writing.
- *	    - This function modifies the runlist.
+ * WARNING: If @ctx is supplied, regardless of whether success or failure is
+ *	    returned, you need to check IS_ERR(@ctx->mrec) and if TRUE the @ctx
+ *	    is no longer valid, i.e. you need to either call
+ *	    ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
+ *	    In that case PTR_ERR(@ctx->mrec) will give you the error code for
+ *	    why the mapping of the old inode failed.
+ *
+ * Locking: - The runlist described by @ni must be locked for writing on entry
+ *	      and is locked on return. Note the runlist will be modified.
+ *	    - If @ctx is NULL, the base mft record of @ni must not be mapped on
+ *	      entry and it will be left unmapped on return.
+ *	    - If @ctx is not NULL, the base mft record must be mapped on entry
+ *	      and it will be left mapped on return.
  */
-int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn)
+int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx)
 {
 	VCN end_vcn;
+	unsigned long flags;
 	ntfs_inode *base_ni;
 	MFT_RECORD *m;
 	ATTR_RECORD *a;
-	ntfs_attr_search_ctx *ctx;
 	runlist_element *rl;
-	unsigned long flags;
+	struct page *put_this_page = NULL;
 	int err = 0;
+	BOOL ctx_is_temporary, ctx_needs_reset;
+	ntfs_attr_search_ctx old_ctx = { NULL, };
 
 	ntfs_debug("Mapping runlist part containing vcn 0x%llx.",
 			(unsigned long long)vcn);
@@ -66,20 +99,77 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn)
 		base_ni = ni;
 	else
 		base_ni = ni->ext.base_ntfs_ino;
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m))
-		return PTR_ERR(m);
-	ctx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto err_out;
+	if (!ctx) {
+		ctx_is_temporary = ctx_needs_reset = TRUE;
+		m = map_mft_record(base_ni);
+		if (IS_ERR(m))
+			return PTR_ERR(m);
+		ctx = ntfs_attr_get_search_ctx(base_ni, m);
+		if (unlikely(!ctx)) {
+			err = -ENOMEM;
+			goto err_out;
+		}
+	} else {
+		VCN allocated_size_vcn;
+
+		BUG_ON(IS_ERR(ctx->mrec));
+		a = ctx->attr;
+		BUG_ON(!a->non_resident);
+		ctx_is_temporary = FALSE;
+		end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
+		read_lock_irqsave(&ni->size_lock, flags);
+		allocated_size_vcn = ni->allocated_size >>
+				ni->vol->cluster_size_bits;
+		read_unlock_irqrestore(&ni->size_lock, flags);
+		if (!a->data.non_resident.lowest_vcn && end_vcn <= 0)
+			end_vcn = allocated_size_vcn - 1;
+		/*
+		 * If we already have the attribute extent containing @vcn in
+		 * @ctx, no need to look it up again. We slightly cheat in
+		 * that if vcn exceeds the allocated size, we will refuse to
+		 * map the runlist below, so there is definitely no need to get
+		 * the right attribute extent.
+		 */
+		if (vcn >= allocated_size_vcn || (a->type == ni->type &&
+				a->name_length == ni->name_len &&
+				!memcmp((u8*)a + le16_to_cpu(a->name_offset),
+				ni->name, ni->name_len) &&
+				sle64_to_cpu(a->data.non_resident.lowest_vcn)
+				<= vcn && end_vcn >= vcn))
+			ctx_needs_reset = FALSE;
+		else {
+			/* Save the old search context. */
+			old_ctx = *ctx;
+			/*
+			 * If the currently mapped (extent) inode is not the
+			 * base inode we will unmap it when we reinitialize the
+			 * search context which means we need to get a
+			 * reference to the page containing the mapped mft
+			 * record so we do not accidentally drop changes to the
+			 * mft record when it has not been marked dirty yet.
+			 */
+			if (old_ctx.base_ntfs_ino && old_ctx.ntfs_ino !=
+					old_ctx.base_ntfs_ino) {
+				put_this_page = old_ctx.ntfs_ino->page;
+				page_cache_get(put_this_page);
+			}
+			/*
+			 * Reinitialize the search context so we can lookup the
+			 * needed attribute extent.
+			 */
+			ntfs_attr_reinit_search_ctx(ctx);
+			ctx_needs_reset = TRUE;
+		}
 	}
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, vcn, NULL, 0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT)
-			err = -EIO;
-		goto err_out;
+	if (ctx_needs_reset) {
+		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+				CASE_SENSITIVE, vcn, NULL, 0, ctx);
+		if (unlikely(err)) {
+			if (err == -ENOENT)
+				err = -EIO;
+			goto err_out;
+		}
+		BUG_ON(!ctx->attr->non_resident);
 	}
 	a = ctx->attr;
 	/*
@@ -89,11 +179,9 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn)
  * ntfs_mapping_pairs_decompress() fails.
  */
 	end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn) + 1;
-	if (unlikely(!a->data.non_resident.lowest_vcn && end_vcn <= 1)) {
-		read_lock_irqsave(&ni->size_lock, flags);
-		end_vcn = ni->allocated_size >> ni->vol->cluster_size_bits;
-		read_unlock_irqrestore(&ni->size_lock, flags);
-	}
+	if (!a->data.non_resident.lowest_vcn && end_vcn == 1)
+		end_vcn = sle64_to_cpu(a->data.non_resident.allocated_size) >>
+				ni->vol->cluster_size_bits;
 	if (unlikely(vcn >= end_vcn)) {
 		err = -ENOENT;
 		goto err_out;
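
For reference, the ambiguity this rewritten branch deals with: in the first attribute extent lowest_vcn is zero and highest_vcn may legitimately still be zero, so highest_vcn + 1 == 1 cannot distinguish a one-cluster attribute from an unset field, and the true end is therefore derived from the allocated size (for a genuine one-cluster attribute both computations agree). Worked numbers, made up for illustration:

	/* A 64 KiB non-resident attribute on a volume with 4 KiB clusters:
	 * allocated_size = 65536, cluster_size_bits = 12, hence
	 * end_vcn = 65536 >> 12 = 16, the first out-of-bounds VCN, and the
	 * valid VCNs are 0..15. (The earlier hunk computes the last valid
	 * VCN instead: allocated_size_vcn - 1 = 15.) */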
@@ -104,9 +192,93 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn)
 	else
 		ni->runlist.rl = rl;
 err_out:
-	if (likely(ctx))
-		ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
+	if (ctx_is_temporary) {
+		if (likely(ctx))
+			ntfs_attr_put_search_ctx(ctx);
+		unmap_mft_record(base_ni);
+	} else if (ctx_needs_reset) {
+		/*
+		 * If there is no attribute list, restoring the search context
+		 * is accomplished simply by copying the saved context back over
+		 * the caller supplied context. If there is an attribute list,
+		 * things are more complicated as we need to deal with mapping
+		 * of mft records and resulting potential changes in pointers.
+		 */
+		if (NInoAttrList(base_ni)) {
+			/*
+			 * If the currently mapped (extent) inode is not the
+			 * one we had before, we need to unmap it and map the
+			 * old one.
+			 */
+			if (ctx->ntfs_ino != old_ctx.ntfs_ino) {
+				/*
+				 * If the currently mapped inode is not the
+				 * base inode, unmap it.
+				 */
+				if (ctx->base_ntfs_ino && ctx->ntfs_ino !=
+						ctx->base_ntfs_ino) {
+					unmap_extent_mft_record(ctx->ntfs_ino);
+					ctx->mrec = ctx->base_mrec;
+					BUG_ON(!ctx->mrec);
+				}
+				/*
+				 * If the old mapped inode is not the base
+				 * inode, map it.
+				 */
+				if (old_ctx.base_ntfs_ino &&
+						old_ctx.ntfs_ino !=
+						old_ctx.base_ntfs_ino) {
+retry_map:
+					ctx->mrec = map_mft_record(
+							old_ctx.ntfs_ino);
+					/*
+					 * Something bad has happened. If out
+					 * of memory retry till it succeeds.
+					 * Any other errors are fatal and we
+					 * return the error code in ctx->mrec.
+					 * Let the caller deal with it... We
+					 * just need to fudge things so the
+					 * caller can reinit and/or put the
+					 * search context safely.
+					 */
+					if (IS_ERR(ctx->mrec)) {
+						if (PTR_ERR(ctx->mrec) ==
+								-ENOMEM) {
+							schedule();
+							goto retry_map;
+						} else
+							old_ctx.ntfs_ino =
+								old_ctx.
+								base_ntfs_ino;
+					}
+				}
+			}
+			/* Update the changed pointers in the saved context. */
+			if (ctx->mrec != old_ctx.mrec) {
+				if (!IS_ERR(ctx->mrec))
+					old_ctx.attr = (ATTR_RECORD*)(
+							(u8*)ctx->mrec +
+							((u8*)old_ctx.attr -
+							(u8*)old_ctx.mrec));
+				old_ctx.mrec = ctx->mrec;
+			}
+		}
+		/* Restore the search context to the saved one. */
+		*ctx = old_ctx;
+		/*
+		 * We drop the reference on the page we took earlier. In the
+		 * case that IS_ERR(ctx->mrec) is true this means we might lose
+		 * some changes to the mft record that had been made between
+		 * the last time it was marked dirty/written out and now. This
+		 * at this stage is not a problem as the mapping error is fatal
+		 * enough that the mft record cannot be written out anyway and
+		 * the caller is very likely to shutdown the whole inode
+		 * immediately and mark the volume dirty for chkdsk to pick up
+		 * the pieces anyway.
+		 */
+		if (put_this_page)
+			page_cache_release(put_this_page);
+	}
 	return err;
 }
 
@@ -122,8 +294,8 @@ err_out:
  * of bounds of the runlist.
  *
  * Locking: - The runlist must be unlocked on entry and is unlocked on return.
- *	    - This function takes the runlist lock for writing and modifies the
- *	      runlist.
+ *	    - This function takes the runlist lock for writing and may modify
+ *	      the runlist.
  */
 int ntfs_map_runlist(ntfs_inode *ni, VCN vcn)
 {
@@ -133,7 +305,7 @@ int ntfs_map_runlist(ntfs_inode *ni, VCN vcn)
 	/* Make sure someone else didn't do the work while we were sleeping. */
 	if (likely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) <=
 			LCN_RL_NOT_MAPPED))
-		err = ntfs_map_runlist_nolock(ni, vcn);
+		err = ntfs_map_runlist_nolock(ni, vcn, NULL);
 	up_write(&ni->runlist.lock);
 	return err;
 }
@@ -212,7 +384,7 @@ retry_remap:
 			goto retry_remap;
 		}
 	}
-	err = ntfs_map_runlist_nolock(ni, vcn);
+	err = ntfs_map_runlist_nolock(ni, vcn, NULL);
 	if (!write_locked) {
 		up_write(&ni->runlist.lock);
 		down_read(&ni->runlist.lock);
@@ -236,9 +408,9 @@ retry_remap:
 
 /**
  * ntfs_attr_find_vcn_nolock - find a vcn in the runlist of an ntfs inode
- * @ni:			ntfs inode describing the runlist to search
- * @vcn:		vcn to find
- * @write_locked:	true if the runlist is locked for writing
+ * @ni:		ntfs inode describing the runlist to search
+ * @vcn:	vcn to find
+ * @ctx:	active attribute search context if present or NULL if not
  *
  * Find the virtual cluster number @vcn in the runlist described by the ntfs
  * inode @ni and return the address of the runlist element containing the @vcn.
@@ -246,9 +418,22 @@ retry_remap:
  * If the @vcn is not mapped yet, the attempt is made to map the attribute
  * extent containing the @vcn and the vcn to lcn conversion is retried.
  *
- * If @write_locked is true the caller has locked the runlist for writing and
- * if false for reading.
- *
+ * If @ctx is specified, it is an active search context of @ni and its base mft
+ * record. This is needed when ntfs_attr_find_vcn_nolock() encounters unmapped
+ * runlist fragments and allows their mapping. If you do not have the mft
+ * record mapped, you can specify @ctx as NULL and ntfs_attr_find_vcn_nolock()
+ * will perform the necessary mapping and unmapping.
+ *
+ * Note, ntfs_attr_find_vcn_nolock() saves the state of @ctx on entry and
+ * restores it before returning. Thus, @ctx will be left pointing to the same
+ * attribute on return as on entry. However, the actual pointers in @ctx may
+ * point to different memory locations on return, so you must remember to reset
+ * any cached pointers from the @ctx, i.e. after the call to
+ * ntfs_attr_find_vcn_nolock(), you will probably want to do:
+ *	m = ctx->mrec;
+ *	a = ctx->attr;
+ * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
+ * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
 * Note you need to distinguish between the lcn of the returned runlist element
 * being >= 0 and LCN_HOLE. In the later case you have to return zeroes on
 * read and allocate clusters on write.
@@ -263,22 +448,31 @@ retry_remap:
263 * -ENOMEM - Not enough memory to map runlist. 448 * -ENOMEM - Not enough memory to map runlist.
264 * -EIO - Critical error (runlist/file is corrupt, i/o error, etc). 449 * -EIO - Critical error (runlist/file is corrupt, i/o error, etc).
265 * 450 *
266 * Locking: - The runlist must be locked on entry and is left locked on return. 451 * WARNING: If @ctx is supplied, regardless of whether success or failure is
267 * - If @write_locked is FALSE, i.e. the runlist is locked for reading, 452 * returned, you need to check IS_ERR(@ctx->mrec) and if TRUE the @ctx
268 * the lock may be dropped inside the function so you cannot rely on 453 * is no longer valid, i.e. you need to either call
269 * the runlist still being the same when this function returns. 454 * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
455 * In that case PTR_ERR(@ctx->mrec) will give you the error code for
456 * why the mapping of the old inode failed.
457 *
458 * Locking: - The runlist described by @ni must be locked for writing on entry
459 * and is locked on return. Note the runlist may be modified when
460 * needed runlist fragments need to be mapped.
461 * - If @ctx is NULL, the base mft record of @ni must not be mapped on
462 * entry and it will be left unmapped on return.
463 * - If @ctx is not NULL, the base mft record must be mapped on entry
464 * and it will be left mapped on return.
270 */ 465 */
271runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn, 466runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn,
272 const BOOL write_locked) 467 ntfs_attr_search_ctx *ctx)
273{ 468{
274 unsigned long flags; 469 unsigned long flags;
275 runlist_element *rl; 470 runlist_element *rl;
276 int err = 0; 471 int err = 0;
277 BOOL is_retry = FALSE; 472 BOOL is_retry = FALSE;
278 473
279 ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.", 474 ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, with%s ctx.",
280 ni->mft_no, (unsigned long long)vcn, 475 ni->mft_no, (unsigned long long)vcn, ctx ? "" : "out");
281 write_locked ? "write" : "read");
282 BUG_ON(!ni); 476 BUG_ON(!ni);
283 BUG_ON(!NInoNonResident(ni)); 477 BUG_ON(!NInoNonResident(ni));
284 BUG_ON(vcn < 0); 478 BUG_ON(vcn < 0);
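The contract above implies a caller pattern roughly as follows; this is an illustrative sketch only (the variables err, m, a and a previously set up ctx are assumed, they are not part of the patch):

	down_write(&ni->runlist.lock);
	rl = ntfs_attr_find_vcn_nolock(ni, vcn, ctx);
	/* Success or failure, a supplied @ctx may have been invalidated. */
	if (IS_ERR(ctx->mrec)) {
		err = PTR_ERR(ctx->mrec);
		ntfs_attr_reinit_search_ctx(ctx);
	} else {
		/* The mft record may have moved; refresh cached pointers. */
		m = ctx->mrec;
		a = ctx->attr;
	}
	if (IS_ERR(rl))
		err = PTR_ERR(rl);
	up_write(&ni->runlist.lock);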
@@ -312,33 +506,22 @@ retry_remap:
312 } 506 }
313 if (!err && !is_retry) { 507 if (!err && !is_retry) {
314 /* 508 /*
315 * The @vcn is in an unmapped region, map the runlist and 509 * If the search context is invalid we cannot map the unmapped
316 * retry. 510 * region.
317 */ 511 */
318 if (!write_locked) { 512 if (IS_ERR(ctx->mrec))
319 up_read(&ni->runlist.lock); 513 err = PTR_ERR(ctx->mrec);
320 down_write(&ni->runlist.lock); 514 else {
321 if (unlikely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) != 515 /*
322 LCN_RL_NOT_MAPPED)) { 516 * The @vcn is in an unmapped region, map the runlist
323 up_write(&ni->runlist.lock); 517 * and retry.
324 down_read(&ni->runlist.lock); 518 */
519 err = ntfs_map_runlist_nolock(ni, vcn, ctx);
520 if (likely(!err)) {
521 is_retry = TRUE;
325 goto retry_remap; 522 goto retry_remap;
326 } 523 }
327 } 524 }
328 err = ntfs_map_runlist_nolock(ni, vcn);
329 if (!write_locked) {
330 up_write(&ni->runlist.lock);
331 down_read(&ni->runlist.lock);
332 }
333 if (likely(!err)) {
334 is_retry = TRUE;
335 goto retry_remap;
336 }
337 /*
338 * -EINVAL coming from a failed mapping attempt is equivalent
339 * to i/o error for us as it should not happen in our code
340 * paths.
341 */
342 if (err == -EINVAL) 525 if (err == -EINVAL)
343 err = -EIO; 526 err = -EIO;
344 } else if (!err) 527 } else if (!err)
@@ -1011,6 +1194,7 @@ int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
1011 ntfs_inode *base_ni; 1194 ntfs_inode *base_ni;
1012 1195
1013 ntfs_debug("Entering."); 1196 ntfs_debug("Entering.");
1197 BUG_ON(IS_ERR(ctx->mrec));
1014 if (ctx->base_ntfs_ino) 1198 if (ctx->base_ntfs_ino)
1015 base_ni = ctx->base_ntfs_ino; 1199 base_ni = ctx->base_ntfs_ino;
1016 else 1200 else
@@ -1227,7 +1411,7 @@ int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, const ATTR_TYPE type)
1227 */ 1411 */
1228int ntfs_attr_can_be_resident(const ntfs_volume *vol, const ATTR_TYPE type) 1412int ntfs_attr_can_be_resident(const ntfs_volume *vol, const ATTR_TYPE type)
1229{ 1413{
1230 if (type == AT_INDEX_ALLOCATION || type == AT_EA) 1414 if (type == AT_INDEX_ALLOCATION)
1231 return -EPERM; 1415 return -EPERM;
1232 return 0; 1416 return 0;
1233} 1417}
@@ -1319,10 +1503,17 @@ int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
1319/** 1503/**
1320 * ntfs_attr_make_non_resident - convert a resident to a non-resident attribute 1504 * ntfs_attr_make_non_resident - convert a resident to a non-resident attribute
1321 * @ni: ntfs inode describing the attribute to convert 1505 * @ni: ntfs inode describing the attribute to convert
1506 * @data_size: size of the resident data to copy to the non-resident attribute
1322 * 1507 *
1323 * Convert the resident ntfs attribute described by the ntfs inode @ni to a 1508 * Convert the resident ntfs attribute described by the ntfs inode @ni to a
1324 * non-resident one. 1509 * non-resident one.
1325 * 1510 *
1511 * @data_size must be equal to the attribute value size. This is needed since
1512 * we need to know the size before we can map the mft record and our callers
1513 * always know it. The reason we cannot simply read the size from the vfs
1514 * inode i_size is that this is not necessarily uptodate. This happens when
1515 * ntfs_attr_make_non_resident() is called in the ->truncate call path(s).
1516 *
1326 * Return 0 on success and -errno on error. The following error return codes 1517 * Return 0 on success and -errno on error. The following error return codes
1327 * are defined: 1518 * are defined:
1328 * -EPERM - The attribute is not allowed to be non-resident. 1519 * -EPERM - The attribute is not allowed to be non-resident.
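In practice a caller passes the attribute value length it already holds, as the extension code later in this patch does; a minimal sketch (attr_len is assumed to contain le32_to_cpu(a->data.resident.value_length)):

	/* Convert the attribute, copying attr_len bytes of value data. */
	err = ntfs_attr_make_non_resident(ni, attr_len);
	if (likely(!err))
		goto retry_extend;	/* hypothetical restart label */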
@@ -1343,7 +1534,7 @@ int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
1343 * 1534 *
1344 * Locking: - The caller must hold i_sem on the inode. 1535 * Locking: - The caller must hold i_sem on the inode.
1345 */ 1536 */
1346int ntfs_attr_make_non_resident(ntfs_inode *ni) 1537int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
1347{ 1538{
1348 s64 new_size; 1539 s64 new_size;
1349 struct inode *vi = VFS_I(ni); 1540 struct inode *vi = VFS_I(ni);
@@ -1381,11 +1572,9 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
1381 * The size needs to be aligned to a cluster boundary for allocation 1572 * The size needs to be aligned to a cluster boundary for allocation
1382 * purposes. 1573 * purposes.
1383 */ 1574 */
1384 new_size = (i_size_read(vi) + vol->cluster_size - 1) & 1575 new_size = (data_size + vol->cluster_size - 1) &
1385 ~(vol->cluster_size - 1); 1576 ~(vol->cluster_size - 1);
1386 if (new_size > 0) { 1577 if (new_size > 0) {
1387 runlist_element *rl2;
1388
1389 /* 1578 /*
1390 * Will need the page later and since the page lock nests 1579 * Will need the page later and since the page lock nests
1391 * outside all ntfs locks, we need to get the page now. 1580 * outside all ntfs locks, we need to get the page now.
@@ -1396,7 +1585,7 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
1396 return -ENOMEM; 1585 return -ENOMEM;
1397 /* Start by allocating clusters to hold the attribute value. */ 1586 /* Start by allocating clusters to hold the attribute value. */
1398 rl = ntfs_cluster_alloc(vol, 0, new_size >> 1587 rl = ntfs_cluster_alloc(vol, 0, new_size >>
1399 vol->cluster_size_bits, -1, DATA_ZONE); 1588 vol->cluster_size_bits, -1, DATA_ZONE, TRUE);
1400 if (IS_ERR(rl)) { 1589 if (IS_ERR(rl)) {
1401 err = PTR_ERR(rl); 1590 err = PTR_ERR(rl);
1402 ntfs_debug("Failed to allocate cluster%s, error code " 1591 ntfs_debug("Failed to allocate cluster%s, error code "
@@ -1405,12 +1594,6 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
1405 err); 1594 err);
1406 goto page_err_out; 1595 goto page_err_out;
1407 } 1596 }
1408 /* Change the runlist terminator to LCN_ENOENT. */
1409 rl2 = rl;
1410 while (rl2->length)
1411 rl2++;
1412 BUG_ON(rl2->lcn != LCN_RL_NOT_MAPPED);
1413 rl2->lcn = LCN_ENOENT;
1414 } else { 1597 } else {
1415 rl = NULL; 1598 rl = NULL;
1416 page = NULL; 1599 page = NULL;
@@ -1473,7 +1656,7 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
1473 * attribute value. 1656 * attribute value.
1474 */ 1657 */
1475 attr_size = le32_to_cpu(a->data.resident.value_length); 1658 attr_size = le32_to_cpu(a->data.resident.value_length);
1476 BUG_ON(attr_size != i_size_read(vi)); 1659 BUG_ON(attr_size != data_size);
1477 if (page && !PageUptodate(page)) { 1660 if (page && !PageUptodate(page)) {
1478 kaddr = kmap_atomic(page, KM_USER0); 1661 kaddr = kmap_atomic(page, KM_USER0);
1479 memcpy(kaddr, (u8*)a + 1662 memcpy(kaddr, (u8*)a +
@@ -1538,7 +1721,9 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
1538 ffs(ni->itype.compressed.block_size) - 1; 1721 ffs(ni->itype.compressed.block_size) - 1;
1539 ni->itype.compressed.block_clusters = 1U << 1722 ni->itype.compressed.block_clusters = 1U <<
1540 a->data.non_resident.compression_unit; 1723 a->data.non_resident.compression_unit;
1541 } 1724 vi->i_blocks = ni->itype.compressed.size >> 9;
1725 } else
1726 vi->i_blocks = ni->allocated_size >> 9;
1542 write_unlock_irqrestore(&ni->size_lock, flags); 1727 write_unlock_irqrestore(&ni->size_lock, flags);
1543 /* 1728 /*
1544 * This needs to be last since the address space operations ->readpage 1729 * This needs to be last since the address space operations ->readpage
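As an aside, i_blocks is kept in 512-byte units, hence the shift by nine in both branches above; illustratively:

	/* bytes >> 9 == bytes / 512, e.g. 8192 bytes -> 16 blocks. */
	vi->i_blocks = ni->allocated_size >> 9;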
@@ -1652,6 +1837,640 @@ page_err_out:
1652} 1837}
1653 1838
1654/** 1839/**
1840 * ntfs_attr_extend_allocation - extend the allocated space of an attribute
1841 * @ni: ntfs inode of the attribute whose allocation to extend
1842 * @new_alloc_size: new size in bytes to which to extend the allocation to
1843 * @new_data_size: new size in bytes to which to extend the data to
1844 * @data_start: beginning of region which is required to be non-sparse
1845 *
1846 * Extend the allocated space of an attribute described by the ntfs inode @ni
1847 * to @new_alloc_size bytes. If @data_start is -1, the whole extension may be
1848 * implemented as a hole in the file (as long as both the volume and the ntfs
1849 * inode @ni have sparse support enabled). If @data_start is >= 0, then the
1850 * region between the old allocated size and @data_start - 1 may be made sparse
1851 * but the region between @data_start and @new_alloc_size must be backed by
1852 * actual clusters.
1853 *
1854 * If @new_data_size is -1, it is ignored. If it is >= 0, then the data size
1855 * of the attribute is extended to @new_data_size. Note that the i_size of the
1856 * vfs inode is not updated. Only the data size in the base attribute record
1857 * is updated. The caller has to update i_size separately if this is required.
1858 * WARNING: It is a BUG() for @new_data_size to be smaller than the old data
1859 * size as well as for @new_data_size to be greater than @new_alloc_size.
1860 *
1861 * For resident attributes this involves resizing the attribute record and if
1862 * necessary moving it and/or other attributes into extent mft records and/or
1863 * converting the attribute to a non-resident attribute which in turn involves
1864 * extending the allocation of a non-resident attribute as described below.
1865 *
1866 * For non-resident attributes this involves allocating clusters in the data
1867 * zone on the volume (except for regions that are being made sparse) and
1868 * extending the run list to describe the allocated clusters as well as
1869 * updating the mapping pairs array of the attribute. This in turn involves
1870 * resizing the attribute record and if necessary moving it and/or other
1871 * attributes into extent mft records and/or splitting the attribute record
1872 * into multiple extent attribute records.
1873 *
1874 * Also, the attribute list attribute is updated if present and in some of the
1875 * above cases (the ones where extent mft records/attributes come into play),
1876 * an attribute list attribute is created if not already present.
1877 *
1878 * Return the new allocated size on success and -errno on error. In the case
1879 * that an error is encountered but a partial extension at least up to
1880 * @data_start (if present) is possible, the allocation is partially extended
1881 * and this is returned. This means the caller must check the returned size to
1882 * determine if the extension was partial. If @data_start is -1 then partial
1883 * allocations are not performed.
1884 *
1885 * WARNING: Do not call ntfs_attr_extend_allocation() for $MFT/$DATA.
1886 *
1887 * Locking: This function takes the runlist lock of @ni for writing as well as
1888 * locking the mft record of the base ntfs inode. These locks are maintained
1889 * throughout execution of the function. These locks are required so that the
1890 * attribute can be resized safely and so that it can for example be converted
1891 * from resident to non-resident safely.
1892 *
1893 * TODO: At present attribute list attribute handling is not implemented.
1894 *
1895 * TODO: At present it is not safe to call this function for anything other
1896 * than the $DATA attribute(s) of an uncompressed and unencrypted file.
1897 */
1898s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size,
1899 const s64 new_data_size, const s64 data_start)
1900{
1901 VCN vcn;
1902 s64 ll, allocated_size, start = data_start;
1903 struct inode *vi = VFS_I(ni);
1904 ntfs_volume *vol = ni->vol;
1905 ntfs_inode *base_ni;
1906 MFT_RECORD *m;
1907 ATTR_RECORD *a;
1908 ntfs_attr_search_ctx *ctx;
1909 runlist_element *rl, *rl2;
1910 unsigned long flags;
1911 int err, mp_size;
1912 u32 attr_len = 0; /* Silence stupid gcc warning. */
1913 BOOL mp_rebuilt;
1914
1915#ifdef NTFS_DEBUG
1916 read_lock_irqsave(&ni->size_lock, flags);
1917 allocated_size = ni->allocated_size;
1918 read_unlock_irqrestore(&ni->size_lock, flags);
1919 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
1920 "old_allocated_size 0x%llx, "
1921 "new_allocated_size 0x%llx, new_data_size 0x%llx, "
1922 "data_start 0x%llx.", vi->i_ino,
1923 (unsigned)le32_to_cpu(ni->type),
1924 (unsigned long long)allocated_size,
1925 (unsigned long long)new_alloc_size,
1926 (unsigned long long)new_data_size,
1927 (unsigned long long)start);
1928#endif
1929retry_extend:
1930 /*
1931 * For non-resident attributes, @start and @new_size need to be aligned
1932 * to cluster boundaries for allocation purposes.
1933 */
1934 if (NInoNonResident(ni)) {
1935 if (start > 0)
1936 start &= ~(s64)vol->cluster_size_mask;
1937 new_alloc_size = (new_alloc_size + vol->cluster_size - 1) &
1938 ~(s64)vol->cluster_size_mask;
1939 }
1940 BUG_ON(new_data_size >= 0 && new_data_size > new_alloc_size);
1941 /* Check if new size is allowed in $AttrDef. */
1942 err = ntfs_attr_size_bounds_check(vol, ni->type, new_alloc_size);
1943 if (unlikely(err)) {
1944 /* Only emit errors when the write will fail completely. */
1945 read_lock_irqsave(&ni->size_lock, flags);
1946 allocated_size = ni->allocated_size;
1947 read_unlock_irqrestore(&ni->size_lock, flags);
1948 if (start < 0 || start >= allocated_size) {
1949 if (err == -ERANGE) {
1950 ntfs_error(vol->sb, "Cannot extend allocation "
1951 "of inode 0x%lx, attribute "
1952 "type 0x%x, because the new "
1953 "allocation would exceed the "
1954 "maximum allowed size for "
1955 "this attribute type.",
1956 vi->i_ino, (unsigned)
1957 le32_to_cpu(ni->type));
1958 } else {
1959 ntfs_error(vol->sb, "Cannot extend allocation "
1960 "of inode 0x%lx, attribute "
1961 "type 0x%x, because this "
1962 "attribute type is not "
1963 "defined on the NTFS volume. "
1964 "Possible corruption! You "
1965 "should run chkdsk!",
1966 vi->i_ino, (unsigned)
1967 le32_to_cpu(ni->type));
1968 }
1969 }
1970 /* Translate error code to be POSIX conformant for write(2). */
1971 if (err == -ERANGE)
1972 err = -EFBIG;
1973 else
1974 err = -EIO;
1975 return err;
1976 }
1977 if (!NInoAttr(ni))
1978 base_ni = ni;
1979 else
1980 base_ni = ni->ext.base_ntfs_ino;
1981 /*
1982 * We will be modifying both the runlist (if non-resident) and the mft
1983 * record so lock them both down.
1984 */
1985 down_write(&ni->runlist.lock);
1986 m = map_mft_record(base_ni);
1987 if (IS_ERR(m)) {
1988 err = PTR_ERR(m);
1989 m = NULL;
1990 ctx = NULL;
1991 goto err_out;
1992 }
1993 ctx = ntfs_attr_get_search_ctx(base_ni, m);
1994 if (unlikely(!ctx)) {
1995 err = -ENOMEM;
1996 goto err_out;
1997 }
1998 read_lock_irqsave(&ni->size_lock, flags);
1999 allocated_size = ni->allocated_size;
2000 read_unlock_irqrestore(&ni->size_lock, flags);
2001 /*
2002 * If non-resident, seek to the last extent. If resident, there is
2003 * only one extent, so seek to that.
2004 */
2005 vcn = NInoNonResident(ni) ? allocated_size >> vol->cluster_size_bits :
2006 0;
2007 /*
2008 * Abort if someone did the work whilst we waited for the locks. If we
2009 * just converted the attribute from resident to non-resident it is
2010 * likely that exactly this has happened already. We cannot quite
2011 * abort if we need to update the data size.
2012 */
2013 if (unlikely(new_alloc_size <= allocated_size)) {
2014 ntfs_debug("Allocated size already exceeds requested size.");
2015 new_alloc_size = allocated_size;
2016 if (new_data_size < 0)
2017 goto done;
2018 /*
2019 * We want the first attribute extent so that we can update the
2020 * data size.
2021 */
2022 vcn = 0;
2023 }
2024 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
2025 CASE_SENSITIVE, vcn, NULL, 0, ctx);
2026 if (unlikely(err)) {
2027 if (err == -ENOENT)
2028 err = -EIO;
2029 goto err_out;
2030 }
2031 m = ctx->mrec;
2032 a = ctx->attr;
2033 /* Use goto to reduce indentation. */
2034 if (a->non_resident)
2035 goto do_non_resident_extend;
2036 BUG_ON(NInoNonResident(ni));
2037 /* The total length of the attribute value. */
2038 attr_len = le32_to_cpu(a->data.resident.value_length);
2039 /*
2040 * Extend the attribute record to be able to store the new attribute
2041 * size. ntfs_attr_record_resize() will not do anything if the size is
2042 * not changing.
2043 */
2044 if (new_alloc_size < vol->mft_record_size &&
2045 !ntfs_attr_record_resize(m, a,
2046 le16_to_cpu(a->data.resident.value_offset) +
2047 new_alloc_size)) {
2048 /* The resize succeeded! */
2049 write_lock_irqsave(&ni->size_lock, flags);
2050 ni->allocated_size = le32_to_cpu(a->length) -
2051 le16_to_cpu(a->data.resident.value_offset);
2052 write_unlock_irqrestore(&ni->size_lock, flags);
2053 if (new_data_size >= 0) {
2054 BUG_ON(new_data_size < attr_len);
2055 a->data.resident.value_length =
2056 cpu_to_le32((u32)new_data_size);
2057 }
2058 goto flush_done;
2059 }
2060 /*
2061 * We have to drop all the locks so we can call
2062 * ntfs_attr_make_non_resident(). This could be optimised by try-
2063 * locking the first page cache page and only if that fails dropping
2064 * the locks, locking the page, and redoing all the locking and
2065 * lookups. While this would be a huge optimisation, it is not worth
2066 * it as this is definitely a slow code path.
2067 */
2068 ntfs_attr_put_search_ctx(ctx);
2069 unmap_mft_record(base_ni);
2070 up_write(&ni->runlist.lock);
2071 /*
2072 * Not enough space in the mft record, try to make the attribute
2073 * non-resident and if successful restart the extension process.
2074 */
2075 err = ntfs_attr_make_non_resident(ni, attr_len);
2076 if (likely(!err))
2077 goto retry_extend;
2078 /*
2079 * Could not make non-resident. If this is due to this not being
2080 * permitted for this attribute type or there not being enough space,
2081 * try to make other attributes non-resident. Otherwise fail.
2082 */
2083 if (unlikely(err != -EPERM && err != -ENOSPC)) {
2084 /* Only emit errors when the write will fail completely. */
2085 read_lock_irqsave(&ni->size_lock, flags);
2086 allocated_size = ni->allocated_size;
2087 read_unlock_irqrestore(&ni->size_lock, flags);
2088 if (start < 0 || start >= allocated_size)
2089 ntfs_error(vol->sb, "Cannot extend allocation of "
2090 "inode 0x%lx, attribute type 0x%x, "
2091 "because the conversion from resident "
2092 "to non-resident attribute failed "
2093 "with error code %i.", vi->i_ino,
2094 (unsigned)le32_to_cpu(ni->type), err);
2095 if (err != -ENOMEM)
2096 err = -EIO;
2097 goto conv_err_out;
2098 }
2099 /* TODO: Not implemented from here, abort. */
2100 read_lock_irqsave(&ni->size_lock, flags);
2101 allocated_size = ni->allocated_size;
2102 read_unlock_irqrestore(&ni->size_lock, flags);
2103 if (start < 0 || start >= allocated_size) {
2104 if (err == -ENOSPC)
2105 ntfs_error(vol->sb, "Not enough space in the mft "
2106 "record/on disk for the non-resident "
2107 "attribute value. This case is not "
2108 "implemented yet.");
2109 else /* if (err == -EPERM) */
2110 ntfs_error(vol->sb, "This attribute type may not be "
2111 "non-resident. This case is not "
2112 "implemented yet.");
2113 }
2114 err = -EOPNOTSUPP;
2115 goto conv_err_out;
2116#if 0
2117 // TODO: Attempt to make other attributes non-resident.
2118 if (!err)
2119 goto do_resident_extend;
2120 /*
2121 * Both the attribute list attribute and the standard information
2122 * attribute must remain in the base inode. Thus, if this is one of
2123 * these attributes, we have to try to move other attributes out into
2124 * extent mft records instead.
2125 */
2126 if (ni->type == AT_ATTRIBUTE_LIST ||
2127 ni->type == AT_STANDARD_INFORMATION) {
2128 // TODO: Attempt to move other attributes into extent mft
2129 // records.
2130 err = -EOPNOTSUPP;
2131 if (!err)
2132 goto do_resident_extend;
2133 goto err_out;
2134 }
2135 // TODO: Attempt to move this attribute to an extent mft record, but
2136 // only if it is not already the only attribute in an mft record in
2137 // which case there would be nothing to gain.
2138 err = -EOPNOTSUPP;
2139 if (!err)
2140 goto do_resident_extend;
2141 /* There is nothing we can do to make enough space. )-: */
2142 goto err_out;
2143#endif
2144do_non_resident_extend:
2145 BUG_ON(!NInoNonResident(ni));
2146 if (new_alloc_size == allocated_size) {
2147 BUG_ON(vcn);
2148 goto alloc_done;
2149 }
2150 /*
2151 * If the data starts after the end of the old allocation, this is a
2152 * $DATA attribute and sparse attributes are enabled on the volume and
2153 * for this inode, then create a sparse region between the old
2154 * allocated size and the start of the data. Otherwise simply proceed
2155 * with filling the whole space between the old allocated size and the
2156 * new allocated size with clusters.
2157 */
2158 if ((start >= 0 && start <= allocated_size) || ni->type != AT_DATA ||
2159 !NVolSparseEnabled(vol) || NInoSparseDisabled(ni))
2160 goto skip_sparse;
2161 // TODO: This is not implemented yet. We just fill in with real
2162 // clusters for now...
2163 ntfs_debug("Inserting holes is not-implemented yet. Falling back to "
2164 "allocating real clusters instead.");
2165skip_sparse:
2166 rl = ni->runlist.rl;
2167 if (likely(rl)) {
2168 /* Seek to the end of the runlist. */
2169 while (rl->length)
2170 rl++;
2171 }
2172 /* If this attribute extent is not mapped, map it now. */
2173 if (unlikely(!rl || rl->lcn == LCN_RL_NOT_MAPPED ||
2174 (rl->lcn == LCN_ENOENT && rl > ni->runlist.rl &&
2175 (rl-1)->lcn == LCN_RL_NOT_MAPPED))) {
2176 if (!rl && !allocated_size)
2177 goto first_alloc;
2178 rl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl);
2179 if (IS_ERR(rl)) {
2180 err = PTR_ERR(rl);
2181 if (start < 0 || start >= allocated_size)
2182 ntfs_error(vol->sb, "Cannot extend allocation "
2183 "of inode 0x%lx, attribute "
2184 "type 0x%x, because the "
2185 "mapping of a runlist "
2186 "fragment failed with error "
2187 "code %i.", vi->i_ino,
2188 (unsigned)le32_to_cpu(ni->type),
2189 err);
2190 if (err != -ENOMEM)
2191 err = -EIO;
2192 goto err_out;
2193 }
2194 ni->runlist.rl = rl;
2195 /* Seek to the end of the runlist. */
2196 while (rl->length)
2197 rl++;
2198 }
2199 /*
2200 * We now know the runlist of the last extent is mapped and @rl is at
2201 * the end of the runlist. We want to begin allocating clusters
2202 * starting at the last allocated cluster to reduce fragmentation. If
2203 * there are no valid LCNs in the attribute we let the cluster
2204 * allocator choose the starting cluster.
2205 */
2206 /* If the last LCN is a hole or similar, seek back to the last real LCN. */
2207 while (rl->lcn < 0 && rl > ni->runlist.rl)
2208 rl--;
2209first_alloc:
2210 // FIXME: Need to implement partial allocations so at least part of the
2211 // write can be performed when start >= 0. (Needed for POSIX write(2)
2212 // conformance.)
2213 rl2 = ntfs_cluster_alloc(vol, allocated_size >> vol->cluster_size_bits,
2214 (new_alloc_size - allocated_size) >>
2215 vol->cluster_size_bits, (rl && (rl->lcn >= 0)) ?
2216 rl->lcn + rl->length : -1, DATA_ZONE, TRUE);
2217 if (IS_ERR(rl2)) {
2218 err = PTR_ERR(rl2);
2219 if (start < 0 || start >= allocated_size)
2220 ntfs_error(vol->sb, "Cannot extend allocation of "
2221 "inode 0x%lx, attribute type 0x%x, "
2222 "because the allocation of clusters "
2223 "failed with error code %i.", vi->i_ino,
2224 (unsigned)le32_to_cpu(ni->type), err);
2225 if (err != -ENOMEM && err != -ENOSPC)
2226 err = -EIO;
2227 goto err_out;
2228 }
2229 rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
2230 if (IS_ERR(rl)) {
2231 err = PTR_ERR(rl);
2232 if (start < 0 || start >= allocated_size)
2233 ntfs_error(vol->sb, "Cannot extend allocation of "
2234 "inode 0x%lx, attribute type 0x%x, "
2235 "because the runlist merge failed "
2236 "with error code %i.", vi->i_ino,
2237 (unsigned)le32_to_cpu(ni->type), err);
2238 if (err != -ENOMEM)
2239 err = -EIO;
2240 if (ntfs_cluster_free_from_rl(vol, rl2)) {
2241 ntfs_error(vol->sb, "Failed to release allocated "
2242 "cluster(s) in error code path. Run "
2243 "chkdsk to recover the lost "
2244 "cluster(s).");
2245 NVolSetErrors(vol);
2246 }
2247 ntfs_free(rl2);
2248 goto err_out;
2249 }
2250 ni->runlist.rl = rl;
2251 ntfs_debug("Allocated 0x%llx clusters.", (long long)(new_alloc_size -
2252 allocated_size) >> vol->cluster_size_bits);
2253 /* Find the runlist element with which the attribute extent starts. */
2254 ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
2255 rl2 = ntfs_rl_find_vcn_nolock(rl, ll);
2256 BUG_ON(!rl2);
2257 BUG_ON(!rl2->length);
2258 BUG_ON(rl2->lcn < LCN_HOLE);
2259 mp_rebuilt = FALSE;
2260 /* Get the size for the new mapping pairs array for this extent. */
2261 mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
2262 if (unlikely(mp_size <= 0)) {
2263 err = mp_size;
2264 if (start < 0 || start >= allocated_size)
2265 ntfs_error(vol->sb, "Cannot extend allocation of "
2266 "inode 0x%lx, attribute type 0x%x, "
2267 "because determining the size for the "
2268 "mapping pairs failed with error code "
2269 "%i.", vi->i_ino,
2270 (unsigned)le32_to_cpu(ni->type), err);
2271 err = -EIO;
2272 goto undo_alloc;
2273 }
2274 /* Extend the attribute record to fit the bigger mapping pairs array. */
2275 attr_len = le32_to_cpu(a->length);
2276 err = ntfs_attr_record_resize(m, a, mp_size +
2277 le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
2278 if (unlikely(err)) {
2279 BUG_ON(err != -ENOSPC);
2280 // TODO: Deal with this by moving this extent to a new mft
2281 // record or by starting a new extent in a new mft record,
2282 // possibly by extending this extent partially and filling it
2283 // and creating a new extent for the remainder, or by making
2284 // other attributes non-resident and/or by moving other
2285 // attributes out of this mft record.
2286 if (start < 0 || start >= allocated_size)
2287 ntfs_error(vol->sb, "Not enough space in the mft "
2288 "record for the extended attribute "
2289 "record. This case is not "
2290 "implemented yet.");
2291 err = -EOPNOTSUPP;
2292 goto undo_alloc;
2293 }
2294 mp_rebuilt = TRUE;
2295 /* Generate the mapping pairs array directly into the attr record. */
2296 err = ntfs_mapping_pairs_build(vol, (u8*)a +
2297 le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
2298 mp_size, rl2, ll, -1, NULL);
2299 if (unlikely(err)) {
2300 if (start < 0 || start >= allocated_size)
2301 ntfs_error(vol->sb, "Cannot extend allocation of "
2302 "inode 0x%lx, attribute type 0x%x, "
2303 "because building the mapping pairs "
2304 "failed with error code %i.", vi->i_ino,
2305 (unsigned)le32_to_cpu(ni->type), err);
2306 err = -EIO;
2307 goto undo_alloc;
2308 }
2309 /* Update the highest_vcn. */
2310 a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >>
2311 vol->cluster_size_bits) - 1);
2312 /*
2313 * We now have extended the allocated size of the attribute. Reflect
2314 * this in the ntfs_inode structure and the attribute record.
2315 */
2316 if (a->data.non_resident.lowest_vcn) {
2317 /*
2318 * We are not in the first attribute extent, switch to it, but
2319 * first ensure the changes will make it to disk later.
2320 */
2321 flush_dcache_mft_record_page(ctx->ntfs_ino);
2322 mark_mft_record_dirty(ctx->ntfs_ino);
2323 ntfs_attr_reinit_search_ctx(ctx);
2324 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
2325 CASE_SENSITIVE, 0, NULL, 0, ctx);
2326 if (unlikely(err))
2327 goto restore_undo_alloc;
2328 /* @m is not used any more so no need to set it. */
2329 a = ctx->attr;
2330 }
2331 write_lock_irqsave(&ni->size_lock, flags);
2332 ni->allocated_size = new_alloc_size;
2333 a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size);
2334 /*
2335 * FIXME: This would fail if @ni is a directory, $MFT, or an index,
2336 * since those can have sparse/compressed set. For example a directory
2337 * can be set compressed even though it is not compressed itself and in that
2338 * case the bit means that files are to be created compressed in the
2339 * directory... At present this is ok as this code is only called for
2340 * regular files, and only for their $DATA attribute(s).
2341 * FIXME: The calculation is wrong if we created a hole above. For now
2342 * it does not matter as we never create holes.
2343 */
2344 if (NInoSparse(ni) || NInoCompressed(ni)) {
2345 ni->itype.compressed.size += new_alloc_size - allocated_size;
2346 a->data.non_resident.compressed_size =
2347 cpu_to_sle64(ni->itype.compressed.size);
2348 vi->i_blocks = ni->itype.compressed.size >> 9;
2349 } else
2350 vi->i_blocks = new_alloc_size >> 9;
2351 write_unlock_irqrestore(&ni->size_lock, flags);
2352alloc_done:
2353 if (new_data_size >= 0) {
2354 BUG_ON(new_data_size <
2355 sle64_to_cpu(a->data.non_resident.data_size));
2356 a->data.non_resident.data_size = cpu_to_sle64(new_data_size);
2357 }
2358flush_done:
2359 /* Ensure the changes make it to disk. */
2360 flush_dcache_mft_record_page(ctx->ntfs_ino);
2361 mark_mft_record_dirty(ctx->ntfs_ino);
2362done:
2363 ntfs_attr_put_search_ctx(ctx);
2364 unmap_mft_record(base_ni);
2365 up_write(&ni->runlist.lock);
2366 ntfs_debug("Done, new_allocated_size 0x%llx.",
2367 (unsigned long long)new_alloc_size);
2368 return new_alloc_size;
2369restore_undo_alloc:
2370 if (start < 0 || start >= allocated_size)
2371 ntfs_error(vol->sb, "Cannot complete extension of allocation "
2372 "of inode 0x%lx, attribute type 0x%x, because "
2373 "lookup of first attribute extent failed with "
2374 "error code %i.", vi->i_ino,
2375 (unsigned)le32_to_cpu(ni->type), err);
2376 if (err == -ENOENT)
2377 err = -EIO;
2378 ntfs_attr_reinit_search_ctx(ctx);
2379 if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, CASE_SENSITIVE,
2380 allocated_size >> vol->cluster_size_bits, NULL, 0,
2381 ctx)) {
2382 ntfs_error(vol->sb, "Failed to find last attribute extent of "
2383 "attribute in error code path. Run chkdsk to "
2384 "recover.");
2385 write_lock_irqsave(&ni->size_lock, flags);
2386 ni->allocated_size = new_alloc_size;
2387 /*
2388 * FIXME: This would fail if @ni is a directory... See above.
2389 * FIXME: The calculation is wrong if we created a hole above.
2390 * For now it does not matter as we never create holes.
2391 */
2392 if (NInoSparse(ni) || NInoCompressed(ni)) {
2393 ni->itype.compressed.size += new_alloc_size -
2394 allocated_size;
2395 vi->i_blocks = ni->itype.compressed.size >> 9;
2396 } else
2397 vi->i_blocks = new_alloc_size >> 9;
2398 write_unlock_irqrestore(&ni->size_lock, flags);
2399 ntfs_attr_put_search_ctx(ctx);
2400 unmap_mft_record(base_ni);
2401 up_write(&ni->runlist.lock);
2402 /*
2403 * The only thing that is now wrong is the allocated size of the
2404 * base attribute extent which chkdsk should be able to fix.
2405 */
2406 NVolSetErrors(vol);
2407 return err;
2408 }
2409 ctx->attr->data.non_resident.highest_vcn = cpu_to_sle64(
2410 (allocated_size >> vol->cluster_size_bits) - 1);
2411undo_alloc:
2412 ll = allocated_size >> vol->cluster_size_bits;
2413 if (ntfs_cluster_free(ni, ll, -1, ctx) < 0) {
2414 ntfs_error(vol->sb, "Failed to release allocated cluster(s) "
2415 "in error code path. Run chkdsk to recover "
2416 "the lost cluster(s).");
2417 NVolSetErrors(vol);
2418 }
2419 m = ctx->mrec;
2420 a = ctx->attr;
2421 /*
2422 * If the runlist truncation fails and/or the search context is no
2423 * longer valid, we cannot resize the attribute record or build the
2424 * mapping pairs array, thus we mark the inode bad so that no access to
2425 * the freed clusters can happen.
2426 */
2427 if (ntfs_rl_truncate_nolock(vol, &ni->runlist, ll) || IS_ERR(m)) {
2428 ntfs_error(vol->sb, "Failed to %s in error code path. Run "
2429 "chkdsk to recover.", IS_ERR(m) ?
2430 "restore attribute search context" :
2431 "truncate attribute runlist");
2432 make_bad_inode(vi);
2433 make_bad_inode(VFS_I(base_ni));
2434 NVolSetErrors(vol);
2435 } else if (mp_rebuilt) {
2436 if (ntfs_attr_record_resize(m, a, attr_len)) {
2437 ntfs_error(vol->sb, "Failed to restore attribute "
2438 "record in error code path. Run "
2439 "chkdsk to recover.");
2440 make_bad_inode(vi);
2441 make_bad_inode(VFS_I(base_ni));
2442 NVolSetErrors(vol);
2443 } else /* if (success) */ {
2444 if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
2445 a->data.non_resident.
2446 mapping_pairs_offset), attr_len -
2447 le16_to_cpu(a->data.non_resident.
2448 mapping_pairs_offset), rl2, ll, -1,
2449 NULL)) {
2450 ntfs_error(vol->sb, "Failed to restore "
2451 "mapping pairs array in error "
2452 "code path. Run chkdsk to "
2453 "recover.");
2454 make_bad_inode(vi);
2455 make_bad_inode(VFS_I(base_ni));
2456 NVolSetErrors(vol);
2457 }
2458 flush_dcache_mft_record_page(ctx->ntfs_ino);
2459 mark_mft_record_dirty(ctx->ntfs_ino);
2460 }
2461 }
2462err_out:
2463 if (ctx)
2464 ntfs_attr_put_search_ctx(ctx);
2465 if (m)
2466 unmap_mft_record(base_ni);
2467 up_write(&ni->runlist.lock);
2468conv_err_out:
2469 ntfs_debug("Failed. Returning error code %i.", err);
2470 return err;
2471}
2472
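The partial-extension return convention described above implies a caller pattern along these lines; an illustrative sketch only (pos and end are assumed write-path byte offsets, not part of this patch):

	s64 new_size;

	/* Extend the allocation to cover [0, end) with data from pos on. */
	new_size = ntfs_attr_extend_allocation(ni, end, -1, pos);
	if (unlikely(new_size < 0))
		return new_size;	/* -errno, nothing usable was done */
	if (new_size < end) {
		/* Partial extension: trim the write to the backed region. */
		end = new_size;
	}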
2473/**
1655 * ntfs_attr_set - fill (a part of) an attribute with a byte 2474 * ntfs_attr_set - fill (a part of) an attribute with a byte
1656 * @ni: ntfs inode describing the attribute to fill 2475 * @ni: ntfs inode describing the attribute to fill
1657 * @ofs: offset inside the attribute at which to start to fill 2476 * @ofs: offset inside the attribute at which to start to fill
@@ -1773,6 +2592,8 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
1773 /* Finally unlock and release the page. */ 2592 /* Finally unlock and release the page. */
1774 unlock_page(page); 2593 unlock_page(page);
1775 page_cache_release(page); 2594 page_cache_release(page);
2595 balance_dirty_pages_ratelimited(mapping);
2596 cond_resched();
1776 } 2597 }
1777 /* If there is a last partial page, need to do it the slow way. */ 2598 /* If there is a last partial page, need to do it the slow way. */
1778 if (end_ofs) { 2599 if (end_ofs) {
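The two calls added above are the usual idiom for long page-dirtying loops; in outline (a generic sketch, not the full ntfs_attr_set() loop):

	do {
		/* ... grab, modify and dirty one page cache page ... */
		set_page_dirty(page);
		unlock_page(page);
		page_cache_release(page);
		/* Throttle when too much dirty memory accumulates. */
		balance_dirty_pages_ratelimited(mapping);
		/* Voluntarily yield the cpu between iterations. */
		cond_resched();
	} while (++idx < end_idx);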
diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h
index 0618ed6fd7b3..9074886b44ba 100644
--- a/fs/ntfs/attrib.h
+++ b/fs/ntfs/attrib.h
@@ -60,14 +60,15 @@ typedef struct {
60 ATTR_RECORD *base_attr; 60 ATTR_RECORD *base_attr;
61} ntfs_attr_search_ctx; 61} ntfs_attr_search_ctx;
62 62
63extern int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn); 63extern int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn,
64 ntfs_attr_search_ctx *ctx);
64extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn); 65extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn);
65 66
66extern LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn, 67extern LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
67 const BOOL write_locked); 68 const BOOL write_locked);
68 69
69extern runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, 70extern runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni,
70 const VCN vcn, const BOOL write_locked); 71 const VCN vcn, ntfs_attr_search_ctx *ctx);
71 72
72int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name, 73int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
73 const u32 name_len, const IGNORE_CASE_BOOL ic, 74 const u32 name_len, const IGNORE_CASE_BOOL ic,
@@ -102,7 +103,10 @@ extern int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size);
102extern int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a, 103extern int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
103 const u32 new_size); 104 const u32 new_size);
104 105
105extern int ntfs_attr_make_non_resident(ntfs_inode *ni); 106extern int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size);
107
108extern s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size,
109 const s64 new_data_size, const s64 data_start);
106 110
107extern int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, 111extern int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt,
108 const u8 val); 112 const u8 val);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index be9fd1dd423d..cf3e6ced2d01 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -19,11 +19,24 @@
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21 21
22#include <linux/pagemap.h>
23#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
23#include <linux/pagemap.h>
24#include <linux/pagevec.h>
25#include <linux/sched.h>
26#include <linux/swap.h>
27#include <linux/uio.h>
28#include <linux/writeback.h>
24 29
30#include <asm/page.h>
31#include <asm/uaccess.h>
32
33#include "attrib.h"
34#include "bitmap.h"
25#include "inode.h" 35#include "inode.h"
26#include "debug.h" 36#include "debug.h"
37#include "lcnalloc.h"
38#include "malloc.h"
39#include "mft.h"
27#include "ntfs.h" 40#include "ntfs.h"
28 41
29/** 42/**
@@ -56,6 +69,2184 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
56#ifdef NTFS_RW 69#ifdef NTFS_RW
57 70
58/** 71/**
72 * ntfs_attr_extend_initialized - extend the initialized size of an attribute
73 * @ni: ntfs inode of the attribute to extend
74 * @new_init_size: requested new initialized size in bytes
75 * @cached_page: store any allocated but unused page here
76 * @lru_pvec: lru-buffering pagevec of the caller
77 *
78 * Extend the initialized size of an attribute described by the ntfs inode @ni
79 * to @new_init_size bytes. This involves zeroing any non-sparse space between
80 * the old initialized size and @new_init_size both in the page cache and on
81 * disk (if relevant complete pages are already uptodate in the page cache then
82 * these are simply marked dirty).
83 *
84 * As a side-effect, the file size (vfs inode->i_size) may be incremented as,
85 * in the resident attribute case, it is tied to the initialized size and, in
86 * the non-resident attribute case, it may not fall below the initialized size.
87 *
88 * Note that if the attribute is resident, we do not need to touch the page
89 * cache at all. This is because if the page cache page is not uptodate we
90 * bring it uptodate later, when doing the write to the mft record since we
91 * then already have the page mapped. And if the page is uptodate, the
92 * non-initialized region will already have been zeroed when the page was
93 * brought uptodate and the region may in fact already have been overwritten
94 * with new data via mmap() based writes, so we cannot just zero it. And since
95 * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped
96 * is unspecified, we choose not to do zeroing and thus we do not need to touch
97 * the page at all. For a more detailed explanation see ntfs_truncate() in
98 * fs/ntfs/inode.c.
99 *
100 * @cached_page and @lru_pvec are just optimizations for dealing with multiple
101 * pages.
102 *
103 * Return 0 on success and -errno on error. In the case that an error is
104 * encountered it is possible that the initialized size will already have been
105 * incremented some way towards @new_init_size but it is guaranteed that if
106 * this is the case, the necessary zeroing will also have happened and that all
107 * metadata is self-consistent.
108 *
109 * Locking: i_sem on the vfs inode corresponding to the ntfs inode @ni must be
110 * held by the caller.
111 */
112static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size,
113 struct page **cached_page, struct pagevec *lru_pvec)
114{
115 s64 old_init_size;
116 loff_t old_i_size;
117 pgoff_t index, end_index;
118 unsigned long flags;
119 struct inode *vi = VFS_I(ni);
120 ntfs_inode *base_ni;
121 MFT_RECORD *m = NULL;
122 ATTR_RECORD *a;
123 ntfs_attr_search_ctx *ctx = NULL;
124 struct address_space *mapping;
125 struct page *page = NULL;
126 u8 *kattr;
127 int err;
128 u32 attr_len;
129
130 read_lock_irqsave(&ni->size_lock, flags);
131 old_init_size = ni->initialized_size;
132 old_i_size = i_size_read(vi);
133 BUG_ON(new_init_size > ni->allocated_size);
134 read_unlock_irqrestore(&ni->size_lock, flags);
135 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
136 "old_initialized_size 0x%llx, "
137 "new_initialized_size 0x%llx, i_size 0x%llx.",
138 vi->i_ino, (unsigned)le32_to_cpu(ni->type),
139 (unsigned long long)old_init_size,
140 (unsigned long long)new_init_size, old_i_size);
141 if (!NInoAttr(ni))
142 base_ni = ni;
143 else
144 base_ni = ni->ext.base_ntfs_ino;
145 /* Use goto to reduce indentation and we need the label below anyway. */
146 if (NInoNonResident(ni))
147 goto do_non_resident_extend;
148 BUG_ON(old_init_size != old_i_size);
149 m = map_mft_record(base_ni);
150 if (IS_ERR(m)) {
151 err = PTR_ERR(m);
152 m = NULL;
153 goto err_out;
154 }
155 ctx = ntfs_attr_get_search_ctx(base_ni, m);
156 if (unlikely(!ctx)) {
157 err = -ENOMEM;
158 goto err_out;
159 }
160 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
161 CASE_SENSITIVE, 0, NULL, 0, ctx);
162 if (unlikely(err)) {
163 if (err == -ENOENT)
164 err = -EIO;
165 goto err_out;
166 }
167 m = ctx->mrec;
168 a = ctx->attr;
169 BUG_ON(a->non_resident);
170 /* The total length of the attribute value. */
171 attr_len = le32_to_cpu(a->data.resident.value_length);
172 BUG_ON(old_i_size != (loff_t)attr_len);
173 /*
174 * Do the zeroing in the mft record and update the attribute size in
175 * the mft record.
176 */
177 kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
178 memset(kattr + attr_len, 0, new_init_size - attr_len);
179 a->data.resident.value_length = cpu_to_le32((u32)new_init_size);
180 /* Finally, update the sizes in the vfs and ntfs inodes. */
181 write_lock_irqsave(&ni->size_lock, flags);
182 i_size_write(vi, new_init_size);
183 ni->initialized_size = new_init_size;
184 write_unlock_irqrestore(&ni->size_lock, flags);
185 goto done;
186do_non_resident_extend:
187 /*
188 * If the new initialized size @new_init_size exceeds the current file
189 * size (vfs inode->i_size), we need to extend the file size to the
190 * new initialized size.
191 */
192 if (new_init_size > old_i_size) {
193 m = map_mft_record(base_ni);
194 if (IS_ERR(m)) {
195 err = PTR_ERR(m);
196 m = NULL;
197 goto err_out;
198 }
199 ctx = ntfs_attr_get_search_ctx(base_ni, m);
200 if (unlikely(!ctx)) {
201 err = -ENOMEM;
202 goto err_out;
203 }
204 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
205 CASE_SENSITIVE, 0, NULL, 0, ctx);
206 if (unlikely(err)) {
207 if (err == -ENOENT)
208 err = -EIO;
209 goto err_out;
210 }
211 m = ctx->mrec;
212 a = ctx->attr;
213 BUG_ON(!a->non_resident);
214 BUG_ON(old_i_size != (loff_t)
215 sle64_to_cpu(a->data.non_resident.data_size));
216 a->data.non_resident.data_size = cpu_to_sle64(new_init_size);
217 flush_dcache_mft_record_page(ctx->ntfs_ino);
218 mark_mft_record_dirty(ctx->ntfs_ino);
219 /* Update the file size in the vfs inode. */
220 i_size_write(vi, new_init_size);
221 ntfs_attr_put_search_ctx(ctx);
222 ctx = NULL;
223 unmap_mft_record(base_ni);
224 m = NULL;
225 }
226 mapping = vi->i_mapping;
227 index = old_init_size >> PAGE_CACHE_SHIFT;
228 end_index = (new_init_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
229 do {
230 /*
231 * Read the page. If the page is not present, this will zero
232 * the uninitialized regions for us.
233 */
234 page = read_cache_page(mapping, index,
235 (filler_t*)mapping->a_ops->readpage, NULL);
236 if (IS_ERR(page)) {
237 err = PTR_ERR(page);
238 goto init_err_out;
239 }
240 wait_on_page_locked(page);
241 if (unlikely(!PageUptodate(page) || PageError(page))) {
242 page_cache_release(page);
243 err = -EIO;
244 goto init_err_out;
245 }
246 /*
247 * Update the initialized size in the ntfs inode. This is
248 * enough to make ntfs_writepage() work.
249 */
250 write_lock_irqsave(&ni->size_lock, flags);
251 ni->initialized_size = (index + 1) << PAGE_CACHE_SHIFT;
252 if (ni->initialized_size > new_init_size)
253 ni->initialized_size = new_init_size;
254 write_unlock_irqrestore(&ni->size_lock, flags);
255 /* Set the page dirty so it gets written out. */
256 set_page_dirty(page);
257 page_cache_release(page);
258 /*
259 * Play nice with the vm and the rest of the system. This is
260 * very much needed as we can potentially be modifying the
261 * initialised size from a very small value to a really huge
262 * value, e.g.
263 * f = open(somefile, O_TRUNC);
264 * truncate(f, 10GiB);
265 * seek(f, 10GiB);
266 * write(f, 1);
267 * And this would mean we would be marking dirty hundreds of
268 * thousands of pages or as in the above example more than
269 * two and a half million pages!
270 *
271 * TODO: For sparse pages could optimize this workload by using
272 * the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This
273 * would be set in readpage for sparse pages and here we would
274 * not need to mark dirty any pages which have this bit set.
275 * The only caveat is that we have to clear the bit everywhere
276 * where we allocate any clusters that lie in the page or that
277 * contain the page.
278 *
279 * TODO: An even greater optimization would be for us to only
280 * call readpage() on pages which are not in sparse regions as
281 * determined from the runlist. This would greatly reduce the
282 * number of pages we read and make dirty in the case of sparse
283 * files.
284 */
285 balance_dirty_pages_ratelimited(mapping);
286 cond_resched();
287 } while (++index < end_index);
288 read_lock_irqsave(&ni->size_lock, flags);
289 BUG_ON(ni->initialized_size != new_init_size);
290 read_unlock_irqrestore(&ni->size_lock, flags);
291 /* Now bring in sync the initialized_size in the mft record. */
292 m = map_mft_record(base_ni);
293 if (IS_ERR(m)) {
294 err = PTR_ERR(m);
295 m = NULL;
296 goto init_err_out;
297 }
298 ctx = ntfs_attr_get_search_ctx(base_ni, m);
299 if (unlikely(!ctx)) {
300 err = -ENOMEM;
301 goto init_err_out;
302 }
303 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
304 CASE_SENSITIVE, 0, NULL, 0, ctx);
305 if (unlikely(err)) {
306 if (err == -ENOENT)
307 err = -EIO;
308 goto init_err_out;
309 }
310 m = ctx->mrec;
311 a = ctx->attr;
312 BUG_ON(!a->non_resident);
313 a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size);
314done:
315 flush_dcache_mft_record_page(ctx->ntfs_ino);
316 mark_mft_record_dirty(ctx->ntfs_ino);
317 if (ctx)
318 ntfs_attr_put_search_ctx(ctx);
319 if (m)
320 unmap_mft_record(base_ni);
321 ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.",
322 (unsigned long long)new_init_size, i_size_read(vi));
323 return 0;
324init_err_out:
325 write_lock_irqsave(&ni->size_lock, flags);
326 ni->initialized_size = old_init_size;
327 write_unlock_irqrestore(&ni->size_lock, flags);
328err_out:
329 if (ctx)
330 ntfs_attr_put_search_ctx(ctx);
331 if (m)
332 unmap_mft_record(base_ni);
333 ntfs_debug("Failed. Returning error code %i.", err);
334 return err;
335}
336
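A buffered write starting beyond the initialized size would call this helper first; roughly (an illustrative sketch, with pos, init_size, cached_page and lru_pvec assumed write-path variables):

	read_lock_irqsave(&ni->size_lock, flags);
	init_size = ni->initialized_size;
	read_unlock_irqrestore(&ni->size_lock, flags);
	if (pos > init_size) {
		/* Zero the region from the old initialized size to @pos. */
		err = ntfs_attr_extend_initialized(ni, pos, &cached_page,
				&lru_pvec);
		if (unlikely(err < 0))
			goto out;	/* nothing has been written yet */
	}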
337/**
338 * ntfs_fault_in_pages_readable - fault userspace pages into pagetables
339 *
340 * Fault a number of userspace pages into pagetables.
341 *
342 * Unlike include/linux/pagemap.h::fault_in_pages_readable(), this one copes
343 * with more than two userspace pages as well as handling the single page case
344 * elegantly.
345 *
346 * If you find this difficult to understand, then think of the while loop being
347 * the following code, except that we do without the integer variable ret:
348 *
349 * do {
350 * ret = __get_user(c, uaddr);
351 * uaddr += PAGE_SIZE;
352 * } while (!ret && uaddr < end);
353 *
354 * Note, the final __get_user() may well run out-of-bounds of the user buffer,
355 * but _not_ out-of-bounds of the page the user buffer belongs to, and since
356 * this is only a read and not a write, and since it is still in the same page,
357 * it should not matter and this makes the code much simpler.
358 */
359static inline void ntfs_fault_in_pages_readable(const char __user *uaddr,
360 int bytes)
361{
362 const char __user *end;
363 volatile char c;
364
365 /* Set @end to the first byte outside the last page we care about. */
366 end = (const char __user*)PAGE_ALIGN((ptrdiff_t __user)uaddr + bytes);
367
368 while (!__get_user(c, uaddr) && (uaddr += PAGE_SIZE, uaddr < end))
369 ;
370}
371
372/**
373 * ntfs_fault_in_pages_readable_iovec - fault in pages for an array of iovecs
374 *
375 * Same as ntfs_fault_in_pages_readable() but operates on an array of iovecs.
376 */
377static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
378 size_t iov_ofs, int bytes)
379{
380 do {
381 const char __user *buf;
382 unsigned len;
383
384 buf = iov->iov_base + iov_ofs;
385 len = iov->iov_len - iov_ofs;
386 if (len > bytes)
387 len = bytes;
388 ntfs_fault_in_pages_readable(buf, len);
389 bytes -= len;
390 iov++;
391 iov_ofs = 0;
392 } while (bytes);
393}
394
395/**
396 * __ntfs_grab_cache_pages - obtain a number of locked pages
397 * @mapping: address space mapping from which to obtain page cache pages
398 * @index: starting index in @mapping at which to begin obtaining pages
399 * @nr_pages: number of page cache pages to obtain
400 * @pages: array of pages in which to return the obtained page cache pages
401 * @cached_page: allocated but as yet unused page
402 * @lru_pvec: lru-buffering pagevec of caller
403 *
404 * Obtain @nr_pages locked page cache pages from the mapping @mapping and
405 * starting at index @index.
406 *
407 * If a page is newly created, increment its refcount and add it to the
408 * caller's lru-buffering pagevec @lru_pvec.
409 *
410 * This is the same as mm/filemap.c::__grab_cache_page(), except that @nr_pages
411 * are obtained at once instead of just one page and that 0 is returned on
412 * success and -errno on error.
413 *
414 * Note, the page locks are obtained in ascending page index order.
415 */
416static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
417 pgoff_t index, const unsigned nr_pages, struct page **pages,
418 struct page **cached_page, struct pagevec *lru_pvec)
419{
420 int err, nr;
421
422 BUG_ON(!nr_pages);
423 err = nr = 0;
424 do {
425 pages[nr] = find_lock_page(mapping, index);
426 if (!pages[nr]) {
427 if (!*cached_page) {
428 *cached_page = page_cache_alloc(mapping);
429 if (unlikely(!*cached_page)) {
430 err = -ENOMEM;
431 goto err_out;
432 }
433 }
434 err = add_to_page_cache(*cached_page, mapping, index,
435 GFP_KERNEL);
436 if (unlikely(err)) {
437 if (err == -EEXIST)
438 continue;
439 goto err_out;
440 }
441 pages[nr] = *cached_page;
442 page_cache_get(*cached_page);
443 if (unlikely(!pagevec_add(lru_pvec, *cached_page)))
444 __pagevec_lru_add(lru_pvec);
445 *cached_page = NULL;
446 }
447 index++;
448 nr++;
449 } while (nr < nr_pages);
450out:
451 return err;
452err_out:
453 while (nr > 0) {
454 unlock_page(pages[--nr]);
455 page_cache_release(pages[nr]);
456 }
457 goto out;
458}
459
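A caller pairs this with an unlock and release of every obtained page once the copy is done; schematically (a sketch, with mapping, index, nr_pages, pages[], cached_page and lru_pvec assumed declared as in the write path):

	err = __ntfs_grab_cache_pages(mapping, index, nr_pages, pages,
			&cached_page, &lru_pvec);
	if (likely(!err)) {
		/* ... copy the user data into the locked pages ... */
		for (u = 0; u < nr_pages; u++) {
			unlock_page(pages[u]);
			page_cache_release(pages[u]);
		}
	}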
460static inline int ntfs_submit_bh_for_read(struct buffer_head *bh)
461{
462 lock_buffer(bh);
463 get_bh(bh);
464 bh->b_end_io = end_buffer_read_sync;
465 return submit_bh(READ, bh);
466}
467
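Reads issued through this helper are reaped later by waiting on each remembered buffer head; in outline (a sketch of the pattern used further below, with err, bh and the wait[]/wait_bh pair as in ntfs_prepare_pages_for_non_resident_write()):

	/* Issue the read and remember the buffer head. */
	ntfs_submit_bh_for_read(bh);
	*wait_bh++ = bh;
	/* Later, once all reads are submitted, wait for completion. */
	while (wait_bh > wait) {
		bh = *--wait_bh;
		wait_on_buffer(bh);
		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;
	}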
468/**
469 * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data
470 * @pages: array of destination pages
471 * @nr_pages: number of pages in @pages
472 * @pos: byte position in file at which the write begins
473 * @bytes: number of bytes to be written
474 *
475 * This is called for non-resident attributes from ntfs_file_buffered_write()
476 * with i_sem held on the inode (@pages[0]->mapping->host). There are
477 * @nr_pages pages in @pages which are locked but not kmap()ped. The source
478 * data has not yet been copied into the @pages.
479 *
480 * Need to fill any holes with actual clusters, allocate buffers if necessary,
481 * ensure all the buffers are mapped, and bring uptodate any buffers that are
482 * only partially being written to.
483 *
484 * If @nr_pages is greater than one, we are guaranteed that the cluster size is
485 * greater than PAGE_CACHE_SIZE, that all pages in @pages are entirely inside
486 * the same cluster and that they are the entirety of that cluster, and that
487 * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole.
488 *
489 * i_size is not to be modified yet.
490 *
491 * Return 0 on success or -errno on error.
492 */
493static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
494 unsigned nr_pages, s64 pos, size_t bytes)
495{
496 VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend;
497 LCN lcn;
498 s64 bh_pos, vcn_len, end, initialized_size;
499 sector_t lcn_block;
500 struct page *page;
501 struct inode *vi;
502 ntfs_inode *ni, *base_ni = NULL;
503 ntfs_volume *vol;
504 runlist_element *rl, *rl2;
505 struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
506 ntfs_attr_search_ctx *ctx = NULL;
507 MFT_RECORD *m = NULL;
508 ATTR_RECORD *a = NULL;
509 unsigned long flags;
510 u32 attr_rec_len = 0;
511 unsigned blocksize, u;
512 int err, mp_size;
513 BOOL rl_write_locked, was_hole, is_retry;
514 unsigned char blocksize_bits;
515 struct {
516 u8 runlist_merged:1;
517 u8 mft_attr_mapped:1;
518 u8 mp_rebuilt:1;
519 u8 attr_switched:1;
520 } status = { 0, 0, 0, 0 };
521
522 BUG_ON(!nr_pages);
523 BUG_ON(!pages);
524 BUG_ON(!*pages);
525 vi = pages[0]->mapping->host;
526 ni = NTFS_I(vi);
527 vol = ni->vol;
528 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
529 "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
530 vi->i_ino, ni->type, pages[0]->index, nr_pages,
531 (long long)pos, bytes);
532 blocksize_bits = vi->i_blkbits;
533 blocksize = 1 << blocksize_bits;
534 u = 0;
535 do {
536 struct page *page = pages[u];
537 /*
538 * create_empty_buffers() will create uptodate/dirty buffers if
539 * the page is uptodate/dirty.
540 */
541 if (!page_has_buffers(page)) {
542 create_empty_buffers(page, blocksize, 0);
543 if (unlikely(!page_has_buffers(page)))
544 return -ENOMEM;
545 }
546 } while (++u < nr_pages);
547 rl_write_locked = FALSE;
548 rl = NULL;
549 err = 0;
550 vcn = lcn = -1;
551 vcn_len = 0;
552 lcn_block = -1;
553 was_hole = FALSE;
554 cpos = pos >> vol->cluster_size_bits;
555 end = pos + bytes;
556 cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits;
557 /*
558 * Loop over each page and for each page over each buffer. Use goto to
559 * reduce indentation.
560 */
561 u = 0;
562do_next_page:
563 page = pages[u];
564 bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
565 bh = head = page_buffers(page);
566 do {
567 VCN cdelta;
568 s64 bh_end;
569 unsigned bh_cofs;
570
571 /* Clear buffer_new on all buffers to reinitialise state. */
572 if (buffer_new(bh))
573 clear_buffer_new(bh);
574 bh_end = bh_pos + blocksize;
575 bh_cpos = bh_pos >> vol->cluster_size_bits;
576 bh_cofs = bh_pos & vol->cluster_size_mask;
577 if (buffer_mapped(bh)) {
578 /*
579 * The buffer is already mapped. If it is uptodate,
580 * ignore it.
581 */
582 if (buffer_uptodate(bh))
583 continue;
584 /*
585 * The buffer is not uptodate. If the page is uptodate
586 * set the buffer uptodate and otherwise ignore it.
587 */
588 if (PageUptodate(page)) {
589 set_buffer_uptodate(bh);
590 continue;
591 }
592 /*
593 * Neither the page nor the buffer are uptodate. If
594 * the buffer is only partially being written to, we
595 * need to read it in before the write, i.e. now.
596 */
597 if ((bh_pos < pos && bh_end > pos) ||
598 (bh_pos < end && bh_end > end)) {
599 /*
600 * If the buffer is fully or partially within
601 * the initialized size, do an actual read.
602 * Otherwise, simply zero the buffer.
603 */
604 read_lock_irqsave(&ni->size_lock, flags);
605 initialized_size = ni->initialized_size;
606 read_unlock_irqrestore(&ni->size_lock, flags);
607 if (bh_pos < initialized_size) {
608 ntfs_submit_bh_for_read(bh);
609 *wait_bh++ = bh;
610 } else {
611 u8 *kaddr = kmap_atomic(page, KM_USER0);
612 memset(kaddr + bh_offset(bh), 0,
613 blocksize);
614 kunmap_atomic(kaddr, KM_USER0);
615 flush_dcache_page(page);
616 set_buffer_uptodate(bh);
617 }
618 }
619 continue;
620 }
621 /* Unmapped buffer. Need to map it. */
622 bh->b_bdev = vol->sb->s_bdev;
623 /*
624 * If the current buffer is in the same clusters as the map
625 * cache, there is no need to check the runlist again. The
626 * map cache is made up of @vcn, which is the first cached file
627 * cluster, @vcn_len which is the number of cached file
628 * clusters, @lcn is the device cluster corresponding to @vcn,
629 * and @lcn_block is the block number corresponding to @lcn.
630 */
631 cdelta = bh_cpos - vcn;
632 if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) {
633map_buffer_cached:
634 BUG_ON(lcn < 0);
635 bh->b_blocknr = lcn_block +
636 (cdelta << (vol->cluster_size_bits -
637 blocksize_bits)) +
638 (bh_cofs >> blocksize_bits);
639 set_buffer_mapped(bh);
640 /*
641 * If the page is uptodate so is the buffer. If the
642 * buffer is fully outside the write, we ignore it if
643 * it was already allocated and we mark it dirty so it
644 * gets written out if we allocated it. On the other
645 * hand, if we allocated the buffer but we are not
646 * marking it dirty we set buffer_new so we can do
647 * error recovery.
648 */
649 if (PageUptodate(page)) {
650 if (!buffer_uptodate(bh))
651 set_buffer_uptodate(bh);
652 if (unlikely(was_hole)) {
653 /* We allocated the buffer. */
654 unmap_underlying_metadata(bh->b_bdev,
655 bh->b_blocknr);
656 if (bh_end <= pos || bh_pos >= end)
657 mark_buffer_dirty(bh);
658 else
659 set_buffer_new(bh);
660 }
661 continue;
662 }
663 /* Page is _not_ uptodate. */
664 if (likely(!was_hole)) {
665 /*
666 * Buffer was already allocated. If it is not
667 * uptodate and is only partially being written
668 * to, we need to read it in before the write,
669 * i.e. now.
670 */
671 if (!buffer_uptodate(bh) && ((bh_pos < pos &&
672 bh_end > pos) ||
673						(bh_pos < end &&
674						bh_end > end))) {
675 /*
676 * If the buffer is fully or partially
677 * within the initialized size, do an
678 * actual read. Otherwise, simply zero
679 * the buffer.
680 */
681 read_lock_irqsave(&ni->size_lock,
682 flags);
683 initialized_size = ni->initialized_size;
684 read_unlock_irqrestore(&ni->size_lock,
685 flags);
686 if (bh_pos < initialized_size) {
687 ntfs_submit_bh_for_read(bh);
688 *wait_bh++ = bh;
689 } else {
690 u8 *kaddr = kmap_atomic(page,
691 KM_USER0);
692 memset(kaddr + bh_offset(bh),
693 0, blocksize);
694 kunmap_atomic(kaddr, KM_USER0);
695 flush_dcache_page(page);
696 set_buffer_uptodate(bh);
697 }
698 }
699 continue;
700 }
701 /* We allocated the buffer. */
702 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
703 /*
704 * If the buffer is fully outside the write, zero it,
705 * set it uptodate, and mark it dirty so it gets
706 * written out. If it is partially being written to,
707			 * zero the region surrounding the write but leave it to
708 * commit write to do anything else. Finally, if the
709 * buffer is fully being overwritten, do nothing.
710 */
711 if (bh_end <= pos || bh_pos >= end) {
712 if (!buffer_uptodate(bh)) {
713 u8 *kaddr = kmap_atomic(page, KM_USER0);
714 memset(kaddr + bh_offset(bh), 0,
715 blocksize);
716 kunmap_atomic(kaddr, KM_USER0);
717 flush_dcache_page(page);
718 set_buffer_uptodate(bh);
719 }
720 mark_buffer_dirty(bh);
721 continue;
722 }
723 set_buffer_new(bh);
724 if (!buffer_uptodate(bh) &&
725 (bh_pos < pos || bh_end > end)) {
726 u8 *kaddr;
727 unsigned pofs;
728
729 kaddr = kmap_atomic(page, KM_USER0);
730 if (bh_pos < pos) {
731 pofs = bh_pos & ~PAGE_CACHE_MASK;
732 memset(kaddr + pofs, 0, pos - bh_pos);
733 }
734 if (bh_end > end) {
735 pofs = end & ~PAGE_CACHE_MASK;
736 memset(kaddr + pofs, 0, bh_end - end);
737 }
738 kunmap_atomic(kaddr, KM_USER0);
739 flush_dcache_page(page);
740 }
741 continue;
742 }
743 /*
744 * Slow path: this is the first buffer in the cluster. If it
745		 * is outside the allocated size and is not uptodate, zero it and
746 * set it uptodate.
747 */
748 read_lock_irqsave(&ni->size_lock, flags);
749 initialized_size = ni->allocated_size;
750 read_unlock_irqrestore(&ni->size_lock, flags);
751 if (bh_pos > initialized_size) {
752 if (PageUptodate(page)) {
753 if (!buffer_uptodate(bh))
754 set_buffer_uptodate(bh);
755 } else if (!buffer_uptodate(bh)) {
756 u8 *kaddr = kmap_atomic(page, KM_USER0);
757 memset(kaddr + bh_offset(bh), 0, blocksize);
758 kunmap_atomic(kaddr, KM_USER0);
759 flush_dcache_page(page);
760 set_buffer_uptodate(bh);
761 }
762 continue;
763 }
764 is_retry = FALSE;
765 if (!rl) {
766 down_read(&ni->runlist.lock);
767retry_remap:
768 rl = ni->runlist.rl;
769 }
770 if (likely(rl != NULL)) {
771 /* Seek to element containing target cluster. */
772 while (rl->length && rl[1].vcn <= bh_cpos)
773 rl++;
774 lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos);
775 if (likely(lcn >= 0)) {
776 /*
777 * Successful remap, setup the map cache and
778 * use that to deal with the buffer.
779 */
780 was_hole = FALSE;
781 vcn = bh_cpos;
782 vcn_len = rl[1].vcn - vcn;
783 lcn_block = lcn << (vol->cluster_size_bits -
784 blocksize_bits);
785 cdelta = 0;
786 /*
787 * If the number of remaining clusters in the
788 * @pages is smaller or equal to the number of
789 * cached clusters, unlock the runlist as the
790 * map cache will be used from now on.
791 */
792 if (likely(vcn + vcn_len >= cend)) {
793 if (rl_write_locked) {
794 up_write(&ni->runlist.lock);
795 rl_write_locked = FALSE;
796 } else
797 up_read(&ni->runlist.lock);
798 rl = NULL;
799 }
800 goto map_buffer_cached;
801 }
802 } else
803 lcn = LCN_RL_NOT_MAPPED;
804 /*
805 * If it is not a hole and not out of bounds, the runlist is
806 * probably unmapped so try to map it now.
807 */
808 if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) {
809 if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) {
810 /* Attempt to map runlist. */
811 if (!rl_write_locked) {
812 /*
813 * We need the runlist locked for
814 * writing, so if it is locked for
815 * reading relock it now and retry in
816 * case it changed whilst we dropped
817 * the lock.
818 */
819 up_read(&ni->runlist.lock);
820 down_write(&ni->runlist.lock);
821 rl_write_locked = TRUE;
822 goto retry_remap;
823 }
824 err = ntfs_map_runlist_nolock(ni, bh_cpos,
825 NULL);
826 if (likely(!err)) {
827 is_retry = TRUE;
828 goto retry_remap;
829 }
830 /*
831 * If @vcn is out of bounds, pretend @lcn is
832 * LCN_ENOENT. As long as the buffer is out
833 * of bounds this will work fine.
834 */
835 if (err == -ENOENT) {
836 lcn = LCN_ENOENT;
837 err = 0;
838 goto rl_not_mapped_enoent;
839 }
840 } else
841 err = -EIO;
842 /* Failed to map the buffer, even after retrying. */
843 bh->b_blocknr = -1;
844 ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
845 "attribute type 0x%x, vcn 0x%llx, "
846 "vcn offset 0x%x, because its "
847 "location on disk could not be "
848 "determined%s (error code %i).",
849 ni->mft_no, ni->type,
850 (unsigned long long)bh_cpos,
851 (unsigned)bh_pos &
852 vol->cluster_size_mask,
853 is_retry ? " even after retrying" : "",
854 err);
855 break;
856 }
857rl_not_mapped_enoent:
858 /*
859 * The buffer is in a hole or out of bounds. We need to fill
860 * the hole, unless the buffer is in a cluster which is not
861 * touched by the write, in which case we just leave the buffer
862 * unmapped. This can only happen when the cluster size is
863 * less than the page cache size.
864 */
865 if (unlikely(vol->cluster_size < PAGE_CACHE_SIZE)) {
866 bh_cend = (bh_end + vol->cluster_size - 1) >>
867 vol->cluster_size_bits;
868 if ((bh_cend <= cpos || bh_cpos >= cend)) {
869 bh->b_blocknr = -1;
870 /*
871 * If the buffer is uptodate we skip it. If it
872 * is not but the page is uptodate, we can set
873 * the buffer uptodate. If the page is not
874 * uptodate, we can clear the buffer and set it
875 * uptodate. Whether this is worthwhile is
876 * debatable and this could be removed.
877 */
878 if (PageUptodate(page)) {
879 if (!buffer_uptodate(bh))
880 set_buffer_uptodate(bh);
881 } else if (!buffer_uptodate(bh)) {
882 u8 *kaddr = kmap_atomic(page, KM_USER0);
883 memset(kaddr + bh_offset(bh), 0,
884 blocksize);
885 kunmap_atomic(kaddr, KM_USER0);
886 flush_dcache_page(page);
887 set_buffer_uptodate(bh);
888 }
889 continue;
890 }
891 }
892 /*
893		 * An out of bounds lcn was handled above, so the only valid
894		 * case left here is a hole.
895 */
896 BUG_ON(lcn != LCN_HOLE);
897 /*
898 * We need the runlist locked for writing, so if it is locked
899 * for reading relock it now and retry in case it changed
900 * whilst we dropped the lock.
901 */
902 BUG_ON(!rl);
903 if (!rl_write_locked) {
904 up_read(&ni->runlist.lock);
905 down_write(&ni->runlist.lock);
906 rl_write_locked = TRUE;
907 goto retry_remap;
908 }
909 /* Find the previous last allocated cluster. */
910 BUG_ON(rl->lcn != LCN_HOLE);
911 lcn = -1;
912 rl2 = rl;
913 while (--rl2 >= ni->runlist.rl) {
914 if (rl2->lcn >= 0) {
915 lcn = rl2->lcn + rl2->length;
916 break;
917 }
918 }
919 rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE,
920 FALSE);
921 if (IS_ERR(rl2)) {
922 err = PTR_ERR(rl2);
923 ntfs_debug("Failed to allocate cluster, error code %i.",
924 err);
925 break;
926 }
927 lcn = rl2->lcn;
928 rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
929 if (IS_ERR(rl)) {
930 err = PTR_ERR(rl);
931 if (err != -ENOMEM)
932 err = -EIO;
933 if (ntfs_cluster_free_from_rl(vol, rl2)) {
934 ntfs_error(vol->sb, "Failed to release "
935 "allocated cluster in error "
936 "code path. Run chkdsk to "
937 "recover the lost cluster.");
938 NVolSetErrors(vol);
939 }
940 ntfs_free(rl2);
941 break;
942 }
943 ni->runlist.rl = rl;
944 status.runlist_merged = 1;
945 ntfs_debug("Allocated cluster, lcn 0x%llx.", lcn);
946 /* Map and lock the mft record and get the attribute record. */
947 if (!NInoAttr(ni))
948 base_ni = ni;
949 else
950 base_ni = ni->ext.base_ntfs_ino;
951 m = map_mft_record(base_ni);
952 if (IS_ERR(m)) {
953 err = PTR_ERR(m);
954 break;
955 }
956 ctx = ntfs_attr_get_search_ctx(base_ni, m);
957 if (unlikely(!ctx)) {
958 err = -ENOMEM;
959 unmap_mft_record(base_ni);
960 break;
961 }
962 status.mft_attr_mapped = 1;
963 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
964 CASE_SENSITIVE, bh_cpos, NULL, 0, ctx);
965 if (unlikely(err)) {
966 if (err == -ENOENT)
967 err = -EIO;
968 break;
969 }
970 m = ctx->mrec;
971 a = ctx->attr;
972 /*
973 * Find the runlist element with which the attribute extent
974 * starts. Note, we cannot use the _attr_ version because we
975 * have mapped the mft record. That is ok because we know the
976 * runlist fragment must be mapped already to have ever gotten
977 * here, so we can just use the _rl_ version.
978 */
979 vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn);
980 rl2 = ntfs_rl_find_vcn_nolock(rl, vcn);
981 BUG_ON(!rl2);
982 BUG_ON(!rl2->length);
983 BUG_ON(rl2->lcn < LCN_HOLE);
984 highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
985 /*
986 * If @highest_vcn is zero, calculate the real highest_vcn
987 * (which can really be zero).
988 */
989 if (!highest_vcn)
990 highest_vcn = (sle64_to_cpu(
991 a->data.non_resident.allocated_size) >>
992 vol->cluster_size_bits) - 1;
993 /*
994 * Determine the size of the mapping pairs array for the new
995 * extent, i.e. the old extent with the hole filled.
996 */
997 mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn,
998 highest_vcn);
999 if (unlikely(mp_size <= 0)) {
1000 if (!(err = mp_size))
1001 err = -EIO;
1002 ntfs_debug("Failed to get size for mapping pairs "
1003 "array, error code %i.", err);
1004 break;
1005 }
1006 /*
1007 * Resize the attribute record to fit the new mapping pairs
1008 * array.
1009 */
1010 attr_rec_len = le32_to_cpu(a->length);
1011 err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu(
1012 a->data.non_resident.mapping_pairs_offset));
1013 if (unlikely(err)) {
1014 BUG_ON(err != -ENOSPC);
1015 // TODO: Deal with this by using the current attribute
1016 // and fill it with as much of the mapping pairs
1017 // array as possible. Then loop over each attribute
1018 // extent rewriting the mapping pairs arrays as we go
1019 // along and if when we reach the end we have not
1020 // enough space, try to resize the last attribute
1021 // extent and if even that fails, add a new attribute
1022 // extent.
1023 // We could also try to resize at each step in the hope
1024 // that we will not need to rewrite every single extent.
1025 // Note, we may need to decompress some extents to fill
1026 // the runlist as we are walking the extents...
1027 ntfs_error(vol->sb, "Not enough space in the mft "
1028 "record for the extended attribute "
1029 "record. This case is not "
1030 "implemented yet.");
1031 err = -EOPNOTSUPP;
1032			break;
1033 }
1034 status.mp_rebuilt = 1;
1035 /*
1036 * Generate the mapping pairs array directly into the attribute
1037 * record.
1038 */
1039 err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
1040 a->data.non_resident.mapping_pairs_offset),
1041 mp_size, rl2, vcn, highest_vcn, NULL);
1042 if (unlikely(err)) {
1043 ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, "
1044 "attribute type 0x%x, because building "
1045 "the mapping pairs failed with error "
1046 "code %i.", vi->i_ino,
1047 (unsigned)le32_to_cpu(ni->type), err);
1048 err = -EIO;
1049 break;
1050 }
1051 /* Update the highest_vcn but only if it was not set. */
1052 if (unlikely(!a->data.non_resident.highest_vcn))
1053 a->data.non_resident.highest_vcn =
1054 cpu_to_sle64(highest_vcn);
1055 /*
1056 * If the attribute is sparse/compressed, update the compressed
1057 * size in the ntfs_inode structure and the attribute record.
1058 */
1059 if (likely(NInoSparse(ni) || NInoCompressed(ni))) {
1060 /*
1061 * If we are not in the first attribute extent, switch
1062 * to it, but first ensure the changes will make it to
1063 * disk later.
1064 */
1065 if (a->data.non_resident.lowest_vcn) {
1066 flush_dcache_mft_record_page(ctx->ntfs_ino);
1067 mark_mft_record_dirty(ctx->ntfs_ino);
1068 ntfs_attr_reinit_search_ctx(ctx);
1069 err = ntfs_attr_lookup(ni->type, ni->name,
1070 ni->name_len, CASE_SENSITIVE,
1071 0, NULL, 0, ctx);
1072 if (unlikely(err)) {
1073 status.attr_switched = 1;
1074 break;
1075 }
1076 /* @m is not used any more so do not set it. */
1077 a = ctx->attr;
1078 }
1079 write_lock_irqsave(&ni->size_lock, flags);
1080 ni->itype.compressed.size += vol->cluster_size;
1081 a->data.non_resident.compressed_size =
1082 cpu_to_sle64(ni->itype.compressed.size);
1083 write_unlock_irqrestore(&ni->size_lock, flags);
1084 }
1085 /* Ensure the changes make it to disk. */
1086 flush_dcache_mft_record_page(ctx->ntfs_ino);
1087 mark_mft_record_dirty(ctx->ntfs_ino);
1088 ntfs_attr_put_search_ctx(ctx);
1089 unmap_mft_record(base_ni);
1090 /* Successfully filled the hole. */
1091 status.runlist_merged = 0;
1092 status.mft_attr_mapped = 0;
1093 status.mp_rebuilt = 0;
1094 /* Setup the map cache and use that to deal with the buffer. */
1095 was_hole = TRUE;
1096 vcn = bh_cpos;
1097 vcn_len = 1;
1098 lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits);
1099 cdelta = 0;
1100 /*
1101 * If the number of remaining clusters in the @pages is smaller
1102 * or equal to the number of cached clusters, unlock the
1103 * runlist as the map cache will be used from now on.
1104 */
1105 if (likely(vcn + vcn_len >= cend)) {
1106 up_write(&ni->runlist.lock);
1107 rl_write_locked = FALSE;
1108 rl = NULL;
1109 }
1110 goto map_buffer_cached;
1111 } while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
1112 /* If there are no errors, do the next page. */
1113 if (likely(!err && ++u < nr_pages))
1114 goto do_next_page;
1115 /* If there are no errors, release the runlist lock if we took it. */
1116 if (likely(!err)) {
1117 if (unlikely(rl_write_locked)) {
1118 up_write(&ni->runlist.lock);
1119 rl_write_locked = FALSE;
1120 } else if (unlikely(rl))
1121 up_read(&ni->runlist.lock);
1122 rl = NULL;
1123 }
1124 /* If we issued read requests, let them complete. */
1125 read_lock_irqsave(&ni->size_lock, flags);
1126 initialized_size = ni->initialized_size;
1127 read_unlock_irqrestore(&ni->size_lock, flags);
1128 while (wait_bh > wait) {
1129 bh = *--wait_bh;
1130 wait_on_buffer(bh);
1131 if (likely(buffer_uptodate(bh))) {
1132 page = bh->b_page;
1133 bh_pos = ((s64)page->index << PAGE_CACHE_SHIFT) +
1134 bh_offset(bh);
1135 /*
1136 * If the buffer overflows the initialized size, need
1137 * to zero the overflowing region.
1138 */
1139 if (unlikely(bh_pos + blocksize > initialized_size)) {
1140 u8 *kaddr;
1141 int ofs = 0;
1142
1143 if (likely(bh_pos < initialized_size))
1144 ofs = initialized_size - bh_pos;
1145 kaddr = kmap_atomic(page, KM_USER0);
1146 memset(kaddr + bh_offset(bh) + ofs, 0,
1147 blocksize - ofs);
1148 kunmap_atomic(kaddr, KM_USER0);
1149 flush_dcache_page(page);
1150 }
1151 } else /* if (unlikely(!buffer_uptodate(bh))) */
1152 err = -EIO;
1153 }
1154 if (likely(!err)) {
1155 /* Clear buffer_new on all buffers. */
1156 u = 0;
1157 do {
1158 bh = head = page_buffers(pages[u]);
1159 do {
1160 if (buffer_new(bh))
1161 clear_buffer_new(bh);
1162 } while ((bh = bh->b_this_page) != head);
1163 } while (++u < nr_pages);
1164 ntfs_debug("Done.");
1165 return err;
1166 }
1167 if (status.attr_switched) {
1168 /* Get back to the attribute extent we modified. */
1169 ntfs_attr_reinit_search_ctx(ctx);
1170 if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1171 CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) {
1172 ntfs_error(vol->sb, "Failed to find required "
1173 "attribute extent of attribute in "
1174 "error code path. Run chkdsk to "
1175 "recover.");
1176 write_lock_irqsave(&ni->size_lock, flags);
1177 ni->itype.compressed.size += vol->cluster_size;
1178 write_unlock_irqrestore(&ni->size_lock, flags);
1179 flush_dcache_mft_record_page(ctx->ntfs_ino);
1180 mark_mft_record_dirty(ctx->ntfs_ino);
1181 /*
1182 * The only thing that is now wrong is the compressed
1183 * size of the base attribute extent which chkdsk
1184 * should be able to fix.
1185 */
1186 NVolSetErrors(vol);
1187 } else {
1188 m = ctx->mrec;
1189 a = ctx->attr;
1190 status.attr_switched = 0;
1191 }
1192 }
1193 /*
1194 * If the runlist has been modified, need to restore it by punching a
1195 * hole into it and we then need to deallocate the on-disk cluster as
1196 * well. Note, we only modify the runlist if we are able to generate a
1197 * new mapping pairs array, i.e. only when the mapped attribute extent
1198 * is not switched.
1199 */
1200 if (status.runlist_merged && !status.attr_switched) {
1201 BUG_ON(!rl_write_locked);
1202 /* Make the file cluster we allocated sparse in the runlist. */
1203 if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) {
1204 ntfs_error(vol->sb, "Failed to punch hole into "
1205 "attribute runlist in error code "
1206 "path. Run chkdsk to recover the "
1207 "lost cluster.");
1208 make_bad_inode(vi);
1209 make_bad_inode(VFS_I(base_ni));
1210 NVolSetErrors(vol);
1211 } else /* if (success) */ {
1212 status.runlist_merged = 0;
1213 /*
1214 * Deallocate the on-disk cluster we allocated but only
1215 * if we succeeded in punching its vcn out of the
1216 * runlist.
1217 */
1218 down_write(&vol->lcnbmp_lock);
1219 if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
1220 ntfs_error(vol->sb, "Failed to release "
1221 "allocated cluster in error "
1222 "code path. Run chkdsk to "
1223 "recover the lost cluster.");
1224 NVolSetErrors(vol);
1225 }
1226 up_write(&vol->lcnbmp_lock);
1227 }
1228 }
1229 /*
1230 * Resize the attribute record to its old size and rebuild the mapping
1231 * pairs array. Note, we only can do this if the runlist has been
1232 * restored to its old state which also implies that the mapped
1233 * attribute extent is not switched.
1234 */
1235 if (status.mp_rebuilt && !status.runlist_merged) {
1236 if (ntfs_attr_record_resize(m, a, attr_rec_len)) {
1237 ntfs_error(vol->sb, "Failed to restore attribute "
1238 "record in error code path. Run "
1239 "chkdsk to recover.");
1240 make_bad_inode(vi);
1241 make_bad_inode(VFS_I(base_ni));
1242 NVolSetErrors(vol);
1243 } else /* if (success) */ {
1244 if (ntfs_mapping_pairs_build(vol, (u8*)a +
1245 le16_to_cpu(a->data.non_resident.
1246 mapping_pairs_offset), attr_rec_len -
1247 le16_to_cpu(a->data.non_resident.
1248 mapping_pairs_offset), ni->runlist.rl,
1249 vcn, highest_vcn, NULL)) {
1250 ntfs_error(vol->sb, "Failed to restore "
1251 "mapping pairs array in error "
1252 "code path. Run chkdsk to "
1253 "recover.");
1254 make_bad_inode(vi);
1255 make_bad_inode(VFS_I(base_ni));
1256 NVolSetErrors(vol);
1257 }
1258 flush_dcache_mft_record_page(ctx->ntfs_ino);
1259 mark_mft_record_dirty(ctx->ntfs_ino);
1260 }
1261 }
1262 /* Release the mft record and the attribute. */
1263 if (status.mft_attr_mapped) {
1264 ntfs_attr_put_search_ctx(ctx);
1265 unmap_mft_record(base_ni);
1266 }
1267 /* Release the runlist lock. */
1268 if (rl_write_locked)
1269 up_write(&ni->runlist.lock);
1270 else if (rl)
1271 up_read(&ni->runlist.lock);
1272 /*
1273 * Zero out any newly allocated blocks to avoid exposing stale data.
1274 * If BH_New is set, we know that the block was newly allocated above
1275 * and that it has not been fully zeroed and marked dirty yet.
1276 */
1277 nr_pages = u;
1278 u = 0;
1279 end = bh_cpos << vol->cluster_size_bits;
1280 do {
1281 page = pages[u];
1282 bh = head = page_buffers(page);
1283 do {
1284 if (u == nr_pages &&
1285 ((s64)page->index << PAGE_CACHE_SHIFT) +
1286 bh_offset(bh) >= end)
1287 break;
1288 if (!buffer_new(bh))
1289 continue;
1290 clear_buffer_new(bh);
1291 if (!buffer_uptodate(bh)) {
1292 if (PageUptodate(page))
1293 set_buffer_uptodate(bh);
1294 else {
1295 u8 *kaddr = kmap_atomic(page, KM_USER0);
1296 memset(kaddr + bh_offset(bh), 0,
1297 blocksize);
1298 kunmap_atomic(kaddr, KM_USER0);
1299 flush_dcache_page(page);
1300 set_buffer_uptodate(bh);
1301 }
1302 }
1303 mark_buffer_dirty(bh);
1304 } while ((bh = bh->b_this_page) != head);
1305 } while (++u <= nr_pages);
1306 ntfs_error(vol->sb, "Failed. Returning error code %i.", err);
1307 return err;
1308}
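
Once the map cache is primed, mapping further buffers in the same run is pure
shift arithmetic, as the code at map_buffer_cached shows. A minimal user-space
sketch of that arithmetic follows; the function name, the constants, and the
example run are all invented for illustration:

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	#define CLUSTER_SIZE_BITS	12	/* example: 4096-byte clusters */
	#define BLOCKSIZE_BITS		9	/* example: 512-byte blocks */

	/* Given a cached run (@vcn, @vcn_len, @lcn_block), compute the device
	   block backing the buffer at file byte offset @bh_pos. */
	static int64_t map_buffer(int64_t bh_pos, int64_t vcn, int64_t vcn_len,
			int64_t lcn_block)
	{
		int64_t bh_cpos = bh_pos >> CLUSTER_SIZE_BITS;
		unsigned bh_cofs = bh_pos & ((1 << CLUSTER_SIZE_BITS) - 1);
		int64_t cdelta = bh_cpos - vcn;

		/* Cache hit only if the buffer's cluster lies inside the run. */
		assert(cdelta >= 0 && cdelta < vcn_len);
		return lcn_block + (cdelta << (CLUSTER_SIZE_BITS - BLOCKSIZE_BITS)) +
				(bh_cofs >> BLOCKSIZE_BITS);
	}

	int main(void)
	{
		/* Run: file clusters [8, 12) start at device block 1000; the
		   buffer at byte 8*4096+1024 lands two blocks into the run. */
		printf("block %lld\n",
				(long long)map_buffer(8 * 4096 + 1024, 8, 4, 1000));
		return 0;
	}
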
1309
1310/*
1311 * Copy as much as we can into the pages and return the number of bytes which
1312 * were successfully copied. If a fault is encountered then clear the pages
1313 * out to (ofs + bytes) and return the number of bytes which were copied.
1314 */
1315static inline size_t ntfs_copy_from_user(struct page **pages,
1316 unsigned nr_pages, unsigned ofs, const char __user *buf,
1317 size_t bytes)
1318{
1319 struct page **last_page = pages + nr_pages;
1320 char *kaddr;
1321 size_t total = 0;
1322 unsigned len;
1323 int left;
1324
1325 do {
1326 len = PAGE_CACHE_SIZE - ofs;
1327 if (len > bytes)
1328 len = bytes;
1329 kaddr = kmap_atomic(*pages, KM_USER0);
1330 left = __copy_from_user_inatomic(kaddr + ofs, buf, len);
1331 kunmap_atomic(kaddr, KM_USER0);
1332 if (unlikely(left)) {
1333 /* Do it the slow way. */
1334 kaddr = kmap(*pages);
1335 left = __copy_from_user(kaddr + ofs, buf, len);
1336 kunmap(*pages);
1337 if (unlikely(left))
1338 goto err_out;
1339 }
1340 total += len;
1341 bytes -= len;
1342 if (!bytes)
1343 break;
1344 buf += len;
1345 ofs = 0;
1346 } while (++pages < last_page);
1347out:
1348 return total;
1349err_out:
1350 total += len - left;
1351 /* Zero the rest of the target like __copy_from_user(). */
1352 while (++pages < last_page) {
1353 bytes -= len;
1354 if (!bytes)
1355 break;
1356 len = PAGE_CACHE_SIZE;
1357 if (len > bytes)
1358 len = bytes;
1359 kaddr = kmap_atomic(*pages, KM_USER0);
1360 memset(kaddr, 0, len);
1361 kunmap_atomic(kaddr, KM_USER0);
1362 }
1363 goto out;
1364}
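
The err_out path above preserves the contract of __copy_from_user(): report
how many bytes really arrived and leave no uninitialized destination bytes
behind. A user-space sketch of the same contract, where copy_partial() is an
invented stand-in for a copy that faults partway through:

	#include <stdio.h>
	#include <string.h>

	/* Stand-in for __copy_from_user(): copies a prefix and returns the
	   number of bytes it could NOT copy. */
	static size_t copy_partial(char *dst, const char *src, size_t len,
			size_t fault_after)
	{
		size_t n = len < fault_after ? len : fault_after;

		memcpy(dst, src, n);
		return len - n;		/* bytes left uncopied */
	}

	int main(void)
	{
		char dst[16], src[16] = "abcdefghijklmno";
		size_t left = copy_partial(dst, src, sizeof(dst), 10);

		/* Zero the rest of the target like __copy_from_user(). */
		memset(dst + sizeof(dst) - left, 0, left);
		printf("copied %zu, zeroed %zu\n", sizeof(dst) - left, left);
		return 0;
	}
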
1365
1366static size_t __ntfs_copy_from_user_iovec(char *vaddr,
1367 const struct iovec *iov, size_t iov_ofs, size_t bytes)
1368{
1369 size_t total = 0;
1370
1371 while (1) {
1372 const char __user *buf = iov->iov_base + iov_ofs;
1373 unsigned len;
1374 size_t left;
1375
1376 len = iov->iov_len - iov_ofs;
1377 if (len > bytes)
1378 len = bytes;
1379 left = __copy_from_user_inatomic(vaddr, buf, len);
1380 total += len;
1381 bytes -= len;
1382 vaddr += len;
1383 if (unlikely(left)) {
1384 /*
1385 * Zero the rest of the target like __copy_from_user().
1386 */
1387 memset(vaddr, 0, bytes);
1388 total -= left;
1389 break;
1390 }
1391 if (!bytes)
1392 break;
1393 iov++;
1394 iov_ofs = 0;
1395 }
1396 return total;
1397}
1398
1399static inline void ntfs_set_next_iovec(const struct iovec **iovp,
1400 size_t *iov_ofsp, size_t bytes)
1401{
1402 const struct iovec *iov = *iovp;
1403 size_t iov_ofs = *iov_ofsp;
1404
1405 while (bytes) {
1406 unsigned len;
1407
1408 len = iov->iov_len - iov_ofs;
1409 if (len > bytes)
1410 len = bytes;
1411 bytes -= len;
1412 iov_ofs += len;
1413 if (iov->iov_len == iov_ofs) {
1414 iov++;
1415 iov_ofs = 0;
1416 }
1417 }
1418 *iovp = iov;
1419 *iov_ofsp = iov_ofs;
1420}
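
ntfs_set_next_iovec() has no kernel dependencies, so its cursor walk can be
exercised verbatim in user space. In the sketch below only the test harness is
invented; it advances the (segment, offset) cursor four bytes, crossing a
segment boundary:

	#include <stdio.h>
	#include <sys/uio.h>

	static void set_next_iovec(const struct iovec **iovp, size_t *iov_ofsp,
			size_t bytes)
	{
		const struct iovec *iov = *iovp;
		size_t iov_ofs = *iov_ofsp;

		while (bytes) {
			unsigned len;

			len = iov->iov_len - iov_ofs;
			if (len > bytes)
				len = bytes;
			bytes -= len;
			iov_ofs += len;
			if (iov->iov_len == iov_ofs) {
				iov++;
				iov_ofs = 0;
			}
		}
		*iovp = iov;
		*iov_ofsp = iov_ofs;
	}

	int main(void)
	{
		struct iovec v[2] = { { (void *)"abc", 3 }, { (void *)"defgh", 5 } };
		const struct iovec *iov = v;
		size_t ofs = 0;

		set_next_iovec(&iov, &ofs, 4);	/* consume "abc" + "d" */
		printf("segment %td, offset %zu\n", iov - v, ofs);
		return 0;
	}
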
1421
1422/*
1423 * This has the same side-effects and return value as ntfs_copy_from_user().
1424 * The difference is that on a fault we need to memset the remainder of the
1425 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
1426 * single-segment behaviour.
1427 *
1428 * We call the same helper (__ntfs_copy_from_user_iovec()) both when atomic and
1429 * when not atomic. This is ok because __ntfs_copy_from_user_iovec() calls
1430 * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In
1431 * fact, the only difference between __copy_from_user_inatomic() and
1432 * __copy_from_user() is that the latter calls might_sleep(). And on many
1433 * architectures __copy_from_user_inatomic() is just defined to
1434 * __copy_from_user() so it makes no difference at all on those architectures.
1435 */
1436static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
1437 unsigned nr_pages, unsigned ofs, const struct iovec **iov,
1438 size_t *iov_ofs, size_t bytes)
1439{
1440 struct page **last_page = pages + nr_pages;
1441 char *kaddr;
1442 size_t copied, len, total = 0;
1443
1444 do {
1445 len = PAGE_CACHE_SIZE - ofs;
1446 if (len > bytes)
1447 len = bytes;
1448 kaddr = kmap_atomic(*pages, KM_USER0);
1449 copied = __ntfs_copy_from_user_iovec(kaddr + ofs,
1450 *iov, *iov_ofs, len);
1451 kunmap_atomic(kaddr, KM_USER0);
1452 if (unlikely(copied != len)) {
1453 /* Do it the slow way. */
1454 kaddr = kmap(*pages);
1455 copied = __ntfs_copy_from_user_iovec(kaddr + ofs,
1456 *iov, *iov_ofs, len);
1457 kunmap(*pages);
1458 if (unlikely(copied != len))
1459 goto err_out;
1460 }
1461 total += len;
1462 bytes -= len;
1463 if (!bytes)
1464 break;
1465 ntfs_set_next_iovec(iov, iov_ofs, len);
1466 ofs = 0;
1467 } while (++pages < last_page);
1468out:
1469 return total;
1470err_out:
1471 total += copied;
1472 /* Zero the rest of the target like __copy_from_user(). */
1473 while (++pages < last_page) {
1474 bytes -= len;
1475 if (!bytes)
1476 break;
1477 len = PAGE_CACHE_SIZE;
1478 if (len > bytes)
1479 len = bytes;
1480 kaddr = kmap_atomic(*pages, KM_USER0);
1481 memset(kaddr, 0, len);
1482 kunmap_atomic(kaddr, KM_USER0);
1483 }
1484 goto out;
1485}
1486
1487static inline void ntfs_flush_dcache_pages(struct page **pages,
1488 unsigned nr_pages)
1489{
1490 BUG_ON(!nr_pages);
1491	/*
1492	 * Warning: Decrement separately (flush_dcache_page() is a NULL macro
1493	 * on i386) and before indexing (pages[nr_pages] is out of bounds).
1494	 */
1495	do {
1496		--nr_pages;
1497		flush_dcache_page(pages[nr_pages]);
1498	} while (nr_pages > 0);
1499}
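
The warning in the comment is easy to reproduce in user space: a no-op macro
discards its argument text at preprocessing time, so any side effect written
inside the call simply vanishes. A contrived demonstration (the macro matches
the i386 no-op definition of the era; everything else is invented):

	#include <stdio.h>

	#define flush_dcache_page(page) do { } while (0)	/* i386-style no-op */

	int main(void)
	{
		char pages[3];
		unsigned nr_pages = 3;

		(void)pages;	/* only named inside the discarded macro argument */
		/*
		 * flush_dcache_page(pages[--nr_pages]) would discard the
		 * decrement along with the unused argument and loop forever,
		 * hence the decrement in a statement of its own:
		 */
		do {
			--nr_pages;
			flush_dcache_page(pages[nr_pages]);
		} while (nr_pages > 0);
		printf("terminated with nr_pages == %u\n", nr_pages);
		return 0;
	}
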
1500
1501/**
1502 * ntfs_commit_pages_after_non_resident_write - commit the received data
1503 * @pages: array of destination pages
1504 * @nr_pages: number of pages in @pages
1505 * @pos: byte position in file at which the write begins
1506 * @bytes: number of bytes to be written
1507 *
1508 * See description of ntfs_commit_pages_after_write(), below.
1509 */
1510static inline int ntfs_commit_pages_after_non_resident_write(
1511 struct page **pages, const unsigned nr_pages,
1512 s64 pos, size_t bytes)
1513{
1514 s64 end, initialized_size;
1515 struct inode *vi;
1516 ntfs_inode *ni, *base_ni;
1517 struct buffer_head *bh, *head;
1518 ntfs_attr_search_ctx *ctx;
1519 MFT_RECORD *m;
1520 ATTR_RECORD *a;
1521 unsigned long flags;
1522 unsigned blocksize, u;
1523 int err;
1524
1525 vi = pages[0]->mapping->host;
1526 ni = NTFS_I(vi);
1527 blocksize = 1 << vi->i_blkbits;
1528 end = pos + bytes;
1529 u = 0;
1530 do {
1531 s64 bh_pos;
1532 struct page *page;
1533 BOOL partial;
1534
1535 page = pages[u];
1536 bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
1537 bh = head = page_buffers(page);
1538 partial = FALSE;
1539 do {
1540 s64 bh_end;
1541
1542 bh_end = bh_pos + blocksize;
1543 if (bh_end <= pos || bh_pos >= end) {
1544 if (!buffer_uptodate(bh))
1545 partial = TRUE;
1546 } else {
1547 set_buffer_uptodate(bh);
1548 mark_buffer_dirty(bh);
1549 }
1550 } while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
1551 /*
1552 * If all buffers are now uptodate but the page is not, set the
1553 * page uptodate.
1554 */
1555 if (!partial && !PageUptodate(page))
1556 SetPageUptodate(page);
1557 } while (++u < nr_pages);
1558 /*
1559 * Finally, if we do not need to update initialized_size or i_size we
1560 * are finished.
1561 */
1562 read_lock_irqsave(&ni->size_lock, flags);
1563 initialized_size = ni->initialized_size;
1564 read_unlock_irqrestore(&ni->size_lock, flags);
1565 if (end <= initialized_size) {
1566 ntfs_debug("Done.");
1567 return 0;
1568 }
1569 /*
1570 * Update initialized_size/i_size as appropriate, both in the inode and
1571 * the mft record.
1572 */
1573 if (!NInoAttr(ni))
1574 base_ni = ni;
1575 else
1576 base_ni = ni->ext.base_ntfs_ino;
1577 /* Map, pin, and lock the mft record. */
1578 m = map_mft_record(base_ni);
1579 if (IS_ERR(m)) {
1580 err = PTR_ERR(m);
1581 m = NULL;
1582 ctx = NULL;
1583 goto err_out;
1584 }
1585 BUG_ON(!NInoNonResident(ni));
1586 ctx = ntfs_attr_get_search_ctx(base_ni, m);
1587 if (unlikely(!ctx)) {
1588 err = -ENOMEM;
1589 goto err_out;
1590 }
1591 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1592 CASE_SENSITIVE, 0, NULL, 0, ctx);
1593 if (unlikely(err)) {
1594 if (err == -ENOENT)
1595 err = -EIO;
1596 goto err_out;
1597 }
1598 a = ctx->attr;
1599 BUG_ON(!a->non_resident);
1600 write_lock_irqsave(&ni->size_lock, flags);
1601 BUG_ON(end > ni->allocated_size);
1602 ni->initialized_size = end;
1603 a->data.non_resident.initialized_size = cpu_to_sle64(end);
1604 if (end > i_size_read(vi)) {
1605 i_size_write(vi, end);
1606 a->data.non_resident.data_size =
1607 a->data.non_resident.initialized_size;
1608 }
1609 write_unlock_irqrestore(&ni->size_lock, flags);
1610 /* Mark the mft record dirty, so it gets written back. */
1611 flush_dcache_mft_record_page(ctx->ntfs_ino);
1612 mark_mft_record_dirty(ctx->ntfs_ino);
1613 ntfs_attr_put_search_ctx(ctx);
1614 unmap_mft_record(base_ni);
1615 ntfs_debug("Done.");
1616 return 0;
1617err_out:
1618 if (ctx)
1619 ntfs_attr_put_search_ctx(ctx);
1620 if (m)
1621 unmap_mft_record(base_ni);
1622 ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error "
1623 "code %i).", err);
1624 if (err != -ENOMEM) {
1625 NVolSetErrors(ni->vol);
1626 make_bad_inode(VFS_I(base_ni));
1627 make_bad_inode(vi);
1628 }
1629 return err;
1630}
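
Everything in the commit loop above hangs off one classification: a buffer is
untouched by the write iff it ends at or before @pos or starts at or after
@end. The user-space check below walks the blocks of one page with invented
offsets (512-byte blocks, a 4096-byte page) and prints which side of the
boundary each block falls on:

	#include <stdio.h>

	int main(void)
	{
		const long long pos = 700, end = 1800, blocksize = 512;
		long long bh_pos;

		for (bh_pos = 0; bh_pos < 4096; bh_pos += blocksize) {
			long long bh_end = bh_pos + blocksize;
			int outside = bh_end <= pos || bh_pos >= end;

			printf("block [%4lld, %4lld): %s\n", bh_pos, bh_end,
					outside ? "untouched" : "written");
		}
		return 0;
	}
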
1631
1632/**
1633 * ntfs_commit_pages_after_write - commit the received data
1634 * @pages: array of destination pages
1635 * @nr_pages: number of pages in @pages
1636 * @pos: byte position in file at which the write begins
1637 * @bytes: number of bytes to be written
1638 *
1639 * This is called from ntfs_file_buffered_write() with i_sem held on the inode
1640 * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are
1641 * locked but not kmap()ped. The source data has already been copied into the
1642 * @page. ntfs_prepare_pages_for_non_resident_write() has been called before
1643 * @pages. ntfs_prepare_pages_for_non_resident_write() has been called before
1644 * success.
1645 *
1646 * Need to set uptodate and mark dirty all buffers within the boundary of the
1647 * write. If all buffers in a page are uptodate we set the page uptodate, too.
1648 *
1649 * Setting the buffers dirty ensures that they get written out later when
1650 * ntfs_writepage() is invoked by the VM.
1651 *
1652 * Finally, we need to update i_size and initialized_size as appropriate both
1653 * in the inode and the mft record.
1654 *
1655 * This is modelled after fs/buffer.c::generic_commit_write(), which marks
1656 * buffers uptodate and dirty, sets the page uptodate if all buffers in the
1657 * page are uptodate, and updates i_size if the end of io is beyond i_size. In
1658 * that case, it also marks the inode dirty.
1659 *
1660 * If things have gone as outlined in
1661 * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page
1662 * content modifications here for non-resident attributes. For resident
1663 * attributes we need to do the uptodate bringing here which we combine with
1664 * the copying into the mft record which means we save one atomic kmap.
1665 *
1666 * Return 0 on success or -errno on error.
1667 */
1668static int ntfs_commit_pages_after_write(struct page **pages,
1669 const unsigned nr_pages, s64 pos, size_t bytes)
1670{
1671 s64 end, initialized_size;
1672 loff_t i_size;
1673 struct inode *vi;
1674 ntfs_inode *ni, *base_ni;
1675 struct page *page;
1676 ntfs_attr_search_ctx *ctx;
1677 MFT_RECORD *m;
1678 ATTR_RECORD *a;
1679 char *kattr, *kaddr;
1680 unsigned long flags;
1681 u32 attr_len;
1682 int err;
1683
1684 BUG_ON(!nr_pages);
1685 BUG_ON(!pages);
1686 page = pages[0];
1687 BUG_ON(!page);
1688 vi = page->mapping->host;
1689 ni = NTFS_I(vi);
1690 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
1691 "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
1692 vi->i_ino, ni->type, page->index, nr_pages,
1693 (long long)pos, bytes);
1694 if (NInoNonResident(ni))
1695 return ntfs_commit_pages_after_non_resident_write(pages,
1696 nr_pages, pos, bytes);
1697 BUG_ON(nr_pages > 1);
1698 /*
1699 * Attribute is resident, implying it is not compressed, encrypted, or
1700 * sparse.
1701 */
1702 if (!NInoAttr(ni))
1703 base_ni = ni;
1704 else
1705 base_ni = ni->ext.base_ntfs_ino;
1706 BUG_ON(NInoNonResident(ni));
1707 /* Map, pin, and lock the mft record. */
1708 m = map_mft_record(base_ni);
1709 if (IS_ERR(m)) {
1710 err = PTR_ERR(m);
1711 m = NULL;
1712 ctx = NULL;
1713 goto err_out;
1714 }
1715 ctx = ntfs_attr_get_search_ctx(base_ni, m);
1716 if (unlikely(!ctx)) {
1717 err = -ENOMEM;
1718 goto err_out;
1719 }
1720 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1721 CASE_SENSITIVE, 0, NULL, 0, ctx);
1722 if (unlikely(err)) {
1723 if (err == -ENOENT)
1724 err = -EIO;
1725 goto err_out;
1726 }
1727 a = ctx->attr;
1728 BUG_ON(a->non_resident);
1729 /* The total length of the attribute value. */
1730 attr_len = le32_to_cpu(a->data.resident.value_length);
1731 i_size = i_size_read(vi);
1732 BUG_ON(attr_len != i_size);
1733 BUG_ON(pos > attr_len);
1734 end = pos + bytes;
1735 BUG_ON(end > le32_to_cpu(a->length) -
1736 le16_to_cpu(a->data.resident.value_offset));
1737 kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
1738 kaddr = kmap_atomic(page, KM_USER0);
1739 /* Copy the received data from the page to the mft record. */
1740 memcpy(kattr + pos, kaddr + pos, bytes);
1741 /* Update the attribute length if necessary. */
1742 if (end > attr_len) {
1743 attr_len = end;
1744 a->data.resident.value_length = cpu_to_le32(attr_len);
1745 }
1746 /*
1747 * If the page is not uptodate, bring the out of bounds area(s)
1748 * uptodate by copying data from the mft record to the page.
1749 */
1750 if (!PageUptodate(page)) {
1751 if (pos > 0)
1752 memcpy(kaddr, kattr, pos);
1753 if (end < attr_len)
1754 memcpy(kaddr + end, kattr + end, attr_len - end);
1755 /* Zero the region outside the end of the attribute value. */
1756 memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
1757 flush_dcache_page(page);
1758 SetPageUptodate(page);
1759 }
1760 kunmap_atomic(kaddr, KM_USER0);
1761 /* Update initialized_size/i_size if necessary. */
1762 read_lock_irqsave(&ni->size_lock, flags);
1763 initialized_size = ni->initialized_size;
1764 BUG_ON(end > ni->allocated_size);
1765 read_unlock_irqrestore(&ni->size_lock, flags);
1766 BUG_ON(initialized_size != i_size);
1767 if (end > initialized_size) {
1768 unsigned long flags;
1769
1770 write_lock_irqsave(&ni->size_lock, flags);
1771 ni->initialized_size = end;
1772 i_size_write(vi, end);
1773 write_unlock_irqrestore(&ni->size_lock, flags);
1774 }
1775 /* Mark the mft record dirty, so it gets written back. */
1776 flush_dcache_mft_record_page(ctx->ntfs_ino);
1777 mark_mft_record_dirty(ctx->ntfs_ino);
1778 ntfs_attr_put_search_ctx(ctx);
1779 unmap_mft_record(base_ni);
1780 ntfs_debug("Done.");
1781 return 0;
1782err_out:
1783 if (err == -ENOMEM) {
1784 ntfs_warning(vi->i_sb, "Error allocating memory required to "
1785 "commit the write.");
1786 if (PageUptodate(page)) {
1787 ntfs_warning(vi->i_sb, "Page is uptodate, setting "
1788 "dirty so the write will be retried "
1789 "later on by the VM.");
1790 /*
1791 * Put the page on mapping->dirty_pages, but leave its
1792 * buffers' dirty state as-is.
1793 */
1794 __set_page_dirty_nobuffers(page);
1795 err = 0;
1796 } else
1797 ntfs_error(vi->i_sb, "Page is not uptodate. Written "
1798 "data has been lost.");
1799 } else {
1800 ntfs_error(vi->i_sb, "Resident attribute commit write failed "
1801 "with error %i.", err);
1802 NVolSetErrors(ni->vol);
1803 make_bad_inode(VFS_I(base_ni));
1804 make_bad_inode(vi);
1805 }
1806 if (ctx)
1807 ntfs_attr_put_search_ctx(ctx);
1808 if (m)
1809 unmap_mft_record(base_ni);
1810 return err;
1811}
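
For resident attributes the bytes flow in two directions: the written region
is copied page -> mft record, while a not-uptodate page has the regions
outside the write filled mft record -> page. A user-space model of those
copies, with plain buffers standing in for the page and the attribute value
(all sizes and contents are invented):

	#include <stdio.h>
	#include <string.h>

	#define PAGE_SIZE 64

	int main(void)
	{
		char kattr[PAGE_SIZE] = "OLD-ATTRIBUTE-VALUE";
		char kaddr[PAGE_SIZE] = { 0 };
		size_t attr_len = strlen(kattr), pos = 4, bytes = 3, end = pos + bytes;

		memcpy(kaddr + pos, "new", bytes);	/* data the user wrote */
		/* Copy the received data from the page to the mft record. */
		memcpy(kattr + pos, kaddr + pos, bytes);
		/* Page not uptodate: fill the edges of the page from kattr. */
		memcpy(kaddr, kattr, pos);
		memcpy(kaddr + end, kattr + end, attr_len - end);
		memset(kaddr + attr_len, 0, PAGE_SIZE - attr_len);
		printf("attr: %.*s\npage: %.*s\n", (int)attr_len, kattr,
				(int)attr_len, kaddr);
		return 0;
	}
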
1812
1813/**
1814 * ntfs_file_buffered_write -
1815 *
1816 * Locking: The vfs is holding ->i_sem on the inode.
1817 */
1818static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1819 const struct iovec *iov, unsigned long nr_segs,
1820 loff_t pos, loff_t *ppos, size_t count)
1821{
1822 struct file *file = iocb->ki_filp;
1823 struct address_space *mapping = file->f_mapping;
1824 struct inode *vi = mapping->host;
1825 ntfs_inode *ni = NTFS_I(vi);
1826 ntfs_volume *vol = ni->vol;
1827 struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
1828 struct page *cached_page = NULL;
1829 char __user *buf = NULL;
1830 s64 end, ll;
1831 VCN last_vcn;
1832 LCN lcn;
1833 unsigned long flags;
1834 size_t bytes, iov_ofs = 0; /* Offset in the current iovec. */
1835 ssize_t status, written;
1836 unsigned nr_pages;
1837 int err;
1838 struct pagevec lru_pvec;
1839
1840 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
1841 "pos 0x%llx, count 0x%lx.",
1842 vi->i_ino, (unsigned)le32_to_cpu(ni->type),
1843 (unsigned long long)pos, (unsigned long)count);
1844 if (unlikely(!count))
1845 return 0;
1846 BUG_ON(NInoMstProtected(ni));
1847 /*
1848 * If the attribute is not an index root and it is encrypted or
1849 * compressed, we cannot write to it yet. Note we need to check for
1850 * AT_INDEX_ALLOCATION since this is the type of both directory and
1851 * index inodes.
1852 */
1853 if (ni->type != AT_INDEX_ALLOCATION) {
1854 /* If file is encrypted, deny access, just like NT4. */
1855 if (NInoEncrypted(ni)) {
1856 /*
1857 * Reminder for later: Encrypted files are _always_
1858 * non-resident so that the content can always be
1859 * encrypted.
1860 */
1861 ntfs_debug("Denying write access to encrypted file.");
1862 return -EACCES;
1863 }
1864 if (NInoCompressed(ni)) {
1865 /* Only unnamed $DATA attribute can be compressed. */
1866 BUG_ON(ni->type != AT_DATA);
1867 BUG_ON(ni->name_len);
1868 /*
1869 * Reminder for later: If resident, the data is not
1870 * actually compressed. Only on the switch to non-
1871 * resident does compression kick in. This is in
1872 * contrast to encrypted files (see above).
1873 */
1874 ntfs_error(vi->i_sb, "Writing to compressed files is "
1875 "not implemented yet. Sorry.");
1876 return -EOPNOTSUPP;
1877 }
1878 }
1879 /*
1880 * If a previous ntfs_truncate() failed, repeat it and abort if it
1881 * fails again.
1882 */
1883 if (unlikely(NInoTruncateFailed(ni))) {
1884 down_write(&vi->i_alloc_sem);
1885 err = ntfs_truncate(vi);
1886 up_write(&vi->i_alloc_sem);
1887 if (err || NInoTruncateFailed(ni)) {
1888 if (!err)
1889 err = -EIO;
1890 ntfs_error(vol->sb, "Cannot perform write to inode "
1891 "0x%lx, attribute type 0x%x, because "
1892 "ntfs_truncate() failed (error code "
1893 "%i).", vi->i_ino,
1894 (unsigned)le32_to_cpu(ni->type), err);
1895 return err;
1896 }
1897 }
1898 /* The first byte after the write. */
1899 end = pos + count;
1900 /*
1901 * If the write goes beyond the allocated size, extend the allocation
1902 * to cover the whole of the write, rounded up to the nearest cluster.
1903 */
1904 read_lock_irqsave(&ni->size_lock, flags);
1905 ll = ni->allocated_size;
1906 read_unlock_irqrestore(&ni->size_lock, flags);
1907 if (end > ll) {
1908 /* Extend the allocation without changing the data size. */
1909 ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
1910 if (likely(ll >= 0)) {
1911 BUG_ON(pos >= ll);
1912 /* If the extension was partial truncate the write. */
1913 if (end > ll) {
1914 ntfs_debug("Truncating write to inode 0x%lx, "
1915 "attribute type 0x%x, because "
1916 "the allocation was only "
1917 "partially extended.",
1918 vi->i_ino, (unsigned)
1919 le32_to_cpu(ni->type));
1920 end = ll;
1921 count = ll - pos;
1922 }
1923 } else {
1924 err = ll;
1925 read_lock_irqsave(&ni->size_lock, flags);
1926 ll = ni->allocated_size;
1927 read_unlock_irqrestore(&ni->size_lock, flags);
1928 /* Perform a partial write if possible or fail. */
1929 if (pos < ll) {
1930 ntfs_debug("Truncating write to inode 0x%lx, "
1931 "attribute type 0x%x, because "
1932 "extending the allocation "
1933 "failed (error code %i).",
1934 vi->i_ino, (unsigned)
1935 le32_to_cpu(ni->type), err);
1936 end = ll;
1937 count = ll - pos;
1938 } else {
1939 ntfs_error(vol->sb, "Cannot perform write to "
1940 "inode 0x%lx, attribute type "
1941 "0x%x, because extending the "
1942 "allocation failed (error "
1943 "code %i).", vi->i_ino,
1944 (unsigned)
1945 le32_to_cpu(ni->type), err);
1946 return err;
1947 }
1948 }
1949 }
1950 pagevec_init(&lru_pvec, 0);
1951 written = 0;
1952 /*
1953 * If the write starts beyond the initialized size, extend it up to the
1954 * beginning of the write and initialize all non-sparse space between
1955 * the old initialized size and the new one. This automatically also
1956 * increments the vfs inode->i_size to keep it above or equal to the
1957 * initialized_size.
1958 */
1959 read_lock_irqsave(&ni->size_lock, flags);
1960 ll = ni->initialized_size;
1961 read_unlock_irqrestore(&ni->size_lock, flags);
1962 if (pos > ll) {
1963 err = ntfs_attr_extend_initialized(ni, pos, &cached_page,
1964 &lru_pvec);
1965 if (err < 0) {
1966 ntfs_error(vol->sb, "Cannot perform write to inode "
1967 "0x%lx, attribute type 0x%x, because "
1968 "extending the initialized size "
1969 "failed (error code %i).", vi->i_ino,
1970 (unsigned)le32_to_cpu(ni->type), err);
1971 status = err;
1972 goto err_out;
1973 }
1974 }
1975 /*
1976 * Determine the number of pages per cluster for non-resident
1977 * attributes.
1978 */
1979 nr_pages = 1;
1980 if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni))
1981 nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT;
1982 /* Finally, perform the actual write. */
1983 last_vcn = -1;
1984 if (likely(nr_segs == 1))
1985 buf = iov->iov_base;
1986 do {
1987 VCN vcn;
1988 pgoff_t idx, start_idx;
1989 unsigned ofs, do_pages, u;
1990 size_t copied;
1991
1992 start_idx = idx = pos >> PAGE_CACHE_SHIFT;
1993 ofs = pos & ~PAGE_CACHE_MASK;
1994 bytes = PAGE_CACHE_SIZE - ofs;
1995 do_pages = 1;
1996 if (nr_pages > 1) {
1997 vcn = pos >> vol->cluster_size_bits;
1998 if (vcn != last_vcn) {
1999 last_vcn = vcn;
2000 /*
2001 * Get the lcn of the vcn the write is in. If
2002 * it is a hole, need to lock down all pages in
2003 * the cluster.
2004 */
2005 down_read(&ni->runlist.lock);
2006 lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >>
2007 vol->cluster_size_bits, FALSE);
2008 up_read(&ni->runlist.lock);
2009 if (unlikely(lcn < LCN_HOLE)) {
2010 status = -EIO;
2011 if (lcn == LCN_ENOMEM)
2012 status = -ENOMEM;
2013 else
2014 ntfs_error(vol->sb, "Cannot "
2015 "perform write to "
2016 "inode 0x%lx, "
2017 "attribute type 0x%x, "
2018 "because the attribute "
2019 "is corrupt.",
2020 vi->i_ino, (unsigned)
2021 le32_to_cpu(ni->type));
2022 break;
2023 }
2024 if (lcn == LCN_HOLE) {
2025 start_idx = (pos & ~(s64)
2026 vol->cluster_size_mask)
2027 >> PAGE_CACHE_SHIFT;
2028 bytes = vol->cluster_size - (pos &
2029 vol->cluster_size_mask);
2030 do_pages = nr_pages;
2031 }
2032 }
2033 }
2034 if (bytes > count)
2035 bytes = count;
2036 /*
2037 * Bring in the user page(s) that we will copy from _first_.
2038 * Otherwise there is a nasty deadlock on copying from the same
2039 * page(s) as we are writing to, without it/them being marked
2040 * up-to-date. Note, at present there is nothing to stop the
2041 * pages being swapped out between us bringing them into memory
2042 * and doing the actual copying.
2043 */
2044 if (likely(nr_segs == 1))
2045 ntfs_fault_in_pages_readable(buf, bytes);
2046 else
2047 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
2048 /* Get and lock @do_pages starting at index @start_idx. */
2049 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
2050 pages, &cached_page, &lru_pvec);
2051 if (unlikely(status))
2052 break;
2053 /*
2054 * For non-resident attributes, we need to fill any holes with
2055		 * actual clusters and ensure all buffers are mapped. We also
2056 * need to bring uptodate any buffers that are only partially
2057 * being written to.
2058 */
2059 if (NInoNonResident(ni)) {
2060 status = ntfs_prepare_pages_for_non_resident_write(
2061 pages, do_pages, pos, bytes);
2062 if (unlikely(status)) {
2063 loff_t i_size;
2064
2065 do {
2066 unlock_page(pages[--do_pages]);
2067 page_cache_release(pages[do_pages]);
2068 } while (do_pages);
2069 /*
2070 * The write preparation may have instantiated
2071 * allocated space outside i_size. Trim this
2072 * off again. We can ignore any errors in this
2073			 * case as we will just be wasting a bit of
2074 * allocated space, which is not a disaster.
2075 */
2076 i_size = i_size_read(vi);
2077 if (pos + bytes > i_size)
2078 vmtruncate(vi, i_size);
2079 break;
2080 }
2081 }
2082 u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index;
2083 if (likely(nr_segs == 1)) {
2084 copied = ntfs_copy_from_user(pages + u, do_pages - u,
2085 ofs, buf, bytes);
2086 buf += copied;
2087 } else
2088 copied = ntfs_copy_from_user_iovec(pages + u,
2089 do_pages - u, ofs, &iov, &iov_ofs,
2090 bytes);
2091 ntfs_flush_dcache_pages(pages + u, do_pages - u);
2092 status = ntfs_commit_pages_after_write(pages, do_pages, pos,
2093 bytes);
2094 if (likely(!status)) {
2095 written += copied;
2096 count -= copied;
2097 pos += copied;
2098 if (unlikely(copied != bytes))
2099 status = -EFAULT;
2100 }
2101 do {
2102 unlock_page(pages[--do_pages]);
2103 mark_page_accessed(pages[do_pages]);
2104 page_cache_release(pages[do_pages]);
2105 } while (do_pages);
2106 if (unlikely(status))
2107 break;
2108 balance_dirty_pages_ratelimited(mapping);
2109 cond_resched();
2110 } while (count);
2111err_out:
2112 *ppos = pos;
2113 if (cached_page)
2114 page_cache_release(cached_page);
2115 /* For now, when the user asks for O_SYNC, we actually give O_DSYNC. */
2116 if (likely(!status)) {
2117 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(vi))) {
2118 if (!mapping->a_ops->writepage || !is_sync_kiocb(iocb))
2119 status = generic_osync_inode(vi, mapping,
2120 OSYNC_METADATA|OSYNC_DATA);
2121 }
2122 }
2123 pagevec_lru_add(&lru_pvec);
2124 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
2125 written ? "written" : "status", (unsigned long)written,
2126 (long)status);
2127 return written ? written : status;
2128}
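
Stripped of the hole handling and the cluster logic, the main loop above is
the usual buffered-write carve-up: each iteration covers one page, from the
in-page offset to the page end or the end of the request, whichever comes
first. A user-space sketch of just that carve-up (the page shift and the
request are example values):

	#include <stdio.h>

	#define PAGE_SHIFT	12
	#define PAGE_SIZE	(1UL << PAGE_SHIFT)
	#define PAGE_MASK	(~(PAGE_SIZE - 1))

	int main(void)
	{
		unsigned long long pos = 4090;
		unsigned long count = 9000;

		while (count) {
			unsigned long idx = pos >> PAGE_SHIFT;
			unsigned ofs = pos & ~PAGE_MASK;
			unsigned long bytes = PAGE_SIZE - ofs;

			if (bytes > count)
				bytes = count;
			printf("page %lu, offset %u, bytes %lu\n", idx, ofs,
					bytes);
			pos += bytes;
			count -= bytes;
		}
		return 0;
	}
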
2129
2130/**
2131 * ntfs_file_aio_write_nolock -
2132 */
2133static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
2134 const struct iovec *iov, unsigned long nr_segs, loff_t *ppos)
2135{
2136 struct file *file = iocb->ki_filp;
2137 struct address_space *mapping = file->f_mapping;
2138 struct inode *inode = mapping->host;
2139 loff_t pos;
2140 unsigned long seg;
2141 size_t count; /* after file limit checks */
2142 ssize_t written, err;
2143
2144 count = 0;
2145 for (seg = 0; seg < nr_segs; seg++) {
2146 const struct iovec *iv = &iov[seg];
2147 /*
2148 * If any segment has a negative length, or the cumulative
2149 * length ever wraps negative then return -EINVAL.
2150 */
2151 count += iv->iov_len;
2152 if (unlikely((ssize_t)(count|iv->iov_len) < 0))
2153 return -EINVAL;
2154 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
2155 continue;
2156 if (!seg)
2157 return -EFAULT;
2158 nr_segs = seg;
2159 count -= iv->iov_len; /* This segment is no good */
2160 break;
2161 }
2162 pos = *ppos;
2163 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2164 /* We can write back this queue in page reclaim. */
2165 current->backing_dev_info = mapping->backing_dev_info;
2166 written = 0;
2167 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
2168 if (err)
2169 goto out;
2170 if (!count)
2171 goto out;
2172 err = remove_suid(file->f_dentry);
2173 if (err)
2174 goto out;
2175 inode_update_time(inode, 1);
2176 written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos,
2177 count);
2178out:
2179 current->backing_dev_info = NULL;
2180 return written ? written : err;
2181}
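
The (ssize_t)(count|iv->iov_len) < 0 test relies on the total being re-checked
after every addition: no accepted segment may have the sign bit set, so the
running total is rejected the moment it crosses the halfway point of size_t,
before it could silently wrap. A small demonstration with contrived values:

	#include <stddef.h>
	#include <stdio.h>
	#include <sys/types.h>

	int main(void)
	{
		/* Largest total that passes the check, minus a little. */
		size_t count = ((size_t)1 << (8 * sizeof(size_t) - 1)) - 50;
		size_t len = 100;	/* next segment's length */

		count += len;		/* total now has the sign bit set */
		if ((ssize_t)(count | len) < 0)
			printf("rejected before the total could wrap\n");
		return 0;
	}
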
2182
2183/**
2184 * ntfs_file_aio_write -
2185 */
2186static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const char __user *buf,
2187 size_t count, loff_t pos)
2188{
2189 struct file *file = iocb->ki_filp;
2190 struct address_space *mapping = file->f_mapping;
2191 struct inode *inode = mapping->host;
2192 ssize_t ret;
2193 struct iovec local_iov = { .iov_base = (void __user *)buf,
2194 .iov_len = count };
2195
2196 BUG_ON(iocb->ki_pos != pos);
2197
2198 down(&inode->i_sem);
2199 ret = ntfs_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
2200 up(&inode->i_sem);
2201 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2202 int err = sync_page_range(inode, mapping, pos, ret);
2203 if (err < 0)
2204 ret = err;
2205 }
2206 return ret;
2207}
2208
2209/**
2210 * ntfs_file_writev -
2211 *
2212 * Basically the same as generic_file_writev() except that it ends up calling
2213 * ntfs_file_aio_write_nolock() instead of __generic_file_aio_write_nolock().
2214 */
2215static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov,
2216 unsigned long nr_segs, loff_t *ppos)
2217{
2218 struct address_space *mapping = file->f_mapping;
2219 struct inode *inode = mapping->host;
2220 struct kiocb kiocb;
2221 ssize_t ret;
2222
2223 down(&inode->i_sem);
2224 init_sync_kiocb(&kiocb, file);
2225 ret = ntfs_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
2226 if (ret == -EIOCBQUEUED)
2227 ret = wait_on_sync_kiocb(&kiocb);
2228 up(&inode->i_sem);
2229 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2230 int err = sync_page_range(inode, mapping, *ppos - ret, ret);
2231 if (err < 0)
2232 ret = err;
2233 }
2234 return ret;
2235}
2236
2237/**
2238 * ntfs_file_write - simple wrapper for ntfs_file_writev()
2239 */
2240static ssize_t ntfs_file_write(struct file *file, const char __user *buf,
2241 size_t count, loff_t *ppos)
2242{
2243 struct iovec local_iov = { .iov_base = (void __user *)buf,
2244 .iov_len = count };
2245
2246 return ntfs_file_writev(file, &local_iov, 1, ppos);
2247}
2248
2249/**
59 * ntfs_file_fsync - sync a file to disk 2250 * ntfs_file_fsync - sync a file to disk
60 * @filp: file to be synced 2251 * @filp: file to be synced
61 * @dentry: dentry describing the file to sync 2252 * @dentry: dentry describing the file to sync
@@ -113,39 +2304,39 @@ static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
113#endif /* NTFS_RW */ 2304#endif /* NTFS_RW */
114 2305
115struct file_operations ntfs_file_ops = { 2306struct file_operations ntfs_file_ops = {
116 .llseek = generic_file_llseek, /* Seek inside file. */ 2307 .llseek = generic_file_llseek, /* Seek inside file. */
117 .read = generic_file_read, /* Read from file. */ 2308 .read = generic_file_read, /* Read from file. */
118 .aio_read = generic_file_aio_read, /* Async read from file. */ 2309 .aio_read = generic_file_aio_read, /* Async read from file. */
119 .readv = generic_file_readv, /* Read from file. */ 2310 .readv = generic_file_readv, /* Read from file. */
120#ifdef NTFS_RW 2311#ifdef NTFS_RW
121 .write = generic_file_write, /* Write to file. */ 2312 .write = ntfs_file_write, /* Write to file. */
122 .aio_write = generic_file_aio_write, /* Async write to file. */ 2313 .aio_write = ntfs_file_aio_write, /* Async write to file. */
123 .writev = generic_file_writev, /* Write to file. */ 2314 .writev = ntfs_file_writev, /* Write to file. */
124 /*.release = ,*/ /* Last file is closed. See 2315 /*.release = ,*/ /* Last file is closed. See
125 fs/ext2/file.c:: 2316 fs/ext2/file.c::
126 ext2_release_file() for 2317 ext2_release_file() for
127 how to use this to discard 2318 how to use this to discard
128 preallocated space for 2319 preallocated space for
129 write opened files. */ 2320 write opened files. */
130 .fsync = ntfs_file_fsync, /* Sync a file to disk. */ 2321 .fsync = ntfs_file_fsync, /* Sync a file to disk. */
131 /*.aio_fsync = ,*/ /* Sync all outstanding async 2322 /*.aio_fsync = ,*/ /* Sync all outstanding async
132 i/o operations on a 2323 i/o operations on a
133 kiocb. */ 2324 kiocb. */
134#endif /* NTFS_RW */ 2325#endif /* NTFS_RW */
135 /*.ioctl = ,*/ /* Perform function on the 2326 /*.ioctl = ,*/ /* Perform function on the
136 mounted filesystem. */ 2327 mounted filesystem. */
137 .mmap = generic_file_mmap, /* Mmap file. */ 2328 .mmap = generic_file_mmap, /* Mmap file. */
138 .open = ntfs_file_open, /* Open file. */ 2329 .open = ntfs_file_open, /* Open file. */
139 .sendfile = generic_file_sendfile, /* Zero-copy data send with 2330 .sendfile = generic_file_sendfile, /* Zero-copy data send with
140 the data source being on 2331 the data source being on
141 the ntfs partition. We 2332 the ntfs partition. We do
142 do not need to care about 2333 not need to care about the
143 the data destination. */ 2334 data destination. */
144 /*.sendpage = ,*/ /* Zero-copy data send with 2335 /*.sendpage = ,*/ /* Zero-copy data send with
145 the data destination being 2336 the data destination being
146 on the ntfs partition. We 2337 on the ntfs partition. We
147 do not need to care about 2338 do not need to care about
148 the data source. */ 2339 the data source. */
149}; 2340};
150 2341
151struct inode_operations ntfs_file_inode_ops = { 2342struct inode_operations ntfs_file_inode_ops = {
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 7ec045131808..b24f4c4b2c5c 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -30,6 +30,7 @@
30#include "debug.h" 30#include "debug.h"
31#include "inode.h" 31#include "inode.h"
32#include "attrib.h" 32#include "attrib.h"
33#include "lcnalloc.h"
33#include "malloc.h" 34#include "malloc.h"
34#include "mft.h" 35#include "mft.h"
35#include "time.h" 36#include "time.h"
@@ -2291,11 +2292,16 @@ int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt)
2291 2292
2292#ifdef NTFS_RW 2293#ifdef NTFS_RW
2293 2294
2295static const char *es = " Leaving inconsistent metadata. Unmount and run "
2296 "chkdsk.";
2297
2294/** 2298/**
2295 * ntfs_truncate - called when the i_size of an ntfs inode is changed 2299 * ntfs_truncate - called when the i_size of an ntfs inode is changed
2296 * @vi: inode for which the i_size was changed 2300 * @vi: inode for which the i_size was changed
2297 * 2301 *
2298 * We do not support i_size changes yet. 2302 * We only support i_size changes for normal files at present, i.e. not
2303 * compressed and not encrypted. This is enforced in ntfs_setattr(), see
2304 * below.
2299 * 2305 *
2300 * The kernel guarantees that @vi is a regular file (S_ISREG() is true) and 2306 * The kernel guarantees that @vi is a regular file (S_ISREG() is true) and
2301 * that the change is allowed. 2307 * that the change is allowed.
@@ -2306,80 +2312,499 @@ int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt)
2306 * Returns 0 on success or -errno on error. 2312 * Returns 0 on success or -errno on error.
2307 * 2313 *
2308 * Called with ->i_sem held. In all but one case ->i_alloc_sem is held for 2314 * Called with ->i_sem held. In all but one case ->i_alloc_sem is held for
2309 * writing. The only case where ->i_alloc_sem is not held is 2315 * writing. The only case in the kernel where ->i_alloc_sem is not held is
2310 * mm/filemap.c::generic_file_buffered_write() where vmtruncate() is called 2316 * mm/filemap.c::generic_file_buffered_write() where vmtruncate() is called
2311 * with the current i_size as the offset which means that it is a noop as far 2317 * with the current i_size as the offset. The analogous place in NTFS is in
2312 * as ntfs_truncate() is concerned. 2318 * fs/ntfs/file.c::ntfs_file_buffered_write() where we call vmtruncate() again
2319 * without holding ->i_alloc_sem.
2313 */ 2320 */
2314int ntfs_truncate(struct inode *vi) 2321int ntfs_truncate(struct inode *vi)
2315{ 2322{
2316 ntfs_inode *ni = NTFS_I(vi); 2323 s64 new_size, old_size, nr_freed, new_alloc_size, old_alloc_size;
2324 VCN highest_vcn;
2325 unsigned long flags;
2326 ntfs_inode *base_ni, *ni = NTFS_I(vi);
2317 ntfs_volume *vol = ni->vol; 2327 ntfs_volume *vol = ni->vol;
2318 ntfs_attr_search_ctx *ctx; 2328 ntfs_attr_search_ctx *ctx;
2319 MFT_RECORD *m; 2329 MFT_RECORD *m;
2320 ATTR_RECORD *a; 2330 ATTR_RECORD *a;
2321 const char *te = " Leaving file length out of sync with i_size."; 2331 const char *te = " Leaving file length out of sync with i_size.";
2322 int err; 2332 int err, mp_size, size_change, alloc_change;
2333 u32 attr_len;
2323 2334
2324 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); 2335 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
2325 BUG_ON(NInoAttr(ni)); 2336 BUG_ON(NInoAttr(ni));
2337 BUG_ON(S_ISDIR(vi->i_mode));
2338 BUG_ON(NInoMstProtected(ni));
2326 BUG_ON(ni->nr_extents < 0); 2339 BUG_ON(ni->nr_extents < 0);
2327 m = map_mft_record(ni); 2340retry_truncate:
2341 /*
2342 * Lock the runlist for writing and map the mft record to ensure it is
2343 * safe to mess with the attribute runlist and sizes.
2344 */
2345 down_write(&ni->runlist.lock);
2346 if (!NInoAttr(ni))
2347 base_ni = ni;
2348 else
2349 base_ni = ni->ext.base_ntfs_ino;
2350 m = map_mft_record(base_ni);
2328 if (IS_ERR(m)) { 2351 if (IS_ERR(m)) {
2329 err = PTR_ERR(m); 2352 err = PTR_ERR(m);
2330 ntfs_error(vi->i_sb, "Failed to map mft record for inode 0x%lx " 2353 ntfs_error(vi->i_sb, "Failed to map mft record for inode 0x%lx "
2331 "(error code %d).%s", vi->i_ino, err, te); 2354 "(error code %d).%s", vi->i_ino, err, te);
2332 ctx = NULL; 2355 ctx = NULL;
2333 m = NULL; 2356 m = NULL;
2334 goto err_out; 2357 goto old_bad_out;
2335 } 2358 }
2336 ctx = ntfs_attr_get_search_ctx(ni, m); 2359 ctx = ntfs_attr_get_search_ctx(base_ni, m);
2337 if (unlikely(!ctx)) { 2360 if (unlikely(!ctx)) {
2338 ntfs_error(vi->i_sb, "Failed to allocate a search context for " 2361 ntfs_error(vi->i_sb, "Failed to allocate a search context for "
2339 "inode 0x%lx (not enough memory).%s", 2362 "inode 0x%lx (not enough memory).%s",
2340 vi->i_ino, te); 2363 vi->i_ino, te);
2341 err = -ENOMEM; 2364 err = -ENOMEM;
2342 goto err_out; 2365 goto old_bad_out;
2343 } 2366 }
2344 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 2367 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
2345 CASE_SENSITIVE, 0, NULL, 0, ctx); 2368 CASE_SENSITIVE, 0, NULL, 0, ctx);
2346 if (unlikely(err)) { 2369 if (unlikely(err)) {
2347 if (err == -ENOENT) 2370 if (err == -ENOENT) {
2348 ntfs_error(vi->i_sb, "Open attribute is missing from " 2371 ntfs_error(vi->i_sb, "Open attribute is missing from "
2349 "mft record. Inode 0x%lx is corrupt. " 2372 "mft record. Inode 0x%lx is corrupt. "
2350 "Run chkdsk.", vi->i_ino); 2373 "Run chkdsk.%s", vi->i_ino, te);
2351 else 2374 err = -EIO;
2375 } else
2352 ntfs_error(vi->i_sb, "Failed to lookup attribute in " 2376 ntfs_error(vi->i_sb, "Failed to lookup attribute in "
2353 "inode 0x%lx (error code %d).", 2377 "inode 0x%lx (error code %d).%s",
2354 vi->i_ino, err); 2378 vi->i_ino, err, te);
2355 goto err_out; 2379 goto old_bad_out;
2356 } 2380 }
2381 m = ctx->mrec;
2357 a = ctx->attr; 2382 a = ctx->attr;
2358 /* If the size has not changed there is nothing to do. */ 2383 /*
2359 if (ntfs_attr_size(a) == i_size_read(vi)) 2384 * The i_size of the vfs inode is the new size for the attribute value.
2360 goto done; 2385 */
2361 // TODO: Implement the truncate... 2386 new_size = i_size_read(vi);
2362 ntfs_error(vi->i_sb, "Inode size has changed but this is not " 2387 /* The current size of the attribute value is the old size. */
2363 "implemented yet. Resetting inode size to old value. " 2388 old_size = ntfs_attr_size(a);
2364 " This is most likely a bug in the ntfs driver!"); 2389 /* Calculate the new allocated size. */
2365 i_size_write(vi, ntfs_attr_size(a)); 2390 if (NInoNonResident(ni))
2366done: 2391 new_alloc_size = (new_size + vol->cluster_size - 1) &
2392 ~(s64)vol->cluster_size_mask;
2393 else
2394 new_alloc_size = (new_size + 7) & ~7;
2395 /* The current allocated size is the old allocated size. */
2396 read_lock_irqsave(&ni->size_lock, flags);
2397 old_alloc_size = ni->allocated_size;
2398 read_unlock_irqrestore(&ni->size_lock, flags);
2399 /*
2400 * The change in the file size. This will be 0 if no change, >0 if the
2401 * size is growing, and <0 if the size is shrinking.
2402 */
2403 size_change = -1;
2404 if (new_size - old_size >= 0) {
2405 size_change = 1;
2406 if (new_size == old_size)
2407 size_change = 0;
2408 }
2409 /* As above for the allocated size. */
2410 alloc_change = -1;
2411 if (new_alloc_size - old_alloc_size >= 0) {
2412 alloc_change = 1;
2413 if (new_alloc_size == old_alloc_size)
2414 alloc_change = 0;
2415 }
2416 /*
2417 * If neither the size nor the allocation is being changed there is
2418 * nothing to do.
2419 */
2420 if (!size_change && !alloc_change)
2421 goto unm_done;
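The size and allocation deltas computed above reduce to simple rounding and sign checks. A standalone sketch with hypothetical numbers (user-space C; cluster_size_mask is taken to be cluster_size - 1, as elsewhere in the driver):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		int64_t cluster_size = 4096;	/* hypothetical volume geometry */
		int64_t cluster_size_mask = cluster_size - 1;
		int64_t new_size = 10000, old_size = 20000;

		/* Non-resident attribute: round up to a whole cluster. */
		int64_t new_alloc_size = (new_size + cluster_size - 1) &
				~cluster_size_mask;
		/* Resident attribute: round up to an 8-byte boundary instead. */
		int64_t resident_alloc = (new_size + 7) & ~7;
		/* Same three-way sign logic as ntfs_truncate() uses. */
		int size_change = -1;

		if (new_size - old_size >= 0) {
			size_change = 1;
			if (new_size == old_size)
				size_change = 0;
		}
		printf("new_alloc_size=%lld resident_alloc=%lld size_change=%d\n",
				(long long)new_alloc_size,
				(long long)resident_alloc, size_change);
		/* Prints: new_alloc_size=12288 resident_alloc=10000 size_change=-1 */
		return 0;
	}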
2422 /* If the size is changing, check if new size is allowed in $AttrDef. */
2423 if (size_change) {
2424 err = ntfs_attr_size_bounds_check(vol, ni->type, new_size);
2425 if (unlikely(err)) {
2426 if (err == -ERANGE) {
2427 ntfs_error(vol->sb, "Truncate would cause the "
2428 "inode 0x%lx to %simum size "
2429 "for its attribute type "
2430 "(0x%x). Aborting truncate.",
2431 vi->i_ino,
2432 new_size > old_size ? "exceed "
2433 "the max" : "go under the min",
2434 le32_to_cpu(ni->type));
2435 err = -EFBIG;
2436 } else {
2437 ntfs_error(vol->sb, "Inode 0x%lx has unknown "
2438 "attribute type 0x%x. "
2439 "Aborting truncate.",
2440 vi->i_ino,
2441 le32_to_cpu(ni->type));
2442 err = -EIO;
2443 }
2444 /* Reset the vfs inode size to the old size. */
2445 i_size_write(vi, old_size);
2446 goto err_out;
2447 }
2448 }
2449 if (NInoCompressed(ni) || NInoEncrypted(ni)) {
2450 ntfs_warning(vi->i_sb, "Changes in inode size are not "
2451 "supported yet for %s files, ignoring.",
2452 NInoCompressed(ni) ? "compressed" :
2453 "encrypted");
2454 err = -EOPNOTSUPP;
2455 goto bad_out;
2456 }
2457 if (a->non_resident)
2458 goto do_non_resident_truncate;
2459 BUG_ON(NInoNonResident(ni));
2460 /* Resize the attribute record to best fit the new attribute size. */
2461 if (new_size < vol->mft_record_size &&
2462 !ntfs_resident_attr_value_resize(m, a, new_size)) {
2463 unsigned long flags;
2464
2465 /* The resize succeeded! */
2466 flush_dcache_mft_record_page(ctx->ntfs_ino);
2467 mark_mft_record_dirty(ctx->ntfs_ino);
2468 write_lock_irqsave(&ni->size_lock, flags);
2469 /* Update the sizes in the ntfs inode and all is done. */
2470 ni->allocated_size = le32_to_cpu(a->length) -
2471 le16_to_cpu(a->data.resident.value_offset);
2472 /*
2473 * Note ntfs_resident_attr_value_resize() has already done any
2474 * necessary data clearing in the attribute record. When the
2475 * file is being shrunk vmtruncate() will already have cleared
2476 * the top part of the last partial page, i.e. since this is
2477 * the resident case this is the page with index 0. However,
2478 * when the file is being expanded, the page cache page data
2479 * between the old data_size, i.e. old_size, and the new_size
2480 * has not been zeroed. Fortunately, we do not need to zero it
2481 * either since on one hand it will either already be zero due
2482 * to both readpage and writepage clearing partial page data
2483 * beyond i_size in which case there is nothing to do or in the
2484 * case of the file being mmap()ped at the same time, POSIX
2485 * specifies that the behaviour is unspecified, thus we do not
2486 * have to do anything. This means that in our implementation
2487 * in the rare case that the file is mmap()ped and a write
2488 * occurred into the mmap()ped region just beyond the file size
2489 * and writepage has not yet been called to write out the page
2490 * (which would clear the area beyond the file size) and we now
2491 * extend the file size to incorporate this dirty region
2492 * outside the file size, a write of the page would result in
2493 * this data being written to disk instead of being cleared.
2494 * Given both POSIX and the Linux mmap(2) man page specify that
2495 * this corner case is undefined, we choose to leave it like
2496 * that as this is much simpler for us as we cannot lock the
2497 * relevant page now since we are holding too many ntfs locks
2498 * which would result in a lock reversal deadlock.
2499 */
2500 ni->initialized_size = new_size;
2501 write_unlock_irqrestore(&ni->size_lock, flags);
2502 goto unm_done;
2503 }
2504 /* If the above resize failed, this must be an attribute extension. */
2505 BUG_ON(size_change < 0);
2506 /*
2507 * We have to drop all the locks so we can call
2508 * ntfs_attr_make_non_resident(). This could be optimised by try-
2509 * locking the first page cache page and only if that fails dropping
2510 * the locks, locking the page, and redoing all the locking and
2511 * lookups. While this would be a huge optimisation, it is not worth
2512 * it as this is definitely a slow code path as it only ever can happen
2513 * once for any given file.
2514 */
2367 ntfs_attr_put_search_ctx(ctx); 2515 ntfs_attr_put_search_ctx(ctx);
2368 unmap_mft_record(ni); 2516 unmap_mft_record(base_ni);
2369 NInoClearTruncateFailed(ni); 2517 up_write(&ni->runlist.lock);
2370 ntfs_debug("Done."); 2518 /*
2371 return 0; 2519 * Not enough space in the mft record, try to make the attribute
2372err_out: 2520 * non-resident and if successful restart the truncation process.
2373 if (err != -ENOMEM) { 2521 */
2522 err = ntfs_attr_make_non_resident(ni, old_size);
2523 if (likely(!err))
2524 goto retry_truncate;
2525 /*
2526 * Could not make non-resident. If this is because conversion is not
2527 * permitted for this attribute type or because there is not enough
2528 * space, try to make other attributes non-resident. Otherwise fail.
2529 */
2530 if (unlikely(err != -EPERM && err != -ENOSPC)) {
2531 ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, attribute "
2532 "type 0x%x, because the conversion from "
2533 "resident to non-resident attribute failed "
2534 "with error code %i.", vi->i_ino,
2535 (unsigned)le32_to_cpu(ni->type), err);
2536 if (err != -ENOMEM)
2537 err = -EIO;
2538 goto conv_err_out;
2539 }
2540 /* TODO: Not implemented from here, abort. */
2541 if (err == -ENOSPC)
2542 ntfs_error(vol->sb, "Not enough space in the mft record/on "
2543 "disk for the non-resident attribute value. "
2544 "This case is not implemented yet.");
2545 else /* if (err == -EPERM) */
2546 ntfs_error(vol->sb, "This attribute type may not be "
2547 "non-resident. This case is not implemented "
2548 "yet.");
2549 err = -EOPNOTSUPP;
2550 goto conv_err_out;
2551#if 0
2552 // TODO: Attempt to make other attributes non-resident.
2553 if (!err)
2554 goto do_resident_extend;
2555 /*
2556 * Both the attribute list attribute and the standard information
2557 * attribute must remain in the base inode. Thus, if this is one of
2558 * these attributes, we have to try to move other attributes out into
2559 * extent mft records instead.
2560 */
2561 if (ni->type == AT_ATTRIBUTE_LIST ||
2562 ni->type == AT_STANDARD_INFORMATION) {
2563 // TODO: Attempt to move other attributes into extent mft
2564 // records.
2565 err = -EOPNOTSUPP;
2566 if (!err)
2567 goto do_resident_extend;
2568 goto err_out;
2569 }
2570 // TODO: Attempt to move this attribute to an extent mft record, but
2571 // only if it is not already the only attribute in an mft record in
2572 // which case there would be nothing to gain.
2573 err = -EOPNOTSUPP;
2574 if (!err)
2575 goto do_resident_extend;
2576 /* There is nothing we can do to make enough space. )-: */
2577 goto err_out;
2578#endif
2579do_non_resident_truncate:
2580 BUG_ON(!NInoNonResident(ni));
2581 if (alloc_change < 0) {
2582 highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
2583 if (highest_vcn > 0 &&
2584 old_alloc_size >> vol->cluster_size_bits >
2585 highest_vcn + 1) {
2586 /*
2587 * This attribute has multiple extents. Not yet
2588 * supported.
2589 */
2590 ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, "
2591 "attribute type 0x%x, because the "
2592 "attribute is highly fragmented (it "
2593 "consists of multiple extents) and "
2594 "this case is not implemented yet.",
2595 vi->i_ino,
2596 (unsigned)le32_to_cpu(ni->type));
2597 err = -EOPNOTSUPP;
2598 goto bad_out;
2599 }
2600 }
2601 /*
2602 * If the size is shrinking, need to reduce the initialized_size and
2603 * the data_size before reducing the allocation.
2604 */
2605 if (size_change < 0) {
2606 /*
2607 * Make the valid size smaller (i_size is already up-to-date).
2608 */
2609 write_lock_irqsave(&ni->size_lock, flags);
2610 if (new_size < ni->initialized_size) {
2611 ni->initialized_size = new_size;
2612 a->data.non_resident.initialized_size =
2613 cpu_to_sle64(new_size);
2614 }
2615 a->data.non_resident.data_size = cpu_to_sle64(new_size);
2616 write_unlock_irqrestore(&ni->size_lock, flags);
2617 flush_dcache_mft_record_page(ctx->ntfs_ino);
2618 mark_mft_record_dirty(ctx->ntfs_ino);
2619 /* If the allocated size is not changing, we are done. */
2620 if (!alloc_change)
2621 goto unm_done;
2622 /*
2623 * If the size is shrinking it makes no sense for the
2624 * allocation to be growing.
2625 */
2626 BUG_ON(alloc_change > 0);
2627 } else /* if (size_change >= 0) */ {
2628 /*
2629 * The file size is growing or staying the same but the
2630 * allocation can be shrinking, growing or staying the same.
2631 */
2632 if (alloc_change > 0) {
2633 /*
2634 * We need to extend the allocation and possibly update
2635 * the data size. If we are updating the data size,
2636 * since we are not touching the initialized_size we do
2637 * not need to worry about the actual data on disk.
2638 * And as far as the page cache is concerned, there
2639 * will be no pages beyond the old data size and any
2640 * partial region in the last page between the old and
2641 * new data size (or the end of the page if the new
2642 * data size is outside the page) does not need to be
2643 * modified as explained above for the resident
2644 * attribute truncate case. To do this, we simply drop
2645 * the locks we hold and leave all the work to our
2646 * friendly helper ntfs_attr_extend_allocation().
2647 */
2648 ntfs_attr_put_search_ctx(ctx);
2649 unmap_mft_record(base_ni);
2650 up_write(&ni->runlist.lock);
2651 err = ntfs_attr_extend_allocation(ni, new_size,
2652 size_change > 0 ? new_size : -1, -1);
2653 /*
2654 * ntfs_attr_extend_allocation() will have done error
2655 * output already.
2656 */
2657 goto done;
2658 }
2659 if (!alloc_change)
2660 goto alloc_done;
2661 }
2662 /* alloc_change < 0 */
2663 /* Free the clusters. */
2664 nr_freed = ntfs_cluster_free(ni, new_alloc_size >>
2665 vol->cluster_size_bits, -1, ctx);
2666 m = ctx->mrec;
2667 a = ctx->attr;
2668 if (unlikely(nr_freed < 0)) {
2669 ntfs_error(vol->sb, "Failed to release cluster(s) (error code "
2670 "%lli). Unmount and run chkdsk to recover "
2671 "the lost cluster(s).", (long long)nr_freed);
2374 NVolSetErrors(vol); 2672 NVolSetErrors(vol);
2673 nr_freed = 0;
2674 }
2675 /* Truncate the runlist. */
2676 err = ntfs_rl_truncate_nolock(vol, &ni->runlist,
2677 new_alloc_size >> vol->cluster_size_bits);
2678 /*
2679 * If the runlist truncation failed and/or the search context is no
2680 * longer valid, we cannot resize the attribute record or build the
2681 * mapping pairs array, thus we mark the inode bad so that no access to
2682 * the freed clusters can happen.
2683 */
2684 if (unlikely(err || IS_ERR(m))) {
2685 ntfs_error(vol->sb, "Failed to %s (error code %li).%s",
2686 IS_ERR(m) ?
2687 "restore attribute search context" :
2688 "truncate attribute runlist",
2689 IS_ERR(m) ? PTR_ERR(m) : err, es);
2690 err = -EIO;
2691 goto bad_out;
2692 }
2693 /* Get the size for the shrunk mapping pairs array for the runlist. */
2694 mp_size = ntfs_get_size_for_mapping_pairs(vol, ni->runlist.rl, 0, -1);
2695 if (unlikely(mp_size <= 0)) {
2696 ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, "
2697 "attribute type 0x%x, because determining the "
2698 "size for the mapping pairs failed with error "
2699 "code %i.%s", vi->i_ino,
2700 (unsigned)le32_to_cpu(ni->type), mp_size, es);
2701 err = -EIO;
2702 goto bad_out;
2703 }
2704 /*
2705 * Shrink the attribute record for the new mapping pairs array. Note,
2706 * this cannot fail since we are making the attribute smaller thus by
2707 * definition there is enough space to do so.
2708 */
2709 attr_len = le32_to_cpu(a->length);
2710 err = ntfs_attr_record_resize(m, a, mp_size +
2711 le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
2712 BUG_ON(err);
2713 /*
2714 * Generate the mapping pairs array directly into the attribute record.
2715 */
2716 err = ntfs_mapping_pairs_build(vol, (u8*)a +
2717 le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
2718 mp_size, ni->runlist.rl, 0, -1, NULL);
2719 if (unlikely(err)) {
2720 ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, "
2721 "attribute type 0x%x, because building the "
2722 "mapping pairs failed with error code %i.%s",
2723 vi->i_ino, (unsigned)le32_to_cpu(ni->type),
2724 err, es);
2725 err = -EIO;
2726 goto bad_out;
2727 }
2728 /* Update the allocated/compressed size as well as the highest vcn. */
2729 a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >>
2730 vol->cluster_size_bits) - 1);
2731 write_lock_irqsave(&ni->size_lock, flags);
2732 ni->allocated_size = new_alloc_size;
2733 a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size);
2734 if (NInoSparse(ni) || NInoCompressed(ni)) {
2735 if (nr_freed) {
2736 ni->itype.compressed.size -= nr_freed <<
2737 vol->cluster_size_bits;
2738 BUG_ON(ni->itype.compressed.size < 0);
2739 a->data.non_resident.compressed_size = cpu_to_sle64(
2740 ni->itype.compressed.size);
2741 vi->i_blocks = ni->itype.compressed.size >> 9;
2742 }
2743 } else
2744 vi->i_blocks = new_alloc_size >> 9;
2745 write_unlock_irqrestore(&ni->size_lock, flags);
2746 /*
2747 * We have shrunk the allocation. If this is a shrinking truncate we
2748 * have already dealt with the initialized_size and the data_size above
2749 * and we are done. If the truncate is only changing the allocation
2750 * and not the data_size, we are also done. If this is an extending
2751 * truncate, need to extend the data_size now which is ensured by the
2752 * fact that @size_change is positive.
2753 */
2754alloc_done:
2755 /*
2756 * If the size is growing, need to update it now. If it is shrinking,
2757 * we have already updated it above (before the allocation change).
2758 */
2759 if (size_change > 0)
2760 a->data.non_resident.data_size = cpu_to_sle64(new_size);
2761 /* Ensure the modified mft record is written out. */
2762 flush_dcache_mft_record_page(ctx->ntfs_ino);
2763 mark_mft_record_dirty(ctx->ntfs_ino);
2764unm_done:
2765 ntfs_attr_put_search_ctx(ctx);
2766 unmap_mft_record(base_ni);
2767 up_write(&ni->runlist.lock);
2768done:
2769 /* Update the mtime and ctime on the base inode. */
2770 inode_update_time(VFS_I(base_ni), 1);
2771 if (likely(!err)) {
2772 NInoClearTruncateFailed(ni);
2773 ntfs_debug("Done.");
2774 }
2775 return err;
2776old_bad_out:
2777 old_size = -1;
2778bad_out:
2779 if (err != -ENOMEM && err != -EOPNOTSUPP) {
2375 make_bad_inode(vi); 2780 make_bad_inode(vi);
2781 make_bad_inode(VFS_I(base_ni));
2782 NVolSetErrors(vol);
2376 } 2783 }
2784 if (err != -EOPNOTSUPP)
2785 NInoSetTruncateFailed(ni);
2786 else if (old_size >= 0)
2787 i_size_write(vi, old_size);
2788err_out:
2377 if (ctx) 2789 if (ctx)
2378 ntfs_attr_put_search_ctx(ctx); 2790 ntfs_attr_put_search_ctx(ctx);
2379 if (m) 2791 if (m)
2380 unmap_mft_record(ni); 2792 unmap_mft_record(base_ni);
2381 NInoSetTruncateFailed(ni); 2793 up_write(&ni->runlist.lock);
2794out:
2795 ntfs_debug("Failed. Returning error code %i.", err);
2382 return err; 2796 return err;
2797conv_err_out:
2798 if (err != -ENOMEM && err != -EOPNOTSUPP) {
2799 make_bad_inode(vi);
2800 make_bad_inode(VFS_I(base_ni));
2801 NVolSetErrors(vol);
2802 }
2803 if (err != -EOPNOTSUPP)
2804 NInoSetTruncateFailed(ni);
2805 else
2806 i_size_write(vi, old_size);
2807 goto out;
2383} 2808}
2384 2809
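To summarise the non-resident shrink path of ntfs_truncate() above: free the clusters, trim the runlist, shrink the attribute record, rebuild the mapping pairs array, and update the sizes. A condensed sketch using the same helpers as the patch (all error handling elided):

	nr_freed = ntfs_cluster_free(ni, new_alloc_size >> vol->cluster_size_bits,
			-1, ctx);			/* 1: free the tail clusters */
	m = ctx->mrec;					/* ctx may have been remapped */
	a = ctx->attr;
	ntfs_rl_truncate_nolock(vol, &ni->runlist,	/* 2: trim the runlist */
			new_alloc_size >> vol->cluster_size_bits);
	mp_size = ntfs_get_size_for_mapping_pairs(vol, ni->runlist.rl, 0, -1);
	ntfs_attr_record_resize(m, a, mp_size +		/* 3: shrink the record */
			le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
	ntfs_mapping_pairs_build(vol, (u8*)a +		/* 4: rebuild the pairs */
			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
			mp_size, ni->runlist.rl, 0, -1, NULL);
	/* 5: update highest_vcn, allocated_size, and i_blocks under the size
	 * lock, then dirty the mft record so the changes reach disk. */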
2385/** 2810/**
@@ -2420,8 +2845,7 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
2420 2845
2421 err = inode_change_ok(vi, attr); 2846 err = inode_change_ok(vi, attr);
2422 if (err) 2847 if (err)
2423 return err; 2848 goto out;
2424
2425 /* We do not support NTFS ACLs yet. */ 2849 /* We do not support NTFS ACLs yet. */
2426 if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE)) { 2850 if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE)) {
2427 ntfs_warning(vi->i_sb, "Changes in user/group/mode are not " 2851 ntfs_warning(vi->i_sb, "Changes in user/group/mode are not "
@@ -2429,14 +2853,22 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
2429 err = -EOPNOTSUPP; 2853 err = -EOPNOTSUPP;
2430 goto out; 2854 goto out;
2431 } 2855 }
2432
2433 if (ia_valid & ATTR_SIZE) { 2856 if (ia_valid & ATTR_SIZE) {
2434 if (attr->ia_size != i_size_read(vi)) { 2857 if (attr->ia_size != i_size_read(vi)) {
2435 ntfs_warning(vi->i_sb, "Changes in inode size are not " 2858 ntfs_inode *ni = NTFS_I(vi);
2436 "supported yet, ignoring."); 2859 /*
2437 err = -EOPNOTSUPP; 2860 * FIXME: For now we do not support resizing of
2438 // TODO: Implement... 2861 * compressed or encrypted files yet.
2439 // err = vmtruncate(vi, attr->ia_size); 2862 */
2863 if (NInoCompressed(ni) || NInoEncrypted(ni)) {
2864 ntfs_warning(vi->i_sb, "Changes in inode size "
2865 "are not supported yet for "
2866 "%s files, ignoring.",
2867 NInoCompressed(ni) ?
2868 "compressed" : "encrypted");
2869 err = -EOPNOTSUPP;
2870 } else
2871 err = vmtruncate(vi, attr->ia_size);
2440 if (err || ia_valid == ATTR_SIZE) 2872 if (err || ia_valid == ATTR_SIZE)
2441 goto out; 2873 goto out;
2442 } else { 2874 } else {
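Condensing the ATTR_SIZE branch above: size changes on normal files are now handed to vmtruncate(), which reaches ntfs_truncate() through the VFS ->truncate hook, while compressed and encrypted files are still refused. A sketch (error paths elided):

	if ((ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(vi)) {
		ntfs_inode *ni = NTFS_I(vi);

		if (NInoCompressed(ni) || NInoEncrypted(ni))
			err = -EOPNOTSUPP;		/* resizing unsupported */
		else
			err = vmtruncate(vi, attr->ia_size); /* -> ntfs_truncate() */
	}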
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index 5c248d404f05..f5678d5d7919 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -1021,10 +1021,17 @@ enum {
1021 FILE_NAME_POSIX = 0x00, 1021 FILE_NAME_POSIX = 0x00,
1022 /* This is the largest namespace. It is case sensitive and allows all 1022 /* This is the largest namespace. It is case sensitive and allows all
1023 Unicode characters except for: '\0' and '/'. Beware that in 1023 Unicode characters except for: '\0' and '/'. Beware that in
1024 WinNT/2k files which eg have the same name except for their case 1024 WinNT/2k/2003 by default files which eg have the same name except
1025 will not be distinguished by the standard utilities and thus a "del 1025 for their case will not be distinguished by the standard utilities
1026 filename" will delete both "filename" and "fileName" without 1026 and thus a "del filename" will delete both "filename" and "fileName"
1027 warning. */ 1027 without warning. However, if for example Services For Unix (SFU) are
1028 installed and the case sensitive option was enabled at installation
1029 time, then you can create/access/delete such files.
1030 Note that even SFU places restrictions on the filenames beyond the
1031 '\0' and '/' and in particular the following set of characters is
1032 not allowed: '"', '/', '<', '>', '\'. All other characters,
1033 including the ones not allowed in the WIN32 namespace, are allowed.
1034 Tested with SFU 3.5 (this is now free) running on Windows XP. */
1028 FILE_NAME_WIN32 = 0x01, 1035 FILE_NAME_WIN32 = 0x01,
1029 /* The standard WinNT/2k NTFS long filenames. Case insensitive. All 1036 /* The standard WinNT/2k NTFS long filenames. Case insensitive. All
1030 Unicode chars except: '\0', '"', '*', '/', ':', '<', '>', '?', '\', 1037 Unicode chars except: '\0', '"', '*', '/', ':', '<', '>', '?', '\',
@@ -2367,7 +2374,9 @@ typedef struct {
2367 * Extended attribute flags (8-bit). 2374 * Extended attribute flags (8-bit).
2368 */ 2375 */
2369enum { 2376enum {
2370 NEED_EA = 0x80 2377 NEED_EA = 0x80 /* If set the file to which the EA belongs
2378 cannot be interpreted without understanding
2379 the associates extended attributes. */
2371} __attribute__ ((__packed__)); 2380} __attribute__ ((__packed__));
2372 2381
2373typedef u8 EA_FLAGS; 2382typedef u8 EA_FLAGS;
@@ -2375,20 +2384,20 @@ typedef u8 EA_FLAGS;
2375/* 2384/*
2376 * Attribute: Extended attribute (EA) (0xe0). 2385 * Attribute: Extended attribute (EA) (0xe0).
2377 * 2386 *
2378 * NOTE: Always non-resident. (Is this true?) 2387 * NOTE: Can be resident or non-resident.
2379 * 2388 *
2380 * Like the attribute list and the index buffer list, the EA attribute value is 2389 * Like the attribute list and the index buffer list, the EA attribute value is
2381 * a sequence of EA_ATTR variable length records. 2390 * a sequence of EA_ATTR variable length records.
2382 *
2383 * FIXME: It appears weird that the EA name is not unicode. Is it true?
2384 */ 2391 */
2385typedef struct { 2392typedef struct {
2386 le32 next_entry_offset; /* Offset to the next EA_ATTR. */ 2393 le32 next_entry_offset; /* Offset to the next EA_ATTR. */
2387 EA_FLAGS flags; /* Flags describing the EA. */ 2394 EA_FLAGS flags; /* Flags describing the EA. */
2388 u8 ea_name_length; /* Length of the name of the EA in bytes. */ 2395 u8 ea_name_length; /* Length of the name of the EA in bytes
2396 excluding the '\0' byte terminator. */
2389 le16 ea_value_length; /* Byte size of the EA's value. */ 2397 le16 ea_value_length; /* Byte size of the EA's value. */
2390 u8 ea_name[0]; /* Name of the EA. */ 2398 u8 ea_name[0]; /* Name of the EA. Note this is ASCII, not
2391 u8 ea_value[0]; /* The value of the EA. Immediately follows 2399 Unicode, and it is zero terminated. */
2400 u8 ea_value[0]; /* The value of the EA. Immediately follows
2392 the name. */ 2401 the name. */
2393} __attribute__ ((__packed__)) EA_ATTR; 2402} __attribute__ ((__packed__)) EA_ATTR;
2394 2403
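Since the EA attribute value is a packed sequence of variable-length EA_ATTR records chained through next_entry_offset, a parser just hops from record to record. A user-space sketch (little-endian host assumed, types reduced to plain integers; a real parser must also bounds-check the name and value against the attribute length):

	#include <stdint.h>
	#include <stdio.h>

	struct ea_attr {			/* reduced mirror of EA_ATTR */
		uint32_t next_entry_offset;	/* offset to the next EA_ATTR */
		uint8_t  flags;
		uint8_t  ea_name_length;	/* excludes the '\0' terminator */
		uint16_t ea_value_length;
		char     ea_name[];		/* ASCII, zero terminated; the
						   value follows the '\0' byte */
	} __attribute__((__packed__));

	static void walk_eas(const uint8_t *buf, size_t len)
	{
		size_t pos = 0;

		while (pos + sizeof(struct ea_attr) <= len) {
			const struct ea_attr *ea =
					(const struct ea_attr*)(buf + pos);

			printf("EA %.*s, %u value bytes\n", ea->ea_name_length,
					ea->ea_name,
					(unsigned)ea->ea_value_length);
			if (!ea->next_entry_offset)	/* last record in chain */
				break;
			pos += ea->next_entry_offset;
		}
	}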
diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c
index 5af3bf0b7eee..29cabf93d2d2 100644
--- a/fs/ntfs/lcnalloc.c
+++ b/fs/ntfs/lcnalloc.c
@@ -76,6 +76,7 @@ int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
76 * @count: number of clusters to allocate 76 * @count: number of clusters to allocate
77 * @start_lcn: starting lcn at which to allocate the clusters (or -1 if none) 77 * @start_lcn: starting lcn at which to allocate the clusters (or -1 if none)
78 * @zone: zone from which to allocate the clusters 78 * @zone: zone from which to allocate the clusters
79 * @is_extension: if TRUE, this is an attribute extension
79 * 80 *
80 * Allocate @count clusters preferably starting at cluster @start_lcn or at the 81 * Allocate @count clusters preferably starting at cluster @start_lcn or at the
81 * current allocator position if @start_lcn is -1, on the mounted ntfs volume 82 * current allocator position if @start_lcn is -1, on the mounted ntfs volume
@@ -86,6 +87,13 @@ int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
86 * @start_vcn specifies the vcn of the first allocated cluster. This makes 87 * @start_vcn specifies the vcn of the first allocated cluster. This makes
87 * merging the resulting runlist with the old runlist easier. 88 * merging the resulting runlist with the old runlist easier.
88 * 89 *
90 * If @is_extension is TRUE, the caller is allocating clusters to extend an
91 * attribute and if it is FALSE, the caller is allocating clusters to fill a
92 * hole in an attribute. Practically the difference is that if @is_extension
93 * is TRUE the returned runlist will be terminated with LCN_ENOENT and if
94 * @is_extension is FALSE the runlist will be terminated with
95 * LCN_RL_NOT_MAPPED.
96 *
89 * You need to check the return value with IS_ERR(). If this is false, the 97 * You need to check the return value with IS_ERR(). If this is false, the
90 * function was successful and the return value is a runlist describing the 98 * function was successful and the return value is a runlist describing the
91 * allocated cluster(s). If IS_ERR() is true, the function failed and 99 * allocated cluster(s). If IS_ERR() is true, the function failed and
@@ -137,7 +145,8 @@ int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
137 */ 145 */
138runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, 146runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
139 const s64 count, const LCN start_lcn, 147 const s64 count, const LCN start_lcn,
140 const NTFS_CLUSTER_ALLOCATION_ZONES zone) 148 const NTFS_CLUSTER_ALLOCATION_ZONES zone,
149 const BOOL is_extension)
141{ 150{
142 LCN zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn; 151 LCN zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn;
143 LCN prev_lcn = 0, prev_run_len = 0, mft_zone_size; 152 LCN prev_lcn = 0, prev_run_len = 0, mft_zone_size;
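A sketch of a call under the extended signature (caller boilerplate omitted); @is_extension only changes the terminator of the returned runlist:

	runlist_element *rl;

	/* Extending an attribute: the runlist is terminated with LCN_ENOENT. */
	rl = ntfs_cluster_alloc(vol, start_vcn, nr_clusters, -1 /* no start lcn */,
			DATA_ZONE, TRUE);
	if (IS_ERR(rl))
		return PTR_ERR(rl);		/* e.g. -ENOSPC */
	/*
	 * To fill a hole instead, pass FALSE and the runlist is terminated
	 * with LCN_RL_NOT_MAPPED, ready to be merged into a partially mapped
	 * runlist.
	 */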
@@ -310,7 +319,7 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
310 continue; 319 continue;
311 } 320 }
312 bit = 1 << (lcn & 7); 321 bit = 1 << (lcn & 7);
313 ntfs_debug("bit %i.", bit); 322 ntfs_debug("bit 0x%x.", bit);
314 /* If the bit is already set, go onto the next one. */ 323 /* If the bit is already set, go onto the next one. */
315 if (*byte & bit) { 324 if (*byte & bit) {
316 lcn++; 325 lcn++;
@@ -729,7 +738,7 @@ out:
729 /* Add runlist terminator element. */ 738 /* Add runlist terminator element. */
730 if (likely(rl)) { 739 if (likely(rl)) {
731 rl[rlpos].vcn = rl[rlpos - 1].vcn + rl[rlpos - 1].length; 740 rl[rlpos].vcn = rl[rlpos - 1].vcn + rl[rlpos - 1].length;
732 rl[rlpos].lcn = LCN_RL_NOT_MAPPED; 741 rl[rlpos].lcn = is_extension ? LCN_ENOENT : LCN_RL_NOT_MAPPED;
733 rl[rlpos].length = 0; 742 rl[rlpos].length = 0;
734 } 743 }
735 if (likely(page && !IS_ERR(page))) { 744 if (likely(page && !IS_ERR(page))) {
@@ -782,6 +791,7 @@ out:
782 * @ni: ntfs inode whose runlist describes the clusters to free 791 * @ni: ntfs inode whose runlist describes the clusters to free
783 * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters 792 * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters
784 * @count: number of clusters to free or -1 for all clusters 793 * @count: number of clusters to free or -1 for all clusters
794 * @ctx: active attribute search context if present or NULL if not
785 * @is_rollback: true if this is a rollback operation 795 * @is_rollback: true if this is a rollback operation
786 * 796 *
787 * Free @count clusters starting at the cluster @start_vcn in the runlist 797 * Free @count clusters starting at the cluster @start_vcn in the runlist
@@ -791,15 +801,39 @@ out:
791 * deallocated. Thus, to completely free all clusters in a runlist, use 801 * deallocated. Thus, to completely free all clusters in a runlist, use
792 * @start_vcn = 0 and @count = -1. 802 * @start_vcn = 0 and @count = -1.
793 * 803 *
804 * If @ctx is specified, it is an active search context of @ni and its base mft
805 * record. This is needed when __ntfs_cluster_free() encounters unmapped
806 * runlist fragments and allows their mapping. If you do not have the mft
807 * record mapped, you can specify @ctx as NULL and __ntfs_cluster_free() will
808 * perform the necessary mapping and unmapping.
809 *
810 * Note, __ntfs_cluster_free() saves the state of @ctx on entry and restores it
811 * before returning. Thus, @ctx will be left pointing to the same attribute on
812 * return as on entry. However, the actual pointers in @ctx may point to
813 * different memory locations on return, so you must remember to reset any
814 * cached pointers from the @ctx, i.e. after the call to __ntfs_cluster_free(),
815 * you will probably want to do:
816 * m = ctx->mrec;
817 * a = ctx->attr;
818 * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
819 * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
820 *
794 * @is_rollback should always be FALSE, it is for internal use to rollback 821 * @is_rollback should always be FALSE, it is for internal use to rollback
795 * errors. You probably want to use ntfs_cluster_free() instead. 822 * errors. You probably want to use ntfs_cluster_free() instead.
796 * 823 *
797 * Note, ntfs_cluster_free() does not modify the runlist at all, so the caller 824 * Note, __ntfs_cluster_free() does not modify the runlist, so you have to
798 * has to deal with it later. 825 * remove the freed runs from the runlist or mark them sparse later.
799 * 826 *
800 * Return the number of deallocated clusters (not counting sparse ones) on 827 * Return the number of deallocated clusters (not counting sparse ones) on
801 * success and -errno on error. 828 * success and -errno on error.
802 * 829 *
830 * WARNING: If @ctx is supplied, regardless of whether success or failure is
831 * returned, you need to check IS_ERR(@ctx->mrec) and if TRUE the @ctx
832 * is no longer valid, i.e. you need to either call
833 * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
834 * In that case PTR_ERR(@ctx->mrec) will give you the error code for
835 * why the mapping of the old inode failed.
836 *
803 * Locking: - The runlist described by @ni must be locked for writing on entry 837 * Locking: - The runlist described by @ni must be locked for writing on entry
804 * and is locked on return. Note the runlist may be modified when 838 * and is locked on return. Note the runlist may be modified when
805 * needed runlist fragments need to be mapped. 839 * needed runlist fragments need to be mapped.
@@ -807,9 +841,13 @@ out:
807 * on return. 841 * on return.
808 * - This function takes the volume lcn bitmap lock for writing and 842 * - This function takes the volume lcn bitmap lock for writing and
809 * modifies the bitmap contents. 843 * modifies the bitmap contents.
844 * - If @ctx is NULL, the base mft record of @ni must not be mapped on
845 * entry and it will be left unmapped on return.
846 * - If @ctx is not NULL, the base mft record must be mapped on entry
847 * and it will be left mapped on return.
810 */ 848 */
811s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count, 849s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count,
812 const BOOL is_rollback) 850 ntfs_attr_search_ctx *ctx, const BOOL is_rollback)
813{ 851{
814 s64 delta, to_free, total_freed, real_freed; 852 s64 delta, to_free, total_freed, real_freed;
815 ntfs_volume *vol; 853 ntfs_volume *vol;
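The caller-side discipline the documentation above describes, in sketch form (this is the same pattern the new ntfs_truncate() follows after freeing clusters):

	nr_freed = ntfs_cluster_free(ni, start_vcn, -1, ctx);
	/* @ctx may have been re-mapped: refresh any cached pointers. */
	m = ctx->mrec;
	a = ctx->attr;
	if (IS_ERR(m)) {
		/* @ctx is no longer valid; PTR_ERR(m) is the mapping error. */
		err = PTR_ERR(m);
		/* reinit or put the search context before bailing out */
	}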
@@ -839,7 +877,7 @@ s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count,
839 877
840 total_freed = real_freed = 0; 878 total_freed = real_freed = 0;
841 879
842 rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, TRUE); 880 rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, ctx);
843 if (IS_ERR(rl)) { 881 if (IS_ERR(rl)) {
844 if (!is_rollback) 882 if (!is_rollback)
845 ntfs_error(vol->sb, "Failed to find first runlist " 883 ntfs_error(vol->sb, "Failed to find first runlist "
@@ -893,7 +931,7 @@ s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count,
893 931
894 /* Attempt to map runlist. */ 932 /* Attempt to map runlist. */
895 vcn = rl->vcn; 933 vcn = rl->vcn;
896 rl = ntfs_attr_find_vcn_nolock(ni, vcn, TRUE); 934 rl = ntfs_attr_find_vcn_nolock(ni, vcn, ctx);
897 if (IS_ERR(rl)) { 935 if (IS_ERR(rl)) {
898 err = PTR_ERR(rl); 936 err = PTR_ERR(rl);
899 if (!is_rollback) 937 if (!is_rollback)
@@ -961,7 +999,7 @@ err_out:
961 * If rollback fails, set the volume errors flag, emit an error 999 * If rollback fails, set the volume errors flag, emit an error
962 * message, and return the error code. 1000 * message, and return the error code.
963 */ 1001 */
964 delta = __ntfs_cluster_free(ni, start_vcn, total_freed, TRUE); 1002 delta = __ntfs_cluster_free(ni, start_vcn, total_freed, ctx, TRUE);
965 if (delta < 0) { 1003 if (delta < 0) {
966 ntfs_error(vol->sb, "Failed to rollback (error %i). Leaving " 1004 ntfs_error(vol->sb, "Failed to rollback (error %i). Leaving "
967 "inconsistent metadata! Unmount and run " 1005 "inconsistent metadata! Unmount and run "
diff --git a/fs/ntfs/lcnalloc.h b/fs/ntfs/lcnalloc.h
index a6a8827882e7..72cbca7003b2 100644
--- a/fs/ntfs/lcnalloc.h
+++ b/fs/ntfs/lcnalloc.h
@@ -27,6 +27,7 @@
27 27
28#include <linux/fs.h> 28#include <linux/fs.h>
29 29
30#include "attrib.h"
30#include "types.h" 31#include "types.h"
31#include "inode.h" 32#include "inode.h"
32#include "runlist.h" 33#include "runlist.h"
@@ -41,16 +42,18 @@ typedef enum {
41 42
42extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, 43extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol,
43 const VCN start_vcn, const s64 count, const LCN start_lcn, 44 const VCN start_vcn, const s64 count, const LCN start_lcn,
44 const NTFS_CLUSTER_ALLOCATION_ZONES zone); 45 const NTFS_CLUSTER_ALLOCATION_ZONES zone,
46 const BOOL is_extension);
45 47
46extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, 48extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
47 s64 count, const BOOL is_rollback); 49 s64 count, ntfs_attr_search_ctx *ctx, const BOOL is_rollback);
48 50
49/** 51/**
50 * ntfs_cluster_free - free clusters on an ntfs volume 52 * ntfs_cluster_free - free clusters on an ntfs volume
51 * @ni: ntfs inode whose runlist describes the clusters to free 53 * @ni: ntfs inode whose runlist describes the clusters to free
52 * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters 54 * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters
53 * @count: number of clusters to free or -1 for all clusters 55 * @count: number of clusters to free or -1 for all clusters
56 * @ctx: active attribute search context if present or NULL if not
54 * 57 *
55 * Free @count clusters starting at the cluster @start_vcn in the runlist 58 * Free @count clusters starting at the cluster @start_vcn in the runlist
56 * described by the ntfs inode @ni. 59 * described by the ntfs inode @ni.
@@ -59,12 +62,36 @@ extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
59 * deallocated. Thus, to completely free all clusters in a runlist, use 62 * deallocated. Thus, to completely free all clusters in a runlist, use
60 * @start_vcn = 0 and @count = -1. 63 * @start_vcn = 0 and @count = -1.
61 * 64 *
62 * Note, ntfs_cluster_free() does not modify the runlist at all, so the caller 65 * If @ctx is specified, it is an active search context of @ni and its base mft
63 * has to deal with it later. 66 * record. This is needed when ntfs_cluster_free() encounters unmapped runlist
67 * fragments and allows their mapping. If you do not have the mft record
68 * mapped, you can specify @ctx as NULL and ntfs_cluster_free() will perform
69 * the necessary mapping and unmapping.
70 *
71 * Note, ntfs_cluster_free() saves the state of @ctx on entry and restores it
72 * before returning. Thus, @ctx will be left pointing to the same attribute on
73 * return as on entry. However, the actual pointers in @ctx may point to
74 * different memory locations on return, so you must remember to reset any
75 * cached pointers from the @ctx, i.e. after the call to ntfs_cluster_free(),
76 * you will probably want to do:
77 * m = ctx->mrec;
78 * a = ctx->attr;
79 * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
80 * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
81 *
82 * Note, ntfs_cluster_free() does not modify the runlist, so you have to
83 * remove the freed runs from the runlist or mark them sparse later.
64 * 84 *
65 * Return the number of deallocated clusters (not counting sparse ones) on 85 * Return the number of deallocated clusters (not counting sparse ones) on
66 * success and -errno on error. 86 * success and -errno on error.
67 * 87 *
88 * WARNING: If @ctx is supplied, regardless of whether success or failure is
89 * returned, you need to check IS_ERR(@ctx->mrec) and if TRUE the @ctx
90 * is no longer valid, i.e. you need to either call
91 * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
92 * In that case PTR_ERR(@ctx->mrec) will give you the error code for
93 * why the mapping of the old inode failed.
94 *
68 * Locking: - The runlist described by @ni must be locked for writing on entry 95 * Locking: - The runlist described by @ni must be locked for writing on entry
69 * and is locked on return. Note the runlist may be modified when 96 * and is locked on return. Note the runlist may be modified when
70 * needed runlist fragments need to be mapped. 97 * needed runlist fragments need to be mapped.
@@ -72,11 +99,15 @@ extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
72 * on return. 99 * on return.
73 * - This function takes the volume lcn bitmap lock for writing and 100 * - This function takes the volume lcn bitmap lock for writing and
74 * modifies the bitmap contents. 101 * modifies the bitmap contents.
102 * - If @ctx is NULL, the base mft record of @ni must not be mapped on
103 * entry and it will be left unmapped on return.
104 * - If @ctx is not NULL, the base mft record must be mapped on entry
105 * and it will be left mapped on return.
75 */ 106 */
76static inline s64 ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, 107static inline s64 ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
77 s64 count) 108 s64 count, ntfs_attr_search_ctx *ctx)
78{ 109{
79 return __ntfs_cluster_free(ni, start_vcn, count, FALSE); 110 return __ntfs_cluster_free(ni, start_vcn, count, ctx, FALSE);
80} 111}
81 112
82extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol, 113extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index 590887b943f5..e38e402e4103 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -39,8 +39,7 @@
39 * If there was insufficient memory to complete the request, return NULL. 39 * If there was insufficient memory to complete the request, return NULL.
40 * Depending on @gfp_mask the allocation may be guaranteed to succeed. 40 * Depending on @gfp_mask the allocation may be guaranteed to succeed.
41 */ 41 */
42static inline void *__ntfs_malloc(unsigned long size, 42static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask)
43 gfp_t gfp_mask)
44{ 43{
45 if (likely(size <= PAGE_SIZE)) { 44 if (likely(size <= PAGE_SIZE)) {
46 BUG_ON(!size); 45 BUG_ON(!size);
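For context, __ntfs_malloc() (whose opening the hunk shows) picks an allocator by size. A hedged sketch of the idea, assuming the rest of the body follows the usual kmalloc-or-__vmalloc split:

	static inline void *sketch_malloc(unsigned long size, gfp_t gfp_mask)
	{
		if (likely(size <= PAGE_SIZE)) {
			BUG_ON(!size);
			/* Sub-page sizes: kmalloc() is cheap and gives
			 * physically contiguous memory. */
			return kmalloc(PAGE_SIZE, gfp_mask);
		}
		/* Larger buffers only need virtual contiguity. */
		return __vmalloc(size, gfp_mask, PAGE_KERNEL);
	}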
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index b011369b5956..0c65cbb8c5cf 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -49,7 +49,8 @@ static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
49 ntfs_volume *vol = ni->vol; 49 ntfs_volume *vol = ni->vol;
50 struct inode *mft_vi = vol->mft_ino; 50 struct inode *mft_vi = vol->mft_ino;
51 struct page *page; 51 struct page *page;
52 unsigned long index, ofs, end_index; 52 unsigned long index, end_index;
53 unsigned ofs;
53 54
54 BUG_ON(ni->page); 55 BUG_ON(ni->page);
55 /* 56 /*
@@ -1308,7 +1309,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
1308 ll = mftbmp_ni->allocated_size; 1309 ll = mftbmp_ni->allocated_size;
1309 read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); 1310 read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
1310 rl = ntfs_attr_find_vcn_nolock(mftbmp_ni, 1311 rl = ntfs_attr_find_vcn_nolock(mftbmp_ni,
1311 (ll - 1) >> vol->cluster_size_bits, TRUE); 1312 (ll - 1) >> vol->cluster_size_bits, NULL);
1312 if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) { 1313 if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
1313 up_write(&mftbmp_ni->runlist.lock); 1314 up_write(&mftbmp_ni->runlist.lock);
1314 ntfs_error(vol->sb, "Failed to determine last allocated " 1315 ntfs_error(vol->sb, "Failed to determine last allocated "
@@ -1354,7 +1355,8 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
1354 up_write(&vol->lcnbmp_lock); 1355 up_write(&vol->lcnbmp_lock);
1355 ntfs_unmap_page(page); 1356 ntfs_unmap_page(page);
1356 /* Allocate a cluster from the DATA_ZONE. */ 1357 /* Allocate a cluster from the DATA_ZONE. */
1357 rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE); 1358 rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE,
1359 TRUE);
1358 if (IS_ERR(rl2)) { 1360 if (IS_ERR(rl2)) {
1359 up_write(&mftbmp_ni->runlist.lock); 1361 up_write(&mftbmp_ni->runlist.lock);
1360 ntfs_error(vol->sb, "Failed to allocate a cluster for " 1362 ntfs_error(vol->sb, "Failed to allocate a cluster for "
@@ -1738,7 +1740,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
1738 ll = mft_ni->allocated_size; 1740 ll = mft_ni->allocated_size;
1739 read_unlock_irqrestore(&mft_ni->size_lock, flags); 1741 read_unlock_irqrestore(&mft_ni->size_lock, flags);
1740 rl = ntfs_attr_find_vcn_nolock(mft_ni, 1742 rl = ntfs_attr_find_vcn_nolock(mft_ni,
1741 (ll - 1) >> vol->cluster_size_bits, TRUE); 1743 (ll - 1) >> vol->cluster_size_bits, NULL);
1742 if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) { 1744 if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
1743 up_write(&mft_ni->runlist.lock); 1745 up_write(&mft_ni->runlist.lock);
1744 ntfs_error(vol->sb, "Failed to determine last allocated " 1746 ntfs_error(vol->sb, "Failed to determine last allocated "
@@ -1779,7 +1781,8 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
1779 nr > min_nr ? "default" : "minimal", (long long)nr); 1781 nr > min_nr ? "default" : "minimal", (long long)nr);
1780 old_last_vcn = rl[1].vcn; 1782 old_last_vcn = rl[1].vcn;
1781 do { 1783 do {
1782 rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE); 1784 rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE,
1785 TRUE);
1783 if (likely(!IS_ERR(rl2))) 1786 if (likely(!IS_ERR(rl2)))
1784 break; 1787 break;
1785 if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) { 1788 if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) {
@@ -1951,20 +1954,21 @@ restore_undo_alloc:
1951 NVolSetErrors(vol); 1954 NVolSetErrors(vol);
1952 return ret; 1955 return ret;
1953 } 1956 }
1954 a = ctx->attr; 1957 ctx->attr->data.non_resident.highest_vcn =
1955 a->data.non_resident.highest_vcn = cpu_to_sle64(old_last_vcn - 1); 1958 cpu_to_sle64(old_last_vcn - 1);
1956undo_alloc: 1959undo_alloc:
1957 if (ntfs_cluster_free(mft_ni, old_last_vcn, -1) < 0) { 1960 if (ntfs_cluster_free(mft_ni, old_last_vcn, -1, ctx) < 0) {
1958 ntfs_error(vol->sb, "Failed to free clusters from mft data " 1961 ntfs_error(vol->sb, "Failed to free clusters from mft data "
1959 "attribute.%s", es); 1962 "attribute.%s", es);
1960 NVolSetErrors(vol); 1963 NVolSetErrors(vol);
1961 } 1964 }
1965 a = ctx->attr;
1962 if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) { 1966 if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) {
1963 ntfs_error(vol->sb, "Failed to truncate mft data attribute " 1967 ntfs_error(vol->sb, "Failed to truncate mft data attribute "
1964 "runlist.%s", es); 1968 "runlist.%s", es);
1965 NVolSetErrors(vol); 1969 NVolSetErrors(vol);
1966 } 1970 }
1967 if (mp_rebuilt) { 1971 if (mp_rebuilt && !IS_ERR(ctx->mrec)) {
1968 if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( 1972 if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
1969 a->data.non_resident.mapping_pairs_offset), 1973 a->data.non_resident.mapping_pairs_offset),
1970 old_alen - le16_to_cpu( 1974 old_alen - le16_to_cpu(
@@ -1981,6 +1985,10 @@ undo_alloc:
1981 } 1985 }
1982 flush_dcache_mft_record_page(ctx->ntfs_ino); 1986 flush_dcache_mft_record_page(ctx->ntfs_ino);
1983 mark_mft_record_dirty(ctx->ntfs_ino); 1987 mark_mft_record_dirty(ctx->ntfs_ino);
1988 } else if (IS_ERR(ctx->mrec)) {
1989 ntfs_error(vol->sb, "Failed to restore attribute search "
1990 "context.%s", es);
1991 NVolSetErrors(vol);
1984 } 1992 }
1985 if (ctx) 1993 if (ctx)
1986 ntfs_attr_put_search_ctx(ctx); 1994 ntfs_attr_put_search_ctx(ctx);
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 453d0d51ea4b..6c16db9e1a8a 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1447,7 +1447,7 @@ not_enabled:
1447 if (unlikely(i_size_read(tmp_ino) < sizeof(USN_HEADER))) { 1447 if (unlikely(i_size_read(tmp_ino) < sizeof(USN_HEADER))) {
1448 ntfs_error(vol->sb, "Found corrupt $UsnJrnl/$DATA/$Max " 1448 ntfs_error(vol->sb, "Found corrupt $UsnJrnl/$DATA/$Max "
1449 "attribute (size is 0x%llx but should be at " 1449 "attribute (size is 0x%llx but should be at "
1450 "least 0x%x bytes).", i_size_read(tmp_ino), 1450 "least 0x%zx bytes).", i_size_read(tmp_ino),
1451 sizeof(USN_HEADER)); 1451 sizeof(USN_HEADER));
1452 return FALSE; 1452 return FALSE;
1453 } 1453 }
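The last hunk fixes a printk format: sizeof() yields a size_t, which needs the z length modifier to print portably across 32- and 64-bit targets. A standalone illustration (the struct is a hypothetical stand-in for USN_HEADER):

	#include <stdio.h>

	struct usn_header_sketch {	/* stand-in for USN_HEADER */
		unsigned long long allocated_size;
		unsigned long long data_size;
	};

	int main(void)
	{
		/* %zx is the correct conversion for size_t values such as
		 * the result of sizeof; %x would be wrong on 64-bit targets. */
		printf("need at least 0x%zx bytes\n",
				sizeof(struct usn_header_sketch));
		return 0;
	}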