about summary refs log tree commit diff stats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r-- fs/Kconfig 147
-rw-r--r-- fs/Makefile 1
-rw-r--r-- fs/compat_ioctl.c 6
-rw-r--r-- fs/configfs/configfs_internal.h 4
-rw-r--r-- fs/configfs/dir.c 147
-rw-r--r-- fs/configfs/inode.c 2
-rw-r--r-- fs/configfs/symlink.c 16
-rw-r--r-- fs/dlm/config.c 45
-rw-r--r-- fs/fs-writeback.c 22
-rw-r--r-- fs/lockd/clntproc.c 8
-rw-r--r-- fs/lockd/svc4proc.c 2
-rw-r--r-- fs/lockd/svclock.c 7
-rw-r--r-- fs/lockd/svcproc.c 2
-rw-r--r-- fs/nfs/callback.c 34
-rw-r--r-- fs/nfs/client.c 13
-rw-r--r-- fs/nfs/dir.c 88
-rw-r--r-- fs/nfs/direct.c 4
-rw-r--r-- fs/nfs/file.c 155
-rw-r--r-- fs/nfs/inode.c 79
-rw-r--r-- fs/nfs/internal.h 1
-rw-r--r-- fs/nfs/iostat.h 119
-rw-r--r-- fs/nfs/nfs3acl.c 9
-rw-r--r-- fs/nfs/nfs3proc.c 275
-rw-r--r-- fs/nfs/nfs4proc.c 265
-rw-r--r-- fs/nfs/nfs4state.c 2
-rw-r--r-- fs/nfs/nfsroot.c 10
-rw-r--r-- fs/nfs/proc.c 28
-rw-r--r-- fs/nfs/super.c 882
-rw-r--r-- fs/nfs/write.c 322
-rw-r--r-- fs/nfsd/nfs4callback.c 2
-rw-r--r-- fs/ocfs2/aops.c 13
-rw-r--r-- fs/ocfs2/cluster/heartbeat.c 17
-rw-r--r-- fs/ocfs2/cluster/netdebug.c 8
-rw-r--r-- fs/ocfs2/cluster/nodemanager.c 45
-rw-r--r-- fs/ocfs2/dlmglue.c 122
-rw-r--r-- fs/ocfs2/file.c 2
-rw-r--r-- fs/ocfs2/journal.c 2
-rw-r--r-- fs/ocfs2/localalloc.c 2
-rw-r--r-- fs/ocfs2/ocfs2.h 12
-rw-r--r-- fs/ocfs2/ocfs2_fs.h 2
-rw-r--r-- fs/ocfs2/stack_user.c 19
-rw-r--r-- fs/ocfs2/super.c 6
-rw-r--r-- fs/ubifs/Kconfig 72
-rw-r--r-- fs/ubifs/Makefile 9
-rw-r--r-- fs/ubifs/budget.c 731
-rw-r--r-- fs/ubifs/commit.c 677
-rw-r--r-- fs/ubifs/compress.c 253
-rw-r--r-- fs/ubifs/debug.c 2289
-rw-r--r-- fs/ubifs/debug.h 403
-rw-r--r-- fs/ubifs/dir.c 1240
-rw-r--r-- fs/ubifs/file.c 1275
-rw-r--r-- fs/ubifs/find.c 975
-rw-r--r-- fs/ubifs/gc.c 773
-rw-r--r-- fs/ubifs/io.c 914
-rw-r--r-- fs/ubifs/ioctl.c 204
-rw-r--r-- fs/ubifs/journal.c 1387
-rw-r--r-- fs/ubifs/key.h 533
-rw-r--r-- fs/ubifs/log.c 805
-rw-r--r-- fs/ubifs/lprops.c 1357
-rw-r--r-- fs/ubifs/lpt.c 2243
-rw-r--r-- fs/ubifs/lpt_commit.c 1648
-rw-r--r-- fs/ubifs/master.c 387
-rw-r--r-- fs/ubifs/misc.h 342
-rw-r--r-- fs/ubifs/orphan.c 958
-rw-r--r-- fs/ubifs/recovery.c 1519
-rw-r--r-- fs/ubifs/replay.c 1075
-rw-r--r-- fs/ubifs/sb.c 629
-rw-r--r-- fs/ubifs/scan.c 362
-rw-r--r-- fs/ubifs/shrinker.c 322
-rw-r--r-- fs/ubifs/super.c 1951
-rw-r--r-- fs/ubifs/tnc.c 2956
-rw-r--r-- fs/ubifs/tnc_commit.c 1103
-rw-r--r-- fs/ubifs/tnc_misc.c 494
-rw-r--r-- fs/ubifs/ubifs-media.h 745
-rw-r--r-- fs/ubifs/ubifs.h 1649
-rw-r--r-- fs/ubifs/xattr.c 581
76 files changed, 34639 insertions, 1169 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 313b2e06ded5..37db79a2ff95 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -470,6 +470,14 @@ config OCFS2_FS_USERSPACE_CLUSTER
470 It is safe to say Y, as the clustering method is run-time 470 It is safe to say Y, as the clustering method is run-time
471 selectable. 471 selectable.
472 472
473config OCFS2_FS_STATS
474 bool "OCFS2 statistics"
475 depends on OCFS2_FS
476 default y
477 help
478 This option allows some fs statistics to be captured. Enabling
479 this option may increase the memory consumption.
480
473config OCFS2_DEBUG_MASKLOG 481config OCFS2_DEBUG_MASKLOG
474 bool "OCFS2 logging support" 482 bool "OCFS2 logging support"
475 depends on OCFS2_FS 483 depends on OCFS2_FS
@@ -1375,6 +1383,9 @@ config JFFS2_CMODE_FAVOURLZO
1375 1383
1376endchoice 1384endchoice
1377 1385
1386# UBIFS File system configuration
1387source "fs/ubifs/Kconfig"
1388
1378config CRAMFS 1389config CRAMFS
1379 tristate "Compressed ROM file system support (cramfs)" 1390 tristate "Compressed ROM file system support (cramfs)"
1380 depends on BLOCK 1391 depends on BLOCK
@@ -1544,10 +1555,6 @@ config UFS_FS
1544 The recently released UFS2 variant (used in FreeBSD 5.x) is 1555 The recently released UFS2 variant (used in FreeBSD 5.x) is
1545 READ-ONLY supported. 1556 READ-ONLY supported.
1546 1557
1547 If you only intend to mount files from some other Unix over the
1548 network using NFS, you don't need the UFS file system support (but
1549 you need NFS file system support obviously).
1550
1551 Note that this option is generally not needed for floppies, since a 1558 Note that this option is generally not needed for floppies, since a
1552 good portable way to transport files and directories between unixes 1559 good portable way to transport files and directories between unixes
1553 (and even other operating systems) is given by the tar program ("man 1560 (and even other operating systems) is given by the tar program ("man
@@ -1587,6 +1594,7 @@ menuconfig NETWORK_FILESYSTEMS
1587 Say Y here to get to see options for network filesystems and 1594 Say Y here to get to see options for network filesystems and
1588 filesystem-related networking code, such as NFS daemon and 1595 filesystem-related networking code, such as NFS daemon and
1589 RPCSEC security modules. 1596 RPCSEC security modules.
1597
1590 This option alone does not add any kernel code. 1598 This option alone does not add any kernel code.
1591 1599
1592 If you say N, all options in this submenu will be skipped and 1600 If you say N, all options in this submenu will be skipped and
@@ -1595,76 +1603,92 @@ menuconfig NETWORK_FILESYSTEMS
1595if NETWORK_FILESYSTEMS 1603if NETWORK_FILESYSTEMS
1596 1604
1597config NFS_FS 1605config NFS_FS
1598 tristate "NFS file system support" 1606 tristate "NFS client support"
1599 depends on INET 1607 depends on INET
1600 select LOCKD 1608 select LOCKD
1601 select SUNRPC 1609 select SUNRPC
1602 select NFS_ACL_SUPPORT if NFS_V3_ACL 1610 select NFS_ACL_SUPPORT if NFS_V3_ACL
1603 help 1611 help
1604 If you are connected to some other (usually local) Unix computer 1612 Choose Y here if you want to access files residing on other
1605 (using SLIP, PLIP, PPP or Ethernet) and want to mount files residing 1613 computers using Sun's Network File System protocol. To compile
1606 on that computer (the NFS server) using the Network File Sharing 1614 this file system support as a module, choose M here: the module
1607 protocol, say Y. "Mounting files" means that the client can access 1615 will be called nfs.
1608 the files with usual UNIX commands as if they were sitting on the
1609 client's hard disk. For this to work, the server must run the
1610 programs nfsd and mountd (but does not need to have NFS file system
1611 support enabled in its kernel). NFS is explained in the Network
1612 Administrator's Guide, available from
1613 <http://www.tldp.org/docs.html#guide>, on its man page: "man
1614 nfs", and in the NFS-HOWTO.
1615
1616 A superior but less widely used alternative to NFS is provided by
1617 the Coda file system; see "Coda file system support" below.
1618 1616
1619 If you say Y here, you should have said Y to TCP/IP networking also. 1617 To mount file systems exported by NFS servers, you also need to
1620 This option would enlarge your kernel by about 27 KB. 1618 install the user space mount.nfs command which can be found in
1619 the Linux nfs-utils package, available from http://linux-nfs.org/.
1620 Information about using the mount command is available in the
1621 mount(8) man page. More detail about the Linux NFS client
1622 implementation is available via the nfs(5) man page.
1621 1623
1622 To compile this file system support as a module, choose M here: the 1624 Below you can choose which versions of the NFS protocol are
1623 module will be called nfs. 1625 available in the kernel to mount NFS servers. Support for NFS
1626 version 2 (RFC 1094) is always available when NFS_FS is selected.
1624 1627
1625 If you are configuring a diskless machine which will mount its root 1628 To configure a system which mounts its root file system via NFS
1626 file system over NFS at boot time, say Y here and to "Kernel 1629 at boot time, say Y here, select "Kernel level IP
1627 level IP autoconfiguration" above and to "Root file system on NFS" 1630 autoconfiguration" in the NETWORK menu, and select "Root file
1628 below. You cannot compile this driver as a module in this case. 1631 system on NFS" below. You cannot compile this file system as a
1629 There are two packages designed for booting diskless machines over 1632 module in this case.
1630 the net: netboot, available from
1631 <http://ftp1.sourceforge.net/netboot/>, and Etherboot,
1632 available from <http://ftp1.sourceforge.net/etherboot/>.
1633 1633
1634 If you don't know what all this is about, say N. 1634 If unsure, say N.
1635 1635
1636config NFS_V3 1636config NFS_V3
1637 bool "Provide NFSv3 client support" 1637 bool "NFS client support for NFS version 3"
1638 depends on NFS_FS 1638 depends on NFS_FS
1639 help 1639 help
1640 Say Y here if you want your NFS client to be able to speak version 1640 This option enables support for version 3 of the NFS protocol
1641 3 of the NFS protocol. 1641 (RFC 1813) in the kernel's NFS client.
1642 1642
1643 If unsure, say Y. 1643 If unsure, say Y.
1644 1644
1645config NFS_V3_ACL 1645config NFS_V3_ACL
1646 bool "Provide client support for the NFSv3 ACL protocol extension" 1646 bool "NFS client support for the NFSv3 ACL protocol extension"
1647 depends on NFS_V3 1647 depends on NFS_V3
1648 help 1648 help
1649 Implement the NFSv3 ACL protocol extension for manipulating POSIX 1649 Some NFS servers support an auxiliary NFSv3 ACL protocol that
1650 Access Control Lists. The server should also be compiled with 1650 Sun added to Solaris but never became an official part of the
1651 the NFSv3 ACL protocol extension; see the CONFIG_NFSD_V3_ACL option. 1651 NFS version 3 protocol. This protocol extension allows
1652 applications on NFS clients to manipulate POSIX Access Control
1653 Lists on files residing on NFS servers. NFS servers enforce
1654 ACLs on local files whether this protocol is available or not.
1655
1656 Choose Y here if your NFS server supports the Solaris NFSv3 ACL
1657 protocol extension and you want your NFS client to allow
1658 applications to access and modify ACLs on files on the server.
1659
1660 Most NFS servers don't support the Solaris NFSv3 ACL protocol
1661 extension. You can choose N here or specify the "noacl" mount
1662 option to prevent your NFS client from trying to use the NFSv3
1663 ACL protocol.
1652 1664
1653 If unsure, say N. 1665 If unsure, say N.
1654 1666
1655config NFS_V4 1667config NFS_V4
1656 bool "Provide NFSv4 client support (EXPERIMENTAL)" 1668 bool "NFS client support for NFS version 4 (EXPERIMENTAL)"
1657 depends on NFS_FS && EXPERIMENTAL 1669 depends on NFS_FS && EXPERIMENTAL
1658 select RPCSEC_GSS_KRB5 1670 select RPCSEC_GSS_KRB5
1659 help 1671 help
1660 Say Y here if you want your NFS client to be able to speak the newer 1672 This option enables support for version 4 of the NFS protocol
1661 version 4 of the NFS protocol. 1673 (RFC 3530) in the kernel's NFS client.
1662 1674
1663 Note: Requires auxiliary userspace daemons which may be found on 1675 To mount NFS servers using NFSv4, you also need to install user
1664 http://www.citi.umich.edu/projects/nfsv4/ 1676 space programs which can be found in the Linux nfs-utils package,
1677 available from http://linux-nfs.org/.
1665 1678
1666 If unsure, say N. 1679 If unsure, say N.
1667 1680
1681config ROOT_NFS
1682 bool "Root file system on NFS"
1683 depends on NFS_FS=y && IP_PNP
1684 help
1685 If you want your system to mount its root file system via NFS,
1686 choose Y here. This is common practice for managing systems
1687 without local permanent storage. For details, read
1688 <file:Documentation/filesystems/nfsroot.txt>.
1689
1690 Most people say N here.
1691
1668config NFSD 1692config NFSD
1669 tristate "NFS server support" 1693 tristate "NFS server support"
1670 depends on INET 1694 depends on INET
@@ -1746,20 +1770,6 @@ config NFSD_V4
1746 1770
1747 If unsure, say N. 1771 If unsure, say N.
1748 1772
1749config ROOT_NFS
1750 bool "Root file system on NFS"
1751 depends on NFS_FS=y && IP_PNP
1752 help
1753 If you want your Linux box to mount its whole root file system (the
1754 one containing the directory /) from some other computer over the
1755 net via NFS (presumably because your box doesn't have a hard disk),
1756 say Y. Read <file:Documentation/filesystems/nfsroot.txt> for
1757 details. It is likely that in this case, you also want to say Y to
1758 "Kernel level IP autoconfiguration" so that your box can discover
1759 its network address at boot time.
1760
1761 Most people say N here.
1762
1763config LOCKD 1773config LOCKD
1764 tristate 1774 tristate
1765 1775
@@ -1800,27 +1810,6 @@ config SUNRPC_XPRT_RDMA
1800 1810
1801 If unsure, say N. 1811 If unsure, say N.
1802 1812
1803config SUNRPC_BIND34
1804 bool "Support for rpcbind versions 3 & 4 (EXPERIMENTAL)"
1805 depends on SUNRPC && EXPERIMENTAL
1806 default n
1807 help
1808 RPC requests over IPv6 networks require support for larger
1809 addresses when performing an RPC bind. Sun added support for
1810 IPv6 addressing by creating two new versions of the rpcbind
1811 protocol (RFC 1833).
1812
1813 This option enables support in the kernel RPC client for
1814 querying rpcbind servers via versions 3 and 4 of the rpcbind
1815 protocol. The kernel automatically falls back to version 2
1816 if a remote rpcbind service does not support versions 3 or 4.
1817 By themselves, these new versions do not provide support for
1818 RPC over IPv6, but the new protocol versions are necessary to
1819 support it.
1820
1821 If unsure, say N to get traditional behavior (version 2 rpcbind
1822 requests only).
1823
1824config RPCSEC_GSS_KRB5 1813config RPCSEC_GSS_KRB5
1825 tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)" 1814 tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
1826 depends on SUNRPC && EXPERIMENTAL 1815 depends on SUNRPC && EXPERIMENTAL
diff --git a/fs/Makefile b/fs/Makefile
index 277b079dec9e..3b2178b4bb66 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -101,6 +101,7 @@ obj-$(CONFIG_NTFS_FS) += ntfs/
101obj-$(CONFIG_UFS_FS) += ufs/ 101obj-$(CONFIG_UFS_FS) += ufs/
102obj-$(CONFIG_EFS_FS) += efs/ 102obj-$(CONFIG_EFS_FS) += efs/
103obj-$(CONFIG_JFFS2_FS) += jffs2/ 103obj-$(CONFIG_JFFS2_FS) += jffs2/
104obj-$(CONFIG_UBIFS_FS) += ubifs/
104obj-$(CONFIG_AFFS_FS) += affs/ 105obj-$(CONFIG_AFFS_FS) += affs/
105obj-$(CONFIG_ROMFS_FS) += romfs/ 106obj-$(CONFIG_ROMFS_FS) += romfs/
106obj-$(CONFIG_QNX4FS_FS) += qnx4/ 107obj-$(CONFIG_QNX4FS_FS) += qnx4/
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 97dba0d92348..c54eaab71a19 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -69,9 +69,11 @@
69#include <linux/capi.h> 69#include <linux/capi.h>
70#include <linux/gigaset_dev.h> 70#include <linux/gigaset_dev.h>
71 71
72#ifdef CONFIG_BLOCK
72#include <scsi/scsi.h> 73#include <scsi/scsi.h>
73#include <scsi/scsi_ioctl.h> 74#include <scsi/scsi_ioctl.h>
74#include <scsi/sg.h> 75#include <scsi/sg.h>
76#endif
75 77
76#include <asm/uaccess.h> 78#include <asm/uaccess.h>
77#include <linux/ethtool.h> 79#include <linux/ethtool.h>
@@ -2024,6 +2026,7 @@ COMPATIBLE_IOCTL(GIO_UNISCRNMAP)
2024COMPATIBLE_IOCTL(PIO_UNISCRNMAP) 2026COMPATIBLE_IOCTL(PIO_UNISCRNMAP)
2025COMPATIBLE_IOCTL(PIO_FONTRESET) 2027COMPATIBLE_IOCTL(PIO_FONTRESET)
2026COMPATIBLE_IOCTL(PIO_UNIMAPCLR) 2028COMPATIBLE_IOCTL(PIO_UNIMAPCLR)
2029#ifdef CONFIG_BLOCK
2027/* Big S */ 2030/* Big S */
2028COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN) 2031COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN)
2029COMPATIBLE_IOCTL(SCSI_IOCTL_DOORLOCK) 2032COMPATIBLE_IOCTL(SCSI_IOCTL_DOORLOCK)
@@ -2033,6 +2036,7 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_GET_BUS_NUMBER)
2033COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND) 2036COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND)
2034COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST) 2037COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST)
2035COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI) 2038COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI)
2039#endif
2036/* Big T */ 2040/* Big T */
2037COMPATIBLE_IOCTL(TUNSETNOCSUM) 2041COMPATIBLE_IOCTL(TUNSETNOCSUM)
2038COMPATIBLE_IOCTL(TUNSETDEBUG) 2042COMPATIBLE_IOCTL(TUNSETDEBUG)
@@ -2103,6 +2107,7 @@ COMPATIBLE_IOCTL(SIOCGIFVLAN)
2103COMPATIBLE_IOCTL(SIOCSIFVLAN) 2107COMPATIBLE_IOCTL(SIOCSIFVLAN)
2104COMPATIBLE_IOCTL(SIOCBRADDBR) 2108COMPATIBLE_IOCTL(SIOCBRADDBR)
2105COMPATIBLE_IOCTL(SIOCBRDELBR) 2109COMPATIBLE_IOCTL(SIOCBRDELBR)
2110#ifdef CONFIG_BLOCK
2106/* SG stuff */ 2111/* SG stuff */
2107COMPATIBLE_IOCTL(SG_SET_TIMEOUT) 2112COMPATIBLE_IOCTL(SG_SET_TIMEOUT)
2108COMPATIBLE_IOCTL(SG_GET_TIMEOUT) 2113COMPATIBLE_IOCTL(SG_GET_TIMEOUT)
@@ -2127,6 +2132,7 @@ COMPATIBLE_IOCTL(SG_SCSI_RESET)
2127COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE) 2132COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE)
2128COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN) 2133COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN)
2129COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN) 2134COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN)
2135#endif
2130/* PPP stuff */ 2136/* PPP stuff */
2131COMPATIBLE_IOCTL(PPPIOCGFLAGS) 2137COMPATIBLE_IOCTL(PPPIOCGFLAGS)
2132COMPATIBLE_IOCTL(PPPIOCSFLAGS) 2138COMPATIBLE_IOCTL(PPPIOCSFLAGS)
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index cca98609aa7f..da015c12e3ea 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -26,6 +26,7 @@
26 26
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/list.h> 28#include <linux/list.h>
29#include <linux/spinlock.h>
29 30
30struct configfs_dirent { 31struct configfs_dirent {
31 atomic_t s_count; 32 atomic_t s_count;
@@ -47,8 +48,11 @@ struct configfs_dirent {
47#define CONFIGFS_USET_DIR 0x0040 48#define CONFIGFS_USET_DIR 0x0040
48#define CONFIGFS_USET_DEFAULT 0x0080 49#define CONFIGFS_USET_DEFAULT 0x0080
49#define CONFIGFS_USET_DROPPING 0x0100 50#define CONFIGFS_USET_DROPPING 0x0100
51#define CONFIGFS_USET_IN_MKDIR 0x0200
50#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR) 52#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR)
51 53
54extern spinlock_t configfs_dirent_lock;
55
52extern struct vfsmount * configfs_mount; 56extern struct vfsmount * configfs_mount;
53extern struct kmem_cache *configfs_dir_cachep; 57extern struct kmem_cache *configfs_dir_cachep;
54 58
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index a48dc7dd8765..0e64312a084c 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -30,11 +30,25 @@
30#include <linux/mount.h> 30#include <linux/mount.h>
31#include <linux/module.h> 31#include <linux/module.h>
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/err.h>
33 34
34#include <linux/configfs.h> 35#include <linux/configfs.h>
35#include "configfs_internal.h" 36#include "configfs_internal.h"
36 37
37DECLARE_RWSEM(configfs_rename_sem); 38DECLARE_RWSEM(configfs_rename_sem);
39/*
40 * Protects mutations of configfs_dirent linkage together with proper i_mutex
41 * Also protects mutations of symlinks linkage to target configfs_dirent
42 * Mutators of configfs_dirent linkage must *both* have the proper inode locked
43 * and configfs_dirent_lock locked, in that order.
44 * This allows one to safely traverse configfs_dirent trees and symlinks without
45 * having to lock inodes.
46 *
47 * Protects setting of CONFIGFS_USET_DROPPING: checking the flag
48 * unlocked is not reliable unless in detach_groups() called from
49 * rmdir()/unregister() and from configfs_attach_group()
50 */
51DEFINE_SPINLOCK(configfs_dirent_lock);
38 52
39static void configfs_d_iput(struct dentry * dentry, 53static void configfs_d_iput(struct dentry * dentry,
40 struct inode * inode) 54 struct inode * inode)
@@ -74,13 +88,20 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare
74 88
75 sd = kmem_cache_zalloc(configfs_dir_cachep, GFP_KERNEL); 89 sd = kmem_cache_zalloc(configfs_dir_cachep, GFP_KERNEL);
76 if (!sd) 90 if (!sd)
77 return NULL; 91 return ERR_PTR(-ENOMEM);
78 92
79 atomic_set(&sd->s_count, 1); 93 atomic_set(&sd->s_count, 1);
80 INIT_LIST_HEAD(&sd->s_links); 94 INIT_LIST_HEAD(&sd->s_links);
81 INIT_LIST_HEAD(&sd->s_children); 95 INIT_LIST_HEAD(&sd->s_children);
82 list_add(&sd->s_sibling, &parent_sd->s_children);
83 sd->s_element = element; 96 sd->s_element = element;
97 spin_lock(&configfs_dirent_lock);
98 if (parent_sd->s_type & CONFIGFS_USET_DROPPING) {
99 spin_unlock(&configfs_dirent_lock);
100 kmem_cache_free(configfs_dir_cachep, sd);
101 return ERR_PTR(-ENOENT);
102 }
103 list_add(&sd->s_sibling, &parent_sd->s_children);
104 spin_unlock(&configfs_dirent_lock);
84 105
85 return sd; 106 return sd;
86} 107}
@@ -118,8 +139,8 @@ int configfs_make_dirent(struct configfs_dirent * parent_sd,
118 struct configfs_dirent * sd; 139 struct configfs_dirent * sd;
119 140
120 sd = configfs_new_dirent(parent_sd, element); 141 sd = configfs_new_dirent(parent_sd, element);
121 if (!sd) 142 if (IS_ERR(sd))
122 return -ENOMEM; 143 return PTR_ERR(sd);
123 144
124 sd->s_mode = mode; 145 sd->s_mode = mode;
125 sd->s_type = type; 146 sd->s_type = type;
@@ -173,7 +194,9 @@ static int create_dir(struct config_item * k, struct dentry * p,
173 } else { 194 } else {
174 struct configfs_dirent *sd = d->d_fsdata; 195 struct configfs_dirent *sd = d->d_fsdata;
175 if (sd) { 196 if (sd) {
197 spin_lock(&configfs_dirent_lock);
176 list_del_init(&sd->s_sibling); 198 list_del_init(&sd->s_sibling);
199 spin_unlock(&configfs_dirent_lock);
177 configfs_put(sd); 200 configfs_put(sd);
178 } 201 }
179 } 202 }
@@ -224,7 +247,9 @@ int configfs_create_link(struct configfs_symlink *sl,
224 else { 247 else {
225 struct configfs_dirent *sd = dentry->d_fsdata; 248 struct configfs_dirent *sd = dentry->d_fsdata;
226 if (sd) { 249 if (sd) {
250 spin_lock(&configfs_dirent_lock);
227 list_del_init(&sd->s_sibling); 251 list_del_init(&sd->s_sibling);
252 spin_unlock(&configfs_dirent_lock);
228 configfs_put(sd); 253 configfs_put(sd);
229 } 254 }
230 } 255 }
@@ -238,7 +263,9 @@ static void remove_dir(struct dentry * d)
238 struct configfs_dirent * sd; 263 struct configfs_dirent * sd;
239 264
240 sd = d->d_fsdata; 265 sd = d->d_fsdata;
266 spin_lock(&configfs_dirent_lock);
241 list_del_init(&sd->s_sibling); 267 list_del_init(&sd->s_sibling);
268 spin_unlock(&configfs_dirent_lock);
242 configfs_put(sd); 269 configfs_put(sd);
243 if (d->d_inode) 270 if (d->d_inode)
244 simple_rmdir(parent->d_inode,d); 271 simple_rmdir(parent->d_inode,d);
@@ -331,13 +358,13 @@ static struct dentry * configfs_lookup(struct inode *dir,
331 358
332/* 359/*
333 * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are 360 * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are
334 * attributes and are removed by rmdir(). We recurse, taking i_mutex 361 * attributes and are removed by rmdir(). We recurse, setting
335 * on all children that are candidates for default detach. If the 362 * CONFIGFS_USET_DROPPING on all children that are candidates for
336 * result is clean, then configfs_detach_group() will handle dropping 363 * default detach.
337 * i_mutex. If there is an error, the caller will clean up the i_mutex 364 * If there is an error, the caller will reset the flags via
338 * holders via configfs_detach_rollback(). 365 * configfs_detach_rollback().
339 */ 366 */
340static int configfs_detach_prep(struct dentry *dentry) 367static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex)
341{ 368{
342 struct configfs_dirent *parent_sd = dentry->d_fsdata; 369 struct configfs_dirent *parent_sd = dentry->d_fsdata;
343 struct configfs_dirent *sd; 370 struct configfs_dirent *sd;
@@ -352,15 +379,20 @@ static int configfs_detach_prep(struct dentry *dentry)
352 if (sd->s_type & CONFIGFS_NOT_PINNED) 379 if (sd->s_type & CONFIGFS_NOT_PINNED)
353 continue; 380 continue;
354 if (sd->s_type & CONFIGFS_USET_DEFAULT) { 381 if (sd->s_type & CONFIGFS_USET_DEFAULT) {
355 mutex_lock(&sd->s_dentry->d_inode->i_mutex); 382 /* Abort if racing with mkdir() */
356 /* Mark that we've taken i_mutex */ 383 if (sd->s_type & CONFIGFS_USET_IN_MKDIR) {
384 if (wait_mutex)
385 *wait_mutex = &sd->s_dentry->d_inode->i_mutex;
386 return -EAGAIN;
387 }
388 /* Mark that we're trying to drop the group */
357 sd->s_type |= CONFIGFS_USET_DROPPING; 389 sd->s_type |= CONFIGFS_USET_DROPPING;
358 390
359 /* 391 /*
360 * Yup, recursive. If there's a problem, blame 392 * Yup, recursive. If there's a problem, blame
361 * deep nesting of default_groups 393 * deep nesting of default_groups
362 */ 394 */
363 ret = configfs_detach_prep(sd->s_dentry); 395 ret = configfs_detach_prep(sd->s_dentry, wait_mutex);
364 if (!ret) 396 if (!ret)
365 continue; 397 continue;
366 } else 398 } else
@@ -374,7 +406,7 @@ out:
374} 406}
375 407
376/* 408/*
377 * Walk the tree, dropping i_mutex wherever CONFIGFS_USET_DROPPING is 409 * Walk the tree, resetting CONFIGFS_USET_DROPPING wherever it was
378 * set. 410 * set.
379 */ 411 */
380static void configfs_detach_rollback(struct dentry *dentry) 412static void configfs_detach_rollback(struct dentry *dentry)
@@ -385,11 +417,7 @@ static void configfs_detach_rollback(struct dentry *dentry)
385 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { 417 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
386 if (sd->s_type & CONFIGFS_USET_DEFAULT) { 418 if (sd->s_type & CONFIGFS_USET_DEFAULT) {
387 configfs_detach_rollback(sd->s_dentry); 419 configfs_detach_rollback(sd->s_dentry);
388 420 sd->s_type &= ~CONFIGFS_USET_DROPPING;
389 if (sd->s_type & CONFIGFS_USET_DROPPING) {
390 sd->s_type &= ~CONFIGFS_USET_DROPPING;
391 mutex_unlock(&sd->s_dentry->d_inode->i_mutex);
392 }
393 } 421 }
394 } 422 }
395} 423}
@@ -410,7 +438,9 @@ static void detach_attrs(struct config_item * item)
410 list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { 438 list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
411 if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED)) 439 if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED))
412 continue; 440 continue;
441 spin_lock(&configfs_dirent_lock);
413 list_del_init(&sd->s_sibling); 442 list_del_init(&sd->s_sibling);
443 spin_unlock(&configfs_dirent_lock);
414 configfs_drop_dentry(sd, dentry); 444 configfs_drop_dentry(sd, dentry);
415 configfs_put(sd); 445 configfs_put(sd);
416 } 446 }
@@ -466,16 +496,12 @@ static void detach_groups(struct config_group *group)
466 496
467 child = sd->s_dentry; 497 child = sd->s_dentry;
468 498
499 mutex_lock(&child->d_inode->i_mutex);
500
469 configfs_detach_group(sd->s_element); 501 configfs_detach_group(sd->s_element);
470 child->d_inode->i_flags |= S_DEAD; 502 child->d_inode->i_flags |= S_DEAD;
471 503
472 /* 504 mutex_unlock(&child->d_inode->i_mutex);
473 * From rmdir/unregister, a configfs_detach_prep() pass
474 * has taken our i_mutex for us. Drop it.
475 * From mkdir/register cleanup, there is no sem held.
476 */
477 if (sd->s_type & CONFIGFS_USET_DROPPING)
478 mutex_unlock(&child->d_inode->i_mutex);
479 505
480 d_delete(child); 506 d_delete(child);
481 dput(child); 507 dput(child);
@@ -1047,25 +1073,24 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1047 group = NULL; 1073 group = NULL;
1048 item = NULL; 1074 item = NULL;
1049 if (type->ct_group_ops->make_group) { 1075 if (type->ct_group_ops->make_group) {
1050 group = type->ct_group_ops->make_group(to_config_group(parent_item), name); 1076 ret = type->ct_group_ops->make_group(to_config_group(parent_item), name, &group);
1051 if (group) { 1077 if (!ret) {
1052 link_group(to_config_group(parent_item), group); 1078 link_group(to_config_group(parent_item), group);
1053 item = &group->cg_item; 1079 item = &group->cg_item;
1054 } 1080 }
1055 } else { 1081 } else {
1056 item = type->ct_group_ops->make_item(to_config_group(parent_item), name); 1082 ret = type->ct_group_ops->make_item(to_config_group(parent_item), name, &item);
1057 if (item) 1083 if (!ret)
1058 link_obj(parent_item, item); 1084 link_obj(parent_item, item);
1059 } 1085 }
1060 mutex_unlock(&subsys->su_mutex); 1086 mutex_unlock(&subsys->su_mutex);
1061 1087
1062 kfree(name); 1088 kfree(name);
1063 if (!item) { 1089 if (ret) {
1064 /* 1090 /*
1065 * If item == NULL, then link_obj() was never called. 1091 * If ret != 0, then link_obj() was never called.
1066 * There are no extra references to clean up. 1092 * There are no extra references to clean up.
1067 */ 1093 */
1068 ret = -ENOMEM;
1069 goto out_put; 1094 goto out_put;
1070 } 1095 }
1071 1096
@@ -1093,11 +1118,26 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1093 */ 1118 */
1094 module_got = 1; 1119 module_got = 1;
1095 1120
1121 /*
1122 * Make racing rmdir() fail if it did not tag parent with
1123 * CONFIGFS_USET_DROPPING
1124 * Note: if CONFIGFS_USET_DROPPING is already set, attach_group() will
1125 * fail and let rmdir() terminate correctly
1126 */
1127 spin_lock(&configfs_dirent_lock);
1128 /* This will make configfs_detach_prep() fail */
1129 sd->s_type |= CONFIGFS_USET_IN_MKDIR;
1130 spin_unlock(&configfs_dirent_lock);
1131
1096 if (group) 1132 if (group)
1097 ret = configfs_attach_group(parent_item, item, dentry); 1133 ret = configfs_attach_group(parent_item, item, dentry);
1098 else 1134 else
1099 ret = configfs_attach_item(parent_item, item, dentry); 1135 ret = configfs_attach_item(parent_item, item, dentry);
1100 1136
1137 spin_lock(&configfs_dirent_lock);
1138 sd->s_type &= ~CONFIGFS_USET_IN_MKDIR;
1139 spin_unlock(&configfs_dirent_lock);
1140
1101out_unlink: 1141out_unlink:
1102 if (ret) { 1142 if (ret) {
1103 /* Tear down everything we built up */ 1143 /* Tear down everything we built up */
@@ -1161,12 +1201,27 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
1161 return -EINVAL; 1201 return -EINVAL;
1162 } 1202 }
1163 1203
1164 ret = configfs_detach_prep(dentry); 1204 spin_lock(&configfs_dirent_lock);
1165 if (ret) { 1205 do {
1166 configfs_detach_rollback(dentry); 1206 struct mutex *wait_mutex;
1167 config_item_put(parent_item); 1207
1168 return ret; 1208 ret = configfs_detach_prep(dentry, &wait_mutex);
1169 } 1209 if (ret) {
1210 configfs_detach_rollback(dentry);
1211 spin_unlock(&configfs_dirent_lock);
1212 if (ret != -EAGAIN) {
1213 config_item_put(parent_item);
1214 return ret;
1215 }
1216
1217 /* Wait until the racing operation terminates */
1218 mutex_lock(wait_mutex);
1219 mutex_unlock(wait_mutex);
1220
1221 spin_lock(&configfs_dirent_lock);
1222 }
1223 } while (ret == -EAGAIN);
1224 spin_unlock(&configfs_dirent_lock);
1170 1225
1171 /* Get a working ref for the duration of this function */ 1226 /* Get a working ref for the duration of this function */
1172 item = configfs_get_config_item(dentry); 1227 item = configfs_get_config_item(dentry);
@@ -1258,7 +1313,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
1258 file->private_data = configfs_new_dirent(parent_sd, NULL); 1313 file->private_data = configfs_new_dirent(parent_sd, NULL);
1259 mutex_unlock(&dentry->d_inode->i_mutex); 1314 mutex_unlock(&dentry->d_inode->i_mutex);
1260 1315
1261 return file->private_data ? 0 : -ENOMEM; 1316 return IS_ERR(file->private_data) ? PTR_ERR(file->private_data) : 0;
1262 1317
1263} 1318}
1264 1319
@@ -1268,7 +1323,9 @@ static int configfs_dir_close(struct inode *inode, struct file *file)
1268 struct configfs_dirent * cursor = file->private_data; 1323 struct configfs_dirent * cursor = file->private_data;
1269 1324
1270 mutex_lock(&dentry->d_inode->i_mutex); 1325 mutex_lock(&dentry->d_inode->i_mutex);
1326 spin_lock(&configfs_dirent_lock);
1271 list_del_init(&cursor->s_sibling); 1327 list_del_init(&cursor->s_sibling);
1328 spin_unlock(&configfs_dirent_lock);
1272 mutex_unlock(&dentry->d_inode->i_mutex); 1329 mutex_unlock(&dentry->d_inode->i_mutex);
1273 1330
1274 release_configfs_dirent(cursor); 1331 release_configfs_dirent(cursor);
@@ -1308,7 +1365,9 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir
1308 /* fallthrough */ 1365 /* fallthrough */
1309 default: 1366 default:
1310 if (filp->f_pos == 2) { 1367 if (filp->f_pos == 2) {
1368 spin_lock(&configfs_dirent_lock);
1311 list_move(q, &parent_sd->s_children); 1369 list_move(q, &parent_sd->s_children);
1370 spin_unlock(&configfs_dirent_lock);
1312 } 1371 }
1313 for (p=q->next; p!= &parent_sd->s_children; p=p->next) { 1372 for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
1314 struct configfs_dirent *next; 1373 struct configfs_dirent *next;
@@ -1331,7 +1390,9 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir
1331 dt_type(next)) < 0) 1390 dt_type(next)) < 0)
1332 return 0; 1391 return 0;
1333 1392
1393 spin_lock(&configfs_dirent_lock);
1334 list_move(q, p); 1394 list_move(q, p);
1395 spin_unlock(&configfs_dirent_lock);
1335 p = q; 1396 p = q;
1336 filp->f_pos++; 1397 filp->f_pos++;
1337 } 1398 }
@@ -1362,6 +1423,7 @@ static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin)
1362 struct list_head *p; 1423 struct list_head *p;
1363 loff_t n = file->f_pos - 2; 1424 loff_t n = file->f_pos - 2;
1364 1425
1426 spin_lock(&configfs_dirent_lock);
1365 list_del(&cursor->s_sibling); 1427 list_del(&cursor->s_sibling);
1366 p = sd->s_children.next; 1428 p = sd->s_children.next;
1367 while (n && p != &sd->s_children) { 1429 while (n && p != &sd->s_children) {
@@ -1373,6 +1435,7 @@ static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin)
1373 p = p->next; 1435 p = p->next;
1374 } 1436 }
1375 list_add_tail(&cursor->s_sibling, p); 1437 list_add_tail(&cursor->s_sibling, p);
1438 spin_unlock(&configfs_dirent_lock);
1376 } 1439 }
1377 } 1440 }
1378 mutex_unlock(&dentry->d_inode->i_mutex); 1441 mutex_unlock(&dentry->d_inode->i_mutex);
@@ -1448,9 +1511,11 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
1448 mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex, 1511 mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex,
1449 I_MUTEX_PARENT); 1512 I_MUTEX_PARENT);
1450 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); 1513 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
1451 if (configfs_detach_prep(dentry)) { 1514 spin_lock(&configfs_dirent_lock);
1515 if (configfs_detach_prep(dentry, NULL)) {
1452 printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n"); 1516 printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n");
1453 } 1517 }
1518 spin_unlock(&configfs_dirent_lock);
1454 configfs_detach_group(&group->cg_item); 1519 configfs_detach_group(&group->cg_item);
1455 dentry->d_inode->i_flags |= S_DEAD; 1520 dentry->d_inode->i_flags |= S_DEAD;
1456 mutex_unlock(&dentry->d_inode->i_mutex); 1521 mutex_unlock(&dentry->d_inode->i_mutex);
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index b9a1d810346d..4803ccc94480 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -247,7 +247,9 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
247 if (!sd->s_element) 247 if (!sd->s_element)
248 continue; 248 continue;
249 if (!strcmp(configfs_get_name(sd), name)) { 249 if (!strcmp(configfs_get_name(sd), name)) {
250 spin_lock(&configfs_dirent_lock);
250 list_del_init(&sd->s_sibling); 251 list_del_init(&sd->s_sibling);
252 spin_unlock(&configfs_dirent_lock);
251 configfs_drop_dentry(sd, dir); 253 configfs_drop_dentry(sd, dir);
252 configfs_put(sd); 254 configfs_put(sd);
253 break; 255 break;
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 2a731ef5f305..0004d18c40ac 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -77,12 +77,15 @@ static int create_link(struct config_item *parent_item,
77 sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL); 77 sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL);
78 if (sl) { 78 if (sl) {
79 sl->sl_target = config_item_get(item); 79 sl->sl_target = config_item_get(item);
80 /* FIXME: needs a lock, I'd bet */ 80 spin_lock(&configfs_dirent_lock);
81 list_add(&sl->sl_list, &target_sd->s_links); 81 list_add(&sl->sl_list, &target_sd->s_links);
82 spin_unlock(&configfs_dirent_lock);
82 ret = configfs_create_link(sl, parent_item->ci_dentry, 83 ret = configfs_create_link(sl, parent_item->ci_dentry,
83 dentry); 84 dentry);
84 if (ret) { 85 if (ret) {
86 spin_lock(&configfs_dirent_lock);
85 list_del_init(&sl->sl_list); 87 list_del_init(&sl->sl_list);
88 spin_unlock(&configfs_dirent_lock);
86 config_item_put(item); 89 config_item_put(item);
87 kfree(sl); 90 kfree(sl);
88 } 91 }
@@ -137,8 +140,12 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
137 goto out_put; 140 goto out_put;
138 141
139 ret = type->ct_item_ops->allow_link(parent_item, target_item); 142 ret = type->ct_item_ops->allow_link(parent_item, target_item);
140 if (!ret) 143 if (!ret) {
141 ret = create_link(parent_item, target_item, dentry); 144 ret = create_link(parent_item, target_item, dentry);
145 if (ret && type->ct_item_ops->drop_link)
146 type->ct_item_ops->drop_link(parent_item,
147 target_item);
148 }
142 149
143 config_item_put(target_item); 150 config_item_put(target_item);
144 path_put(&nd.path); 151 path_put(&nd.path);
@@ -169,7 +176,9 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry)
169 parent_item = configfs_get_config_item(dentry->d_parent); 176 parent_item = configfs_get_config_item(dentry->d_parent);
170 type = parent_item->ci_type; 177 type = parent_item->ci_type;
171 178
179 spin_lock(&configfs_dirent_lock);
172 list_del_init(&sd->s_sibling); 180 list_del_init(&sd->s_sibling);
181 spin_unlock(&configfs_dirent_lock);
173 configfs_drop_dentry(sd, dentry->d_parent); 182 configfs_drop_dentry(sd, dentry->d_parent);
174 dput(dentry); 183 dput(dentry);
175 configfs_put(sd); 184 configfs_put(sd);
@@ -184,8 +193,9 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry)
184 type->ct_item_ops->drop_link(parent_item, 193 type->ct_item_ops->drop_link(parent_item,
185 sl->sl_target); 194 sl->sl_target);
186 195
187 /* FIXME: Needs lock */ 196 spin_lock(&configfs_dirent_lock);
188 list_del_init(&sl->sl_list); 197 list_del_init(&sl->sl_list);
198 spin_unlock(&configfs_dirent_lock);
189 199
190 /* Put reference from create_link() */ 200 /* Put reference from create_link() */
191 config_item_put(sl->sl_target); 201 config_item_put(sl->sl_target);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index eac23bd288b2..492d8caaaf25 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -41,16 +41,20 @@ struct comm;
41struct nodes; 41struct nodes;
42struct node; 42struct node;
43 43
44static struct config_group *make_cluster(struct config_group *, const char *); 44static int make_cluster(struct config_group *, const char *,
45 struct config_group **);
45static void drop_cluster(struct config_group *, struct config_item *); 46static void drop_cluster(struct config_group *, struct config_item *);
46static void release_cluster(struct config_item *); 47static void release_cluster(struct config_item *);
47static struct config_group *make_space(struct config_group *, const char *); 48static int make_space(struct config_group *, const char *,
49 struct config_group **);
48static void drop_space(struct config_group *, struct config_item *); 50static void drop_space(struct config_group *, struct config_item *);
49static void release_space(struct config_item *); 51static void release_space(struct config_item *);
50static struct config_item *make_comm(struct config_group *, const char *); 52static int make_comm(struct config_group *, const char *,
53 struct config_item **);
51static void drop_comm(struct config_group *, struct config_item *); 54static void drop_comm(struct config_group *, struct config_item *);
52static void release_comm(struct config_item *); 55static void release_comm(struct config_item *);
53static struct config_item *make_node(struct config_group *, const char *); 56static int make_node(struct config_group *, const char *,
57 struct config_item **);
54static void drop_node(struct config_group *, struct config_item *); 58static void drop_node(struct config_group *, struct config_item *);
55static void release_node(struct config_item *); 59static void release_node(struct config_item *);
56 60
@@ -392,8 +396,8 @@ static struct node *to_node(struct config_item *i)
392 return i ? container_of(i, struct node, item) : NULL; 396 return i ? container_of(i, struct node, item) : NULL;
393} 397}
394 398
395static struct config_group *make_cluster(struct config_group *g, 399static int make_cluster(struct config_group *g, const char *name,
396 const char *name) 400 struct config_group **new_g)
397{ 401{
398 struct cluster *cl = NULL; 402 struct cluster *cl = NULL;
399 struct spaces *sps = NULL; 403 struct spaces *sps = NULL;
@@ -431,14 +435,15 @@ static struct config_group *make_cluster(struct config_group *g,
431 435
432 space_list = &sps->ss_group; 436 space_list = &sps->ss_group;
433 comm_list = &cms->cs_group; 437 comm_list = &cms->cs_group;
434 return &cl->group; 438 *new_g = &cl->group;
439 return 0;
435 440
436 fail: 441 fail:
437 kfree(cl); 442 kfree(cl);
438 kfree(gps); 443 kfree(gps);
439 kfree(sps); 444 kfree(sps);
440 kfree(cms); 445 kfree(cms);
441 return NULL; 446 return -ENOMEM;
442} 447}
443 448
444static void drop_cluster(struct config_group *g, struct config_item *i) 449static void drop_cluster(struct config_group *g, struct config_item *i)
@@ -466,7 +471,8 @@ static void release_cluster(struct config_item *i)
466 kfree(cl); 471 kfree(cl);
467} 472}
468 473
469static struct config_group *make_space(struct config_group *g, const char *name) 474static int make_space(struct config_group *g, const char *name,
475 struct config_group **new_g)
470{ 476{
471 struct space *sp = NULL; 477 struct space *sp = NULL;
472 struct nodes *nds = NULL; 478 struct nodes *nds = NULL;
@@ -489,13 +495,14 @@ static struct config_group *make_space(struct config_group *g, const char *name)
489 INIT_LIST_HEAD(&sp->members); 495 INIT_LIST_HEAD(&sp->members);
490 mutex_init(&sp->members_lock); 496 mutex_init(&sp->members_lock);
491 sp->members_count = 0; 497 sp->members_count = 0;
492 return &sp->group; 498 *new_g = &sp->group;
499 return 0;
493 500
494 fail: 501 fail:
495 kfree(sp); 502 kfree(sp);
496 kfree(gps); 503 kfree(gps);
497 kfree(nds); 504 kfree(nds);
498 return NULL; 505 return -ENOMEM;
499} 506}
500 507
501static void drop_space(struct config_group *g, struct config_item *i) 508static void drop_space(struct config_group *g, struct config_item *i)
@@ -522,19 +529,21 @@ static void release_space(struct config_item *i)
522 kfree(sp); 529 kfree(sp);
523} 530}
524 531
525static struct config_item *make_comm(struct config_group *g, const char *name) 532static int make_comm(struct config_group *g, const char *name,
533 struct config_item **new_i)
526{ 534{
527 struct comm *cm; 535 struct comm *cm;
528 536
529 cm = kzalloc(sizeof(struct comm), GFP_KERNEL); 537 cm = kzalloc(sizeof(struct comm), GFP_KERNEL);
530 if (!cm) 538 if (!cm)
531 return NULL; 539 return -ENOMEM;
532 540
533 config_item_init_type_name(&cm->item, name, &comm_type); 541 config_item_init_type_name(&cm->item, name, &comm_type);
534 cm->nodeid = -1; 542 cm->nodeid = -1;
535 cm->local = 0; 543 cm->local = 0;
536 cm->addr_count = 0; 544 cm->addr_count = 0;
537 return &cm->item; 545 *new_i = &cm->item;
546 return 0;
538} 547}
539 548
540static void drop_comm(struct config_group *g, struct config_item *i) 549static void drop_comm(struct config_group *g, struct config_item *i)
@@ -554,14 +563,15 @@ static void release_comm(struct config_item *i)
554 kfree(cm); 563 kfree(cm);
555} 564}
556 565
557static struct config_item *make_node(struct config_group *g, const char *name) 566static int make_node(struct config_group *g, const char *name,
567 struct config_item **new_i)
558{ 568{
559 struct space *sp = to_space(g->cg_item.ci_parent); 569 struct space *sp = to_space(g->cg_item.ci_parent);
560 struct node *nd; 570 struct node *nd;
561 571
562 nd = kzalloc(sizeof(struct node), GFP_KERNEL); 572 nd = kzalloc(sizeof(struct node), GFP_KERNEL);
563 if (!nd) 573 if (!nd)
564 return NULL; 574 return -ENOMEM;
565 575
566 config_item_init_type_name(&nd->item, name, &node_type); 576 config_item_init_type_name(&nd->item, name, &node_type);
567 nd->nodeid = -1; 577 nd->nodeid = -1;
@@ -573,7 +583,8 @@ static struct config_item *make_node(struct config_group *g, const char *name)
573 sp->members_count++; 583 sp->members_count++;
574 mutex_unlock(&sp->members_lock); 584 mutex_unlock(&sp->members_lock);
575 585
576 return &nd->item; 586 *new_i = &nd->item;
587 return 0;
577} 588}
578 589
579static void drop_node(struct config_group *g, struct config_item *i) 590static void drop_node(struct config_group *g, struct config_item *i)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ae45f77765c0..25adfc3c693a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -424,8 +424,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
424 * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so 424 * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
425 * that it can be located for waiting on in __writeback_single_inode(). 425 * that it can be located for waiting on in __writeback_single_inode().
426 * 426 *
427 * Called under inode_lock.
428 *
429 * If `bdi' is non-zero then we're being asked to writeback a specific queue. 427 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
430 * This function assumes that the blockdev superblock's inodes are backed by 428 * This function assumes that the blockdev superblock's inodes are backed by
431 * a variety of queues, so all inodes are searched. For other superblocks, 429 * a variety of queues, so all inodes are searched. For other superblocks,
@@ -441,11 +439,12 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
441 * on the writer throttling path, and we get decent balancing between many 439 * on the writer throttling path, and we get decent balancing between many
442 * throttled threads: we don't want them all piling up on inode_sync_wait. 440 * throttled threads: we don't want them all piling up on inode_sync_wait.
443 */ 441 */
444static void 442void generic_sync_sb_inodes(struct super_block *sb,
445sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) 443 struct writeback_control *wbc)
446{ 444{
447 const unsigned long start = jiffies; /* livelock avoidance */ 445 const unsigned long start = jiffies; /* livelock avoidance */
448 446
447 spin_lock(&inode_lock);
449 if (!wbc->for_kupdate || list_empty(&sb->s_io)) 448 if (!wbc->for_kupdate || list_empty(&sb->s_io))
450 queue_io(sb, wbc->older_than_this); 449 queue_io(sb, wbc->older_than_this);
451 450
@@ -524,8 +523,16 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
524 if (!list_empty(&sb->s_more_io)) 523 if (!list_empty(&sb->s_more_io))
525 wbc->more_io = 1; 524 wbc->more_io = 1;
526 } 525 }
526 spin_unlock(&inode_lock);
527 return; /* Leave any unwritten inodes on s_io */ 527 return; /* Leave any unwritten inodes on s_io */
528} 528}
529EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
530
531static void sync_sb_inodes(struct super_block *sb,
532 struct writeback_control *wbc)
533{
534 generic_sync_sb_inodes(sb, wbc);
535}
529 536
530/* 537/*
531 * Start writeback of dirty pagecache data against all unlocked inodes. 538 * Start writeback of dirty pagecache data against all unlocked inodes.
@@ -565,11 +572,8 @@ restart:
565 * be unmounted by the time it is released. 572 * be unmounted by the time it is released.
566 */ 573 */
567 if (down_read_trylock(&sb->s_umount)) { 574 if (down_read_trylock(&sb->s_umount)) {
568 if (sb->s_root) { 575 if (sb->s_root)
569 spin_lock(&inode_lock);
570 sync_sb_inodes(sb, wbc); 576 sync_sb_inodes(sb, wbc);
571 spin_unlock(&inode_lock);
572 }
573 up_read(&sb->s_umount); 577 up_read(&sb->s_umount);
574 } 578 }
575 spin_lock(&sb_lock); 579 spin_lock(&sb_lock);
@@ -607,9 +611,7 @@ void sync_inodes_sb(struct super_block *sb, int wait)
607 (inodes_stat.nr_inodes - inodes_stat.nr_unused) + 611 (inodes_stat.nr_inodes - inodes_stat.nr_unused) +
608 nr_dirty + nr_unstable; 612 nr_dirty + nr_unstable;
609 wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */ 613 wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */
610 spin_lock(&inode_lock);
611 sync_sb_inodes(sb, &wbc); 614 sync_sb_inodes(sb, &wbc);
612 spin_unlock(&inode_lock);
613} 615}
614 616
615/* 617/*
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 5df517b81f3f..1f6dc518505c 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -224,7 +224,9 @@ void nlm_release_call(struct nlm_rqst *call)
224 224
225static void nlmclnt_rpc_release(void *data) 225static void nlmclnt_rpc_release(void *data)
226{ 226{
227 lock_kernel();
227 nlm_release_call(data); 228 nlm_release_call(data);
229 unlock_kernel();
228} 230}
229 231
230static int nlm_wait_on_grace(wait_queue_head_t *queue) 232static int nlm_wait_on_grace(wait_queue_head_t *queue)
@@ -430,7 +432,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
430 * Report the conflicting lock back to the application. 432 * Report the conflicting lock back to the application.
431 */ 433 */
432 fl->fl_start = req->a_res.lock.fl.fl_start; 434 fl->fl_start = req->a_res.lock.fl.fl_start;
433 fl->fl_end = req->a_res.lock.fl.fl_start; 435 fl->fl_end = req->a_res.lock.fl.fl_end;
434 fl->fl_type = req->a_res.lock.fl.fl_type; 436 fl->fl_type = req->a_res.lock.fl.fl_type;
435 fl->fl_pid = 0; 437 fl->fl_pid = 0;
436 break; 438 break;
@@ -710,7 +712,9 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data)
710die: 712die:
711 return; 713 return;
712 retry_rebind: 714 retry_rebind:
715 lock_kernel();
713 nlm_rebind_host(req->a_host); 716 nlm_rebind_host(req->a_host);
717 unlock_kernel();
714 retry_unlock: 718 retry_unlock:
715 rpc_restart_call(task); 719 rpc_restart_call(task);
716} 720}
@@ -788,7 +792,9 @@ retry_cancel:
788 /* Don't ever retry more than 3 times */ 792 /* Don't ever retry more than 3 times */
789 if (req->a_retries++ >= NLMCLNT_MAX_RETRIES) 793 if (req->a_retries++ >= NLMCLNT_MAX_RETRIES)
790 goto die; 794 goto die;
795 lock_kernel();
791 nlm_rebind_host(req->a_host); 796 nlm_rebind_host(req->a_host);
797 unlock_kernel();
792 rpc_restart_call(task); 798 rpc_restart_call(task);
793 rpc_delay(task, 30 * HZ); 799 rpc_delay(task, 30 * HZ);
794} 800}
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 385437e3387d..2e27176ff42f 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -248,7 +248,9 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
248 248
249static void nlm4svc_callback_release(void *data) 249static void nlm4svc_callback_release(void *data)
250{ 250{
251 lock_kernel();
251 nlm_release_call(data); 252 nlm_release_call(data);
253 unlock_kernel();
252} 254}
253 255
254static const struct rpc_call_ops nlm4svc_callback_ops = { 256static const struct rpc_call_ops nlm4svc_callback_ops = {
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 81aca859bfde..56a08ab9a4cb 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -795,6 +795,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
795 795
796 dprintk("lockd: GRANT_MSG RPC callback\n"); 796 dprintk("lockd: GRANT_MSG RPC callback\n");
797 797
798 lock_kernel();
798 /* if the block is not on a list at this point then it has 799 /* if the block is not on a list at this point then it has
799 * been invalidated. Don't try to requeue it. 800 * been invalidated. Don't try to requeue it.
800 * 801 *
@@ -804,7 +805,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
804 * for nlm_blocked? 805 * for nlm_blocked?
805 */ 806 */
806 if (list_empty(&block->b_list)) 807 if (list_empty(&block->b_list))
807 return; 808 goto out;
808 809
809 /* Technically, we should down the file semaphore here. Since we 810 /* Technically, we should down the file semaphore here. Since we
810 * move the block towards the head of the queue only, no harm 811 * move the block towards the head of the queue only, no harm
@@ -818,13 +819,17 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
818 } 819 }
819 nlmsvc_insert_block(block, timeout); 820 nlmsvc_insert_block(block, timeout);
820 svc_wake_up(block->b_daemon); 821 svc_wake_up(block->b_daemon);
822out:
823 unlock_kernel();
821} 824}
822 825
823static void nlmsvc_grant_release(void *data) 826static void nlmsvc_grant_release(void *data)
824{ 827{
825 struct nlm_rqst *call = data; 828 struct nlm_rqst *call = data;
826 829
830 lock_kernel();
827 nlmsvc_release_block(call->a_block); 831 nlmsvc_release_block(call->a_block);
832 unlock_kernel();
828} 833}
829 834
830static const struct rpc_call_ops nlmsvc_grant_ops = { 835static const struct rpc_call_ops nlmsvc_grant_ops = {
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 88379cc6e0b1..ce6952b50a75 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -278,7 +278,9 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
278 278
279static void nlmsvc_callback_release(void *data) 279static void nlmsvc_callback_release(void *data)
280{ 280{
281 lock_kernel();
281 nlm_release_call(data); 282 nlm_release_call(data);
283 unlock_kernel();
282} 284}
283 285
284static const struct rpc_call_ops nlmsvc_callback_ops = { 286static const struct rpc_call_ops nlmsvc_callback_ops = {
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index c1e7c8300629..f447f4b4476c 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -27,7 +27,7 @@
27 27
28struct nfs_callback_data { 28struct nfs_callback_data {
29 unsigned int users; 29 unsigned int users;
30 struct svc_serv *serv; 30 struct svc_rqst *rqst;
31 struct task_struct *task; 31 struct task_struct *task;
32}; 32};
33 33
@@ -91,21 +91,17 @@ nfs_callback_svc(void *vrqstp)
91 svc_process(rqstp); 91 svc_process(rqstp);
92 } 92 }
93 unlock_kernel(); 93 unlock_kernel();
94 nfs_callback_info.task = NULL;
95 svc_exit_thread(rqstp);
96 return 0; 94 return 0;
97} 95}
98 96
99/* 97/*
100 * Bring up the server process if it is not already up. 98 * Bring up the callback thread if it is not already up.
101 */ 99 */
102int nfs_callback_up(void) 100int nfs_callback_up(void)
103{ 101{
104 struct svc_serv *serv = NULL; 102 struct svc_serv *serv = NULL;
105 struct svc_rqst *rqstp;
106 int ret = 0; 103 int ret = 0;
107 104
108 lock_kernel();
109 mutex_lock(&nfs_callback_mutex); 105 mutex_lock(&nfs_callback_mutex);
110 if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) 106 if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
111 goto out; 107 goto out;
@@ -121,22 +117,23 @@ int nfs_callback_up(void)
121 nfs_callback_tcpport = ret; 117 nfs_callback_tcpport = ret;
122 dprintk("Callback port = 0x%x\n", nfs_callback_tcpport); 118 dprintk("Callback port = 0x%x\n", nfs_callback_tcpport);
123 119
124 rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); 120 nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
125 if (IS_ERR(rqstp)) { 121 if (IS_ERR(nfs_callback_info.rqst)) {
126 ret = PTR_ERR(rqstp); 122 ret = PTR_ERR(nfs_callback_info.rqst);
123 nfs_callback_info.rqst = NULL;
127 goto out_err; 124 goto out_err;
128 } 125 }
129 126
130 svc_sock_update_bufs(serv); 127 svc_sock_update_bufs(serv);
131 nfs_callback_info.serv = serv;
132 128
133 nfs_callback_info.task = kthread_run(nfs_callback_svc, rqstp, 129 nfs_callback_info.task = kthread_run(nfs_callback_svc,
130 nfs_callback_info.rqst,
134 "nfsv4-svc"); 131 "nfsv4-svc");
135 if (IS_ERR(nfs_callback_info.task)) { 132 if (IS_ERR(nfs_callback_info.task)) {
136 ret = PTR_ERR(nfs_callback_info.task); 133 ret = PTR_ERR(nfs_callback_info.task);
137 nfs_callback_info.serv = NULL; 134 svc_exit_thread(nfs_callback_info.rqst);
135 nfs_callback_info.rqst = NULL;
138 nfs_callback_info.task = NULL; 136 nfs_callback_info.task = NULL;
139 svc_exit_thread(rqstp);
140 goto out_err; 137 goto out_err;
141 } 138 }
142out: 139out:
@@ -149,7 +146,6 @@ out:
149 if (serv) 146 if (serv)
150 svc_destroy(serv); 147 svc_destroy(serv);
151 mutex_unlock(&nfs_callback_mutex); 148 mutex_unlock(&nfs_callback_mutex);
152 unlock_kernel();
153 return ret; 149 return ret;
154out_err: 150out_err:
155 dprintk("Couldn't create callback socket or server thread; err = %d\n", 151 dprintk("Couldn't create callback socket or server thread; err = %d\n",
@@ -159,17 +155,19 @@ out_err:
159} 155}
160 156
161/* 157/*
162 * Kill the server process if it is not already down. 158 * Kill the callback thread if it's no longer being used.
163 */ 159 */
164void nfs_callback_down(void) 160void nfs_callback_down(void)
165{ 161{
166 lock_kernel();
167 mutex_lock(&nfs_callback_mutex); 162 mutex_lock(&nfs_callback_mutex);
168 nfs_callback_info.users--; 163 nfs_callback_info.users--;
169 if (nfs_callback_info.users == 0 && nfs_callback_info.task != NULL) 164 if (nfs_callback_info.users == 0 && nfs_callback_info.task != NULL) {
170 kthread_stop(nfs_callback_info.task); 165 kthread_stop(nfs_callback_info.task);
166 svc_exit_thread(nfs_callback_info.rqst);
167 nfs_callback_info.rqst = NULL;
168 nfs_callback_info.task = NULL;
169 }
171 mutex_unlock(&nfs_callback_mutex); 170 mutex_unlock(&nfs_callback_mutex);
172 unlock_kernel();
173} 171}
174 172
175static int nfs_callback_authenticate(struct svc_rqst *rqstp) 173static int nfs_callback_authenticate(struct svc_rqst *rqstp)
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index f2a092ca69b5..5ee23e7058b3 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -431,14 +431,14 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
431{ 431{
432 to->to_initval = timeo * HZ / 10; 432 to->to_initval = timeo * HZ / 10;
433 to->to_retries = retrans; 433 to->to_retries = retrans;
434 if (!to->to_retries)
435 to->to_retries = 2;
436 434
437 switch (proto) { 435 switch (proto) {
438 case XPRT_TRANSPORT_TCP: 436 case XPRT_TRANSPORT_TCP:
439 case XPRT_TRANSPORT_RDMA: 437 case XPRT_TRANSPORT_RDMA:
438 if (to->to_retries == 0)
439 to->to_retries = NFS_DEF_TCP_RETRANS;
440 if (to->to_initval == 0) 440 if (to->to_initval == 0)
441 to->to_initval = 60 * HZ; 441 to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10;
442 if (to->to_initval > NFS_MAX_TCP_TIMEOUT) 442 if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
443 to->to_initval = NFS_MAX_TCP_TIMEOUT; 443 to->to_initval = NFS_MAX_TCP_TIMEOUT;
444 to->to_increment = to->to_initval; 444 to->to_increment = to->to_initval;
@@ -450,14 +450,17 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
450 to->to_exponential = 0; 450 to->to_exponential = 0;
451 break; 451 break;
452 case XPRT_TRANSPORT_UDP: 452 case XPRT_TRANSPORT_UDP:
453 default: 453 if (to->to_retries == 0)
454 to->to_retries = NFS_DEF_UDP_RETRANS;
454 if (!to->to_initval) 455 if (!to->to_initval)
455 to->to_initval = 11 * HZ / 10; 456 to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10;
456 if (to->to_initval > NFS_MAX_UDP_TIMEOUT) 457 if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
457 to->to_initval = NFS_MAX_UDP_TIMEOUT; 458 to->to_initval = NFS_MAX_UDP_TIMEOUT;
458 to->to_maxval = NFS_MAX_UDP_TIMEOUT; 459 to->to_maxval = NFS_MAX_UDP_TIMEOUT;
459 to->to_exponential = 1; 460 to->to_exponential = 1;
460 break; 461 break;
462 default:
463 BUG();
461 } 464 }
462} 465}
463 466
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 982a2064fe4c..28a238dab23a 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -133,13 +133,14 @@ nfs_opendir(struct inode *inode, struct file *filp)
133{ 133{
134 int res; 134 int res;
135 135
136 dfprintk(VFS, "NFS: opendir(%s/%ld)\n", 136 dfprintk(FILE, "NFS: open dir(%s/%s)\n",
137 inode->i_sb->s_id, inode->i_ino); 137 filp->f_path.dentry->d_parent->d_name.name,
138 filp->f_path.dentry->d_name.name);
139
140 nfs_inc_stats(inode, NFSIOS_VFSOPEN);
138 141
139 lock_kernel();
140 /* Call generic open code in order to cache credentials */ 142 /* Call generic open code in order to cache credentials */
141 res = nfs_open(inode, filp); 143 res = nfs_open(inode, filp);
142 unlock_kernel();
143 return res; 144 return res;
144} 145}
145 146
@@ -528,13 +529,11 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
528 struct nfs_fattr fattr; 529 struct nfs_fattr fattr;
529 long res; 530 long res;
530 531
531 dfprintk(VFS, "NFS: readdir(%s/%s) starting at cookie %Lu\n", 532 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
532 dentry->d_parent->d_name.name, dentry->d_name.name, 533 dentry->d_parent->d_name.name, dentry->d_name.name,
533 (long long)filp->f_pos); 534 (long long)filp->f_pos);
534 nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); 535 nfs_inc_stats(inode, NFSIOS_VFSGETDENTS);
535 536
536 lock_kernel();
537
538 /* 537 /*
539 * filp->f_pos points to the dirent entry number. 538 * filp->f_pos points to the dirent entry number.
540 * *desc->dir_cookie has the cookie for the next entry. We have 539 * *desc->dir_cookie has the cookie for the next entry. We have
@@ -592,10 +591,9 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
592 } 591 }
593out: 592out:
594 nfs_unblock_sillyrename(dentry); 593 nfs_unblock_sillyrename(dentry);
595 unlock_kernel();
596 if (res > 0) 594 if (res > 0)
597 res = 0; 595 res = 0;
598 dfprintk(VFS, "NFS: readdir(%s/%s) returns %ld\n", 596 dfprintk(FILE, "NFS: readdir(%s/%s) returns %ld\n",
599 dentry->d_parent->d_name.name, dentry->d_name.name, 597 dentry->d_parent->d_name.name, dentry->d_name.name,
600 res); 598 res);
601 return res; 599 return res;
@@ -603,7 +601,15 @@ out:
603 601
604static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) 602static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
605{ 603{
606 mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); 604 struct dentry *dentry = filp->f_path.dentry;
605 struct inode *inode = dentry->d_inode;
606
607 dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n",
608 dentry->d_parent->d_name.name,
609 dentry->d_name.name,
610 offset, origin);
611
612 mutex_lock(&inode->i_mutex);
607 switch (origin) { 613 switch (origin) {
608 case 1: 614 case 1:
609 offset += filp->f_pos; 615 offset += filp->f_pos;
@@ -619,7 +625,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
619 nfs_file_open_context(filp)->dir_cookie = 0; 625 nfs_file_open_context(filp)->dir_cookie = 0;
620 } 626 }
621out: 627out:
622 mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); 628 mutex_unlock(&inode->i_mutex);
623 return offset; 629 return offset;
624} 630}
625 631
@@ -629,10 +635,11 @@ out:
629 */ 635 */
630static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) 636static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync)
631{ 637{
632 dfprintk(VFS, "NFS: fsync_dir(%s/%s) datasync %d\n", 638 dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n",
633 dentry->d_parent->d_name.name, dentry->d_name.name, 639 dentry->d_parent->d_name.name, dentry->d_name.name,
634 datasync); 640 datasync);
635 641
642 nfs_inc_stats(dentry->d_inode, NFSIOS_VFSFSYNC);
636 return 0; 643 return 0;
637} 644}
638 645
@@ -767,7 +774,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
767 struct nfs_fattr fattr; 774 struct nfs_fattr fattr;
768 775
769 parent = dget_parent(dentry); 776 parent = dget_parent(dentry);
770 lock_kernel();
771 dir = parent->d_inode; 777 dir = parent->d_inode;
772 nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); 778 nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
773 inode = dentry->d_inode; 779 inode = dentry->d_inode;
@@ -805,7 +811,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
805 811
806 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 812 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
807 out_valid: 813 out_valid:
808 unlock_kernel();
809 dput(parent); 814 dput(parent);
810 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n", 815 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n",
811 __func__, dentry->d_parent->d_name.name, 816 __func__, dentry->d_parent->d_name.name,
@@ -824,7 +829,6 @@ out_zap_parent:
824 shrink_dcache_parent(dentry); 829 shrink_dcache_parent(dentry);
825 } 830 }
826 d_drop(dentry); 831 d_drop(dentry);
827 unlock_kernel();
828 dput(parent); 832 dput(parent);
829 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", 833 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
830 __func__, dentry->d_parent->d_name.name, 834 __func__, dentry->d_parent->d_name.name,
@@ -858,6 +862,14 @@ static int nfs_dentry_delete(struct dentry *dentry)
858 862
859} 863}
860 864
865static void nfs_drop_nlink(struct inode *inode)
866{
867 spin_lock(&inode->i_lock);
868 if (inode->i_nlink > 0)
869 drop_nlink(inode);
870 spin_unlock(&inode->i_lock);
871}
872
861/* 873/*
862 * Called when the dentry loses inode. 874 * Called when the dentry loses inode.
863 * We use it to clean up silly-renamed files. 875 * We use it to clean up silly-renamed files.
@@ -869,10 +881,8 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
869 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; 881 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
870 882
871 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { 883 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
872 lock_kernel();
873 drop_nlink(inode); 884 drop_nlink(inode);
874 nfs_complete_unlink(dentry, inode); 885 nfs_complete_unlink(dentry, inode);
875 unlock_kernel();
876 } 886 }
877 iput(inode); 887 iput(inode);
878} 888}
@@ -903,8 +913,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
903 res = ERR_PTR(-ENOMEM); 913 res = ERR_PTR(-ENOMEM);
904 dentry->d_op = NFS_PROTO(dir)->dentry_ops; 914 dentry->d_op = NFS_PROTO(dir)->dentry_ops;
905 915
906 lock_kernel();
907
908 /* 916 /*
909 * If we're doing an exclusive create, optimize away the lookup 917 * If we're doing an exclusive create, optimize away the lookup
910 * but don't hash the dentry. 918 * but don't hash the dentry.
@@ -912,7 +920,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
912 if (nfs_is_exclusive_create(dir, nd)) { 920 if (nfs_is_exclusive_create(dir, nd)) {
913 d_instantiate(dentry, NULL); 921 d_instantiate(dentry, NULL);
914 res = NULL; 922 res = NULL;
915 goto out_unlock; 923 goto out;
916 } 924 }
917 925
918 parent = dentry->d_parent; 926 parent = dentry->d_parent;
@@ -940,8 +948,6 @@ no_entry:
940 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 948 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
941out_unblock_sillyrename: 949out_unblock_sillyrename:
942 nfs_unblock_sillyrename(parent); 950 nfs_unblock_sillyrename(parent);
943out_unlock:
944 unlock_kernel();
945out: 951out:
946 return res; 952 return res;
947} 953}
@@ -999,9 +1005,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
999 } 1005 }
1000 1006
1001 /* Open the file on the server */ 1007 /* Open the file on the server */
1002 lock_kernel();
1003 res = nfs4_atomic_open(dir, dentry, nd); 1008 res = nfs4_atomic_open(dir, dentry, nd);
1004 unlock_kernel();
1005 if (IS_ERR(res)) { 1009 if (IS_ERR(res)) {
1006 error = PTR_ERR(res); 1010 error = PTR_ERR(res);
1007 switch (error) { 1011 switch (error) {
@@ -1063,9 +1067,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1063 * operations that change the directory. We therefore save the 1067 * operations that change the directory. We therefore save the
1064 * change attribute *before* we do the RPC call. 1068 * change attribute *before* we do the RPC call.
1065 */ 1069 */
1066 lock_kernel();
1067 ret = nfs4_open_revalidate(dir, dentry, openflags, nd); 1070 ret = nfs4_open_revalidate(dir, dentry, openflags, nd);
1068 unlock_kernel();
1069out: 1071out:
1070 dput(parent); 1072 dput(parent);
1071 if (!ret) 1073 if (!ret)
@@ -1218,14 +1220,11 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
1218 if ((nd->flags & LOOKUP_CREATE) != 0) 1220 if ((nd->flags & LOOKUP_CREATE) != 0)
1219 open_flags = nd->intent.open.flags; 1221 open_flags = nd->intent.open.flags;
1220 1222
1221 lock_kernel();
1222 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd); 1223 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd);
1223 if (error != 0) 1224 if (error != 0)
1224 goto out_err; 1225 goto out_err;
1225 unlock_kernel();
1226 return 0; 1226 return 0;
1227out_err: 1227out_err:
1228 unlock_kernel();
1229 d_drop(dentry); 1228 d_drop(dentry);
1230 return error; 1229 return error;
1231} 1230}
@@ -1248,14 +1247,11 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
1248 attr.ia_mode = mode; 1247 attr.ia_mode = mode;
1249 attr.ia_valid = ATTR_MODE; 1248 attr.ia_valid = ATTR_MODE;
1250 1249
1251 lock_kernel();
1252 status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev); 1250 status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev);
1253 if (status != 0) 1251 if (status != 0)
1254 goto out_err; 1252 goto out_err;
1255 unlock_kernel();
1256 return 0; 1253 return 0;
1257out_err: 1254out_err:
1258 unlock_kernel();
1259 d_drop(dentry); 1255 d_drop(dentry);
1260 return status; 1256 return status;
1261} 1257}
@@ -1274,15 +1270,12 @@ static int nfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1274 attr.ia_valid = ATTR_MODE; 1270 attr.ia_valid = ATTR_MODE;
1275 attr.ia_mode = mode | S_IFDIR; 1271 attr.ia_mode = mode | S_IFDIR;
1276 1272
1277 lock_kernel();
1278 error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); 1273 error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr);
1279 if (error != 0) 1274 if (error != 0)
1280 goto out_err; 1275 goto out_err;
1281 unlock_kernel();
1282 return 0; 1276 return 0;
1283out_err: 1277out_err:
1284 d_drop(dentry); 1278 d_drop(dentry);
1285 unlock_kernel();
1286 return error; 1279 return error;
1287} 1280}
1288 1281
@@ -1299,14 +1292,12 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
1299 dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n", 1292 dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n",
1300 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1293 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
1301 1294
1302 lock_kernel();
1303 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); 1295 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
1304 /* Ensure the VFS deletes this inode */ 1296 /* Ensure the VFS deletes this inode */
1305 if (error == 0 && dentry->d_inode != NULL) 1297 if (error == 0 && dentry->d_inode != NULL)
1306 clear_nlink(dentry->d_inode); 1298 clear_nlink(dentry->d_inode);
1307 else if (error == -ENOENT) 1299 else if (error == -ENOENT)
1308 nfs_dentry_handle_enoent(dentry); 1300 nfs_dentry_handle_enoent(dentry);
1309 unlock_kernel();
1310 1301
1311 return error; 1302 return error;
1312} 1303}
@@ -1408,7 +1399,7 @@ static int nfs_safe_remove(struct dentry *dentry)
1408 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); 1399 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
1409 /* The VFS may want to delete this inode */ 1400 /* The VFS may want to delete this inode */
1410 if (error == 0) 1401 if (error == 0)
1411 drop_nlink(inode); 1402 nfs_drop_nlink(inode);
1412 nfs_mark_for_revalidate(inode); 1403 nfs_mark_for_revalidate(inode);
1413 } else 1404 } else
1414 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); 1405 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
@@ -1431,7 +1422,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
1431 dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id, 1422 dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id,
1432 dir->i_ino, dentry->d_name.name); 1423 dir->i_ino, dentry->d_name.name);
1433 1424
1434 lock_kernel();
1435 spin_lock(&dcache_lock); 1425 spin_lock(&dcache_lock);
1436 spin_lock(&dentry->d_lock); 1426 spin_lock(&dentry->d_lock);
1437 if (atomic_read(&dentry->d_count) > 1) { 1427 if (atomic_read(&dentry->d_count) > 1) {
@@ -1440,7 +1430,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
1440 /* Start asynchronous writeout of the inode */ 1430 /* Start asynchronous writeout of the inode */
1441 write_inode_now(dentry->d_inode, 0); 1431 write_inode_now(dentry->d_inode, 0);
1442 error = nfs_sillyrename(dir, dentry); 1432 error = nfs_sillyrename(dir, dentry);
1443 unlock_kernel();
1444 return error; 1433 return error;
1445 } 1434 }
1446 if (!d_unhashed(dentry)) { 1435 if (!d_unhashed(dentry)) {
@@ -1454,7 +1443,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
1454 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1443 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1455 } else if (need_rehash) 1444 } else if (need_rehash)
1456 d_rehash(dentry); 1445 d_rehash(dentry);
1457 unlock_kernel();
1458 return error; 1446 return error;
1459} 1447}
1460 1448
@@ -1491,13 +1479,9 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
1491 attr.ia_mode = S_IFLNK | S_IRWXUGO; 1479 attr.ia_mode = S_IFLNK | S_IRWXUGO;
1492 attr.ia_valid = ATTR_MODE; 1480 attr.ia_valid = ATTR_MODE;
1493 1481
1494 lock_kernel();
1495
1496 page = alloc_page(GFP_HIGHUSER); 1482 page = alloc_page(GFP_HIGHUSER);
1497 if (!page) { 1483 if (!page)
1498 unlock_kernel();
1499 return -ENOMEM; 1484 return -ENOMEM;
1500 }
1501 1485
1502 kaddr = kmap_atomic(page, KM_USER0); 1486 kaddr = kmap_atomic(page, KM_USER0);
1503 memcpy(kaddr, symname, pathlen); 1487 memcpy(kaddr, symname, pathlen);
@@ -1512,7 +1496,6 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
1512 dentry->d_name.name, symname, error); 1496 dentry->d_name.name, symname, error);
1513 d_drop(dentry); 1497 d_drop(dentry);
1514 __free_page(page); 1498 __free_page(page);
1515 unlock_kernel();
1516 return error; 1499 return error;
1517 } 1500 }
1518 1501
@@ -1530,7 +1513,6 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
1530 } else 1513 } else
1531 __free_page(page); 1514 __free_page(page);
1532 1515
1533 unlock_kernel();
1534 return 0; 1516 return 0;
1535} 1517}
1536 1518
@@ -1544,14 +1526,12 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1544 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1526 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1545 dentry->d_parent->d_name.name, dentry->d_name.name); 1527 dentry->d_parent->d_name.name, dentry->d_name.name);
1546 1528
1547 lock_kernel();
1548 d_drop(dentry); 1529 d_drop(dentry);
1549 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); 1530 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
1550 if (error == 0) { 1531 if (error == 0) {
1551 atomic_inc(&inode->i_count); 1532 atomic_inc(&inode->i_count);
1552 d_add(dentry, inode); 1533 d_add(dentry, inode);
1553 } 1534 }
1554 unlock_kernel();
1555 return error; 1535 return error;
1556} 1536}
1557 1537
@@ -1591,7 +1571,6 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1591 * To prevent any new references to the target during the rename, 1571 * To prevent any new references to the target during the rename,
1592 * we unhash the dentry and free the inode in advance. 1572 * we unhash the dentry and free the inode in advance.
1593 */ 1573 */
1594 lock_kernel();
1595 if (!d_unhashed(new_dentry)) { 1574 if (!d_unhashed(new_dentry)) {
1596 d_drop(new_dentry); 1575 d_drop(new_dentry);
1597 rehash = new_dentry; 1576 rehash = new_dentry;
@@ -1635,7 +1614,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1635 /* dentry still busy? */ 1614 /* dentry still busy? */
1636 goto out; 1615 goto out;
1637 } else 1616 } else
1638 drop_nlink(new_inode); 1617 nfs_drop_nlink(new_inode);
1639 1618
1640go_ahead: 1619go_ahead:
1641 /* 1620 /*
@@ -1669,7 +1648,6 @@ out:
1669 /* new dentry created? */ 1648 /* new dentry created? */
1670 if (dentry) 1649 if (dentry)
1671 dput(dentry); 1650 dput(dentry);
1672 unlock_kernel();
1673 return error; 1651 return error;
1674} 1652}
1675 1653
@@ -1962,8 +1940,6 @@ int nfs_permission(struct inode *inode, int mask, struct nameidata *nd)
1962 } 1940 }
1963 1941
1964force_lookup: 1942force_lookup:
1965 lock_kernel();
1966
1967 if (!NFS_PROTO(inode)->access) 1943 if (!NFS_PROTO(inode)->access)
1968 goto out_notsup; 1944 goto out_notsup;
1969 1945
@@ -1973,7 +1949,6 @@ force_lookup:
1973 put_rpccred(cred); 1949 put_rpccred(cred);
1974 } else 1950 } else
1975 res = PTR_ERR(cred); 1951 res = PTR_ERR(cred);
1976 unlock_kernel();
1977out: 1952out:
1978 dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n", 1953 dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n",
1979 inode->i_sb->s_id, inode->i_ino, mask, res); 1954 inode->i_sb->s_id, inode->i_ino, mask, res);
@@ -1982,7 +1957,6 @@ out_notsup:
1982 res = nfs_revalidate_inode(NFS_SERVER(inode), inode); 1957 res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
1983 if (res == 0) 1958 if (res == 0)
1984 res = generic_permission(inode, mask, NULL); 1959 res = generic_permission(inode, mask, NULL);
1985 unlock_kernel();
1986 goto out; 1960 goto out;
1987} 1961}
1988 1962
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 4757a2b326a1..08f6b040d289 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -890,7 +890,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
890 count = iov_length(iov, nr_segs); 890 count = iov_length(iov, nr_segs);
891 nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); 891 nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
892 892
893 dprintk("nfs: direct read(%s/%s, %zd@%Ld)\n", 893 dfprintk(FILE, "NFS: direct read(%s/%s, %zd@%Ld)\n",
894 file->f_path.dentry->d_parent->d_name.name, 894 file->f_path.dentry->d_parent->d_name.name,
895 file->f_path.dentry->d_name.name, 895 file->f_path.dentry->d_name.name,
896 count, (long long) pos); 896 count, (long long) pos);
@@ -947,7 +947,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
947 count = iov_length(iov, nr_segs); 947 count = iov_length(iov, nr_segs);
948 nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); 948 nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
949 949
950 dfprintk(VFS, "nfs: direct write(%s/%s, %zd@%Ld)\n", 950 dfprintk(FILE, "NFS: direct write(%s/%s, %zd@%Ld)\n",
951 file->f_path.dentry->d_parent->d_name.name, 951 file->f_path.dentry->d_parent->d_name.name,
952 file->f_path.dentry->d_name.name, 952 file->f_path.dentry->d_name.name,
953 count, (long long) pos); 953 count, (long long) pos);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 4e98a56a1777..78460657f5cb 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -50,7 +50,7 @@ static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov,
50static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, 50static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
51 unsigned long nr_segs, loff_t pos); 51 unsigned long nr_segs, loff_t pos);
52static int nfs_file_flush(struct file *, fl_owner_t id); 52static int nfs_file_flush(struct file *, fl_owner_t id);
53static int nfs_fsync(struct file *, struct dentry *dentry, int datasync); 53static int nfs_file_fsync(struct file *, struct dentry *dentry, int datasync);
54static int nfs_check_flags(int flags); 54static int nfs_check_flags(int flags);
55static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); 55static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
56static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); 56static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
@@ -72,7 +72,7 @@ const struct file_operations nfs_file_operations = {
72 .open = nfs_file_open, 72 .open = nfs_file_open,
73 .flush = nfs_file_flush, 73 .flush = nfs_file_flush,
74 .release = nfs_file_release, 74 .release = nfs_file_release,
75 .fsync = nfs_fsync, 75 .fsync = nfs_file_fsync,
76 .lock = nfs_lock, 76 .lock = nfs_lock,
77 .flock = nfs_flock, 77 .flock = nfs_flock,
78 .splice_read = nfs_file_splice_read, 78 .splice_read = nfs_file_splice_read,
@@ -119,25 +119,33 @@ nfs_file_open(struct inode *inode, struct file *filp)
119{ 119{
120 int res; 120 int res;
121 121
122 dprintk("NFS: open file(%s/%s)\n",
123 filp->f_path.dentry->d_parent->d_name.name,
124 filp->f_path.dentry->d_name.name);
125
122 res = nfs_check_flags(filp->f_flags); 126 res = nfs_check_flags(filp->f_flags);
123 if (res) 127 if (res)
124 return res; 128 return res;
125 129
126 nfs_inc_stats(inode, NFSIOS_VFSOPEN); 130 nfs_inc_stats(inode, NFSIOS_VFSOPEN);
127 lock_kernel(); 131 res = nfs_open(inode, filp);
128 res = NFS_PROTO(inode)->file_open(inode, filp);
129 unlock_kernel();
130 return res; 132 return res;
131} 133}
132 134
133static int 135static int
134nfs_file_release(struct inode *inode, struct file *filp) 136nfs_file_release(struct inode *inode, struct file *filp)
135{ 137{
138 struct dentry *dentry = filp->f_path.dentry;
139
140 dprintk("NFS: release(%s/%s)\n",
141 dentry->d_parent->d_name.name,
142 dentry->d_name.name);
143
136 /* Ensure that dirty pages are flushed out with the right creds */ 144 /* Ensure that dirty pages are flushed out with the right creds */
137 if (filp->f_mode & FMODE_WRITE) 145 if (filp->f_mode & FMODE_WRITE)
138 nfs_wb_all(filp->f_path.dentry->d_inode); 146 nfs_wb_all(dentry->d_inode);
139 nfs_inc_stats(inode, NFSIOS_VFSRELEASE); 147 nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
140 return NFS_PROTO(inode)->file_release(inode, filp); 148 return nfs_release(inode, filp);
141} 149}
142 150
143/** 151/**
@@ -171,6 +179,12 @@ force_reval:
171static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) 179static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
172{ 180{
173 loff_t loff; 181 loff_t loff;
182
183 dprintk("NFS: llseek file(%s/%s, %lld, %d)\n",
184 filp->f_path.dentry->d_parent->d_name.name,
185 filp->f_path.dentry->d_name.name,
186 offset, origin);
187
174 /* origin == SEEK_END => we must revalidate the cached file length */ 188 /* origin == SEEK_END => we must revalidate the cached file length */
175 if (origin == SEEK_END) { 189 if (origin == SEEK_END) {
176 struct inode *inode = filp->f_mapping->host; 190 struct inode *inode = filp->f_mapping->host;
@@ -185,7 +199,7 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
185} 199}
186 200
187/* 201/*
188 * Helper for nfs_file_flush() and nfs_fsync() 202 * Helper for nfs_file_flush() and nfs_file_fsync()
189 * 203 *
190 * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to 204 * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
191 * disk, but it retrieves and clears ctx->error after synching, despite 205 * disk, but it retrieves and clears ctx->error after synching, despite
@@ -211,16 +225,18 @@ static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode)
211 225
212/* 226/*
213 * Flush all dirty pages, and check for write errors. 227 * Flush all dirty pages, and check for write errors.
214 *
215 */ 228 */
216static int 229static int
217nfs_file_flush(struct file *file, fl_owner_t id) 230nfs_file_flush(struct file *file, fl_owner_t id)
218{ 231{
219 struct nfs_open_context *ctx = nfs_file_open_context(file); 232 struct nfs_open_context *ctx = nfs_file_open_context(file);
220 struct inode *inode = file->f_path.dentry->d_inode; 233 struct dentry *dentry = file->f_path.dentry;
234 struct inode *inode = dentry->d_inode;
221 int status; 235 int status;
222 236
223 dfprintk(VFS, "nfs: flush(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); 237 dprintk("NFS: flush(%s/%s)\n",
238 dentry->d_parent->d_name.name,
239 dentry->d_name.name);
224 240
225 if ((file->f_mode & FMODE_WRITE) == 0) 241 if ((file->f_mode & FMODE_WRITE) == 0)
226 return 0; 242 return 0;
@@ -245,7 +261,7 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
245 if (iocb->ki_filp->f_flags & O_DIRECT) 261 if (iocb->ki_filp->f_flags & O_DIRECT)
246 return nfs_file_direct_read(iocb, iov, nr_segs, pos); 262 return nfs_file_direct_read(iocb, iov, nr_segs, pos);
247 263
248 dfprintk(VFS, "nfs: read(%s/%s, %lu@%lu)\n", 264 dprintk("NFS: read(%s/%s, %lu@%lu)\n",
249 dentry->d_parent->d_name.name, dentry->d_name.name, 265 dentry->d_parent->d_name.name, dentry->d_name.name,
250 (unsigned long) count, (unsigned long) pos); 266 (unsigned long) count, (unsigned long) pos);
251 267
@@ -265,7 +281,7 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,
265 struct inode *inode = dentry->d_inode; 281 struct inode *inode = dentry->d_inode;
266 ssize_t res; 282 ssize_t res;
267 283
268 dfprintk(VFS, "nfs: splice_read(%s/%s, %lu@%Lu)\n", 284 dprintk("NFS: splice_read(%s/%s, %lu@%Lu)\n",
269 dentry->d_parent->d_name.name, dentry->d_name.name, 285 dentry->d_parent->d_name.name, dentry->d_name.name,
270 (unsigned long) count, (unsigned long long) *ppos); 286 (unsigned long) count, (unsigned long long) *ppos);
271 287
@@ -282,7 +298,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
282 struct inode *inode = dentry->d_inode; 298 struct inode *inode = dentry->d_inode;
283 int status; 299 int status;
284 300
285 dfprintk(VFS, "nfs: mmap(%s/%s)\n", 301 dprintk("NFS: mmap(%s/%s)\n",
286 dentry->d_parent->d_name.name, dentry->d_name.name); 302 dentry->d_parent->d_name.name, dentry->d_name.name);
287 303
288 status = nfs_revalidate_mapping(inode, file->f_mapping); 304 status = nfs_revalidate_mapping(inode, file->f_mapping);
@@ -300,12 +316,14 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
300 * whether any write errors occurred for this process. 316 * whether any write errors occurred for this process.
301 */ 317 */
302static int 318static int
303nfs_fsync(struct file *file, struct dentry *dentry, int datasync) 319nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync)
304{ 320{
305 struct nfs_open_context *ctx = nfs_file_open_context(file); 321 struct nfs_open_context *ctx = nfs_file_open_context(file);
306 struct inode *inode = dentry->d_inode; 322 struct inode *inode = dentry->d_inode;
307 323
308 dfprintk(VFS, "nfs: fsync(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); 324 dprintk("NFS: fsync file(%s/%s) datasync %d\n",
325 dentry->d_parent->d_name.name, dentry->d_name.name,
326 datasync);
309 327
310 nfs_inc_stats(inode, NFSIOS_VFSFSYNC); 328 nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
311 return nfs_do_fsync(ctx, inode); 329 return nfs_do_fsync(ctx, inode);
@@ -328,6 +346,11 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
328 struct page *page; 346 struct page *page;
329 index = pos >> PAGE_CACHE_SHIFT; 347 index = pos >> PAGE_CACHE_SHIFT;
330 348
349 dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
350 file->f_path.dentry->d_parent->d_name.name,
351 file->f_path.dentry->d_name.name,
352 mapping->host->i_ino, len, (long long) pos);
353
331 page = __grab_cache_page(mapping, index); 354 page = __grab_cache_page(mapping, index);
332 if (!page) 355 if (!page)
333 return -ENOMEM; 356 return -ENOMEM;
@@ -348,9 +371,32 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
348 unsigned offset = pos & (PAGE_CACHE_SIZE - 1); 371 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
349 int status; 372 int status;
350 373
351 lock_kernel(); 374 dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n",
375 file->f_path.dentry->d_parent->d_name.name,
376 file->f_path.dentry->d_name.name,
377 mapping->host->i_ino, len, (long long) pos);
378
379 /*
380 * Zero any uninitialised parts of the page, and then mark the page
381 * as up to date if it turns out that we're extending the file.
382 */
383 if (!PageUptodate(page)) {
384 unsigned pglen = nfs_page_length(page);
385 unsigned end = offset + len;
386
387 if (pglen == 0) {
388 zero_user_segments(page, 0, offset,
389 end, PAGE_CACHE_SIZE);
390 SetPageUptodate(page);
391 } else if (end >= pglen) {
392 zero_user_segment(page, end, PAGE_CACHE_SIZE);
393 if (offset == 0)
394 SetPageUptodate(page);
395 } else
396 zero_user_segment(page, pglen, PAGE_CACHE_SIZE);
397 }
398
352 status = nfs_updatepage(file, page, offset, copied); 399 status = nfs_updatepage(file, page, offset, copied);
353 unlock_kernel();
354 400
355 unlock_page(page); 401 unlock_page(page);
356 page_cache_release(page); 402 page_cache_release(page);
@@ -362,6 +408,8 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
362 408
363static void nfs_invalidate_page(struct page *page, unsigned long offset) 409static void nfs_invalidate_page(struct page *page, unsigned long offset)
364{ 410{
411 dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset);
412
365 if (offset != 0) 413 if (offset != 0)
366 return; 414 return;
367 /* Cancel any unstarted writes on this page */ 415 /* Cancel any unstarted writes on this page */
@@ -370,13 +418,20 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
370 418
371static int nfs_release_page(struct page *page, gfp_t gfp) 419static int nfs_release_page(struct page *page, gfp_t gfp)
372{ 420{
421 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
422
373 /* If PagePrivate() is set, then the page is not freeable */ 423 /* If PagePrivate() is set, then the page is not freeable */
374 return 0; 424 return 0;
375} 425}
376 426
377static int nfs_launder_page(struct page *page) 427static int nfs_launder_page(struct page *page)
378{ 428{
379 return nfs_wb_page(page->mapping->host, page); 429 struct inode *inode = page->mapping->host;
430
431 dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
432 inode->i_ino, (long long)page_offset(page));
433
434 return nfs_wb_page(inode, page);
380} 435}
381 436
382const struct address_space_operations nfs_file_aops = { 437const struct address_space_operations nfs_file_aops = {
@@ -396,13 +451,19 @@ const struct address_space_operations nfs_file_aops = {
396static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) 451static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
397{ 452{
398 struct file *filp = vma->vm_file; 453 struct file *filp = vma->vm_file;
454 struct dentry *dentry = filp->f_path.dentry;
399 unsigned pagelen; 455 unsigned pagelen;
400 int ret = -EINVAL; 456 int ret = -EINVAL;
401 struct address_space *mapping; 457 struct address_space *mapping;
402 458
459 dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n",
460 dentry->d_parent->d_name.name, dentry->d_name.name,
461 filp->f_mapping->host->i_ino,
462 (long long)page_offset(page));
463
403 lock_page(page); 464 lock_page(page);
404 mapping = page->mapping; 465 mapping = page->mapping;
405 if (mapping != vma->vm_file->f_path.dentry->d_inode->i_mapping) 466 if (mapping != dentry->d_inode->i_mapping)
406 goto out_unlock; 467 goto out_unlock;
407 468
408 ret = 0; 469 ret = 0;
@@ -450,9 +511,9 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
450 if (iocb->ki_filp->f_flags & O_DIRECT) 511 if (iocb->ki_filp->f_flags & O_DIRECT)
451 return nfs_file_direct_write(iocb, iov, nr_segs, pos); 512 return nfs_file_direct_write(iocb, iov, nr_segs, pos);
452 513
453 dfprintk(VFS, "nfs: write(%s/%s(%ld), %lu@%Ld)\n", 514 dprintk("NFS: write(%s/%s, %lu@%Ld)\n",
454 dentry->d_parent->d_name.name, dentry->d_name.name, 515 dentry->d_parent->d_name.name, dentry->d_name.name,
455 inode->i_ino, (unsigned long) count, (long long) pos); 516 (unsigned long) count, (long long) pos);
456 517
457 result = -EBUSY; 518 result = -EBUSY;
458 if (IS_SWAPFILE(inode)) 519 if (IS_SWAPFILE(inode))
@@ -586,7 +647,8 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
586 * This makes locking act as a cache coherency point. 647 * This makes locking act as a cache coherency point.
587 */ 648 */
588 nfs_sync_mapping(filp->f_mapping); 649 nfs_sync_mapping(filp->f_mapping);
589 nfs_zap_caches(inode); 650 if (!nfs_have_delegation(inode, FMODE_READ))
651 nfs_zap_caches(inode);
590out: 652out:
591 return status; 653 return status;
592} 654}
@@ -596,23 +658,35 @@ out:
596 */ 658 */
597static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) 659static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
598{ 660{
599 struct inode * inode = filp->f_mapping->host; 661 struct inode *inode = filp->f_mapping->host;
662 int ret = -ENOLCK;
600 663
601 dprintk("NFS: nfs_lock(f=%s/%ld, t=%x, fl=%x, r=%Ld:%Ld)\n", 664 dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n",
602 inode->i_sb->s_id, inode->i_ino, 665 filp->f_path.dentry->d_parent->d_name.name,
666 filp->f_path.dentry->d_name.name,
603 fl->fl_type, fl->fl_flags, 667 fl->fl_type, fl->fl_flags,
604 (long long)fl->fl_start, (long long)fl->fl_end); 668 (long long)fl->fl_start, (long long)fl->fl_end);
669
605 nfs_inc_stats(inode, NFSIOS_VFSLOCK); 670 nfs_inc_stats(inode, NFSIOS_VFSLOCK);
606 671
607 /* No mandatory locks over NFS */ 672 /* No mandatory locks over NFS */
608 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) 673 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
609 return -ENOLCK; 674 goto out_err;
675
676 if (NFS_PROTO(inode)->lock_check_bounds != NULL) {
677 ret = NFS_PROTO(inode)->lock_check_bounds(fl);
678 if (ret < 0)
679 goto out_err;
680 }
610 681
611 if (IS_GETLK(cmd)) 682 if (IS_GETLK(cmd))
612 return do_getlk(filp, cmd, fl); 683 ret = do_getlk(filp, cmd, fl);
613 if (fl->fl_type == F_UNLCK) 684 else if (fl->fl_type == F_UNLCK)
614 return do_unlk(filp, cmd, fl); 685 ret = do_unlk(filp, cmd, fl);
615 return do_setlk(filp, cmd, fl); 686 else
687 ret = do_setlk(filp, cmd, fl);
688out_err:
689 return ret;
616} 690}
617 691
618/* 692/*
@@ -620,9 +694,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
620 */ 694 */
621static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) 695static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
622{ 696{
623 dprintk("NFS: nfs_flock(f=%s/%ld, t=%x, fl=%x)\n", 697 dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n",
624 filp->f_path.dentry->d_inode->i_sb->s_id, 698 filp->f_path.dentry->d_parent->d_name.name,
625 filp->f_path.dentry->d_inode->i_ino, 699 filp->f_path.dentry->d_name.name,
626 fl->fl_type, fl->fl_flags); 700 fl->fl_type, fl->fl_flags);
627 701
628 /* 702 /*
@@ -645,12 +719,15 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
645 return do_setlk(filp, cmd, fl); 719 return do_setlk(filp, cmd, fl);
646} 720}
647 721
722/*
723 * There is no protocol support for leases, so we have no way to implement
724 * them correctly in the face of opens by other clients.
725 */
648static int nfs_setlease(struct file *file, long arg, struct file_lock **fl) 726static int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
649{ 727{
650 /* 728 dprintk("NFS: setlease(%s/%s, arg=%ld)\n",
651 * There is no protocol support for leases, so we have no way 729 file->f_path.dentry->d_parent->d_name.name,
652 * to implement them correctly in the face of opens by other 730 file->f_path.dentry->d_name.name, arg);
653 * clients. 731
654 */
655 return -EINVAL; 732 return -EINVAL;
656} 733}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 596c5d8e86f4..df23f987da6b 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -57,8 +57,6 @@ static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
57static void nfs_invalidate_inode(struct inode *); 57static void nfs_invalidate_inode(struct inode *);
58static int nfs_update_inode(struct inode *, struct nfs_fattr *); 58static int nfs_update_inode(struct inode *, struct nfs_fattr *);
59 59
60static void nfs_zap_acl_cache(struct inode *);
61
62static struct kmem_cache * nfs_inode_cachep; 60static struct kmem_cache * nfs_inode_cachep;
63 61
64static inline unsigned long 62static inline unsigned long
@@ -167,7 +165,7 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
167 } 165 }
168} 166}
169 167
170static void nfs_zap_acl_cache(struct inode *inode) 168void nfs_zap_acl_cache(struct inode *inode)
171{ 169{
172 void (*clear_acl_cache)(struct inode *); 170 void (*clear_acl_cache)(struct inode *);
173 171
@@ -347,7 +345,7 @@ out_no_inode:
347 goto out; 345 goto out;
348} 346}
349 347
350#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET) 348#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE)
351 349
352int 350int
353nfs_setattr(struct dentry *dentry, struct iattr *attr) 351nfs_setattr(struct dentry *dentry, struct iattr *attr)
@@ -369,10 +367,9 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
369 367
370 /* Optimization: if the end result is no change, don't RPC */ 368 /* Optimization: if the end result is no change, don't RPC */
371 attr->ia_valid &= NFS_VALID_ATTRS; 369 attr->ia_valid &= NFS_VALID_ATTRS;
372 if (attr->ia_valid == 0) 370 if ((attr->ia_valid & ~ATTR_FILE) == 0)
373 return 0; 371 return 0;
374 372
375 lock_kernel();
376 /* Write all dirty data */ 373 /* Write all dirty data */
377 if (S_ISREG(inode->i_mode)) { 374 if (S_ISREG(inode->i_mode)) {
378 filemap_write_and_wait(inode->i_mapping); 375 filemap_write_and_wait(inode->i_mapping);
@@ -386,11 +383,66 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
386 error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr); 383 error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr);
387 if (error == 0) 384 if (error == 0)
388 nfs_refresh_inode(inode, &fattr); 385 nfs_refresh_inode(inode, &fattr);
389 unlock_kernel();
390 return error; 386 return error;
391} 387}
392 388
393/** 389/**
390 * nfs_vmtruncate - unmap mappings "freed" by truncate() syscall
391 * @inode: inode of the file used
392 * @offset: file offset to start truncating
393 *
394 * This is a copy of the common vmtruncate, but with the locking
395 * corrected to take into account the fact that NFS requires
396 * inode->i_size to be updated under the inode->i_lock.
397 */
398static int nfs_vmtruncate(struct inode * inode, loff_t offset)
399{
400 if (i_size_read(inode) < offset) {
401 unsigned long limit;
402
403 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
404 if (limit != RLIM_INFINITY && offset > limit)
405 goto out_sig;
406 if (offset > inode->i_sb->s_maxbytes)
407 goto out_big;
408 spin_lock(&inode->i_lock);
409 i_size_write(inode, offset);
410 spin_unlock(&inode->i_lock);
411 } else {
412 struct address_space *mapping = inode->i_mapping;
413
414 /*
415 * truncation of in-use swapfiles is disallowed - it would
416 * cause subsequent swapout to scribble on the now-freed
417 * blocks.
418 */
419 if (IS_SWAPFILE(inode))
420 return -ETXTBSY;
421 spin_lock(&inode->i_lock);
422 i_size_write(inode, offset);
423 spin_unlock(&inode->i_lock);
424
425 /*
426 * unmap_mapping_range is called twice, first simply for
427 * efficiency so that truncate_inode_pages does fewer
428 * single-page unmaps. However after this first call, and
429 * before truncate_inode_pages finishes, it is possible for
430 * private pages to be COWed, which remain after
431 * truncate_inode_pages finishes, hence the second
432 * unmap_mapping_range call must be made for correctness.
433 */
434 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
435 truncate_inode_pages(mapping, offset);
436 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
437 }
438 return 0;
439out_sig:
440 send_sig(SIGXFSZ, current, 0);
441out_big:
442 return -EFBIG;
443}
444
445/**
394 * nfs_setattr_update_inode - Update inode metadata after a setattr call. 446 * nfs_setattr_update_inode - Update inode metadata after a setattr call.
395 * @inode: pointer to struct inode 447 * @inode: pointer to struct inode
396 * @attr: pointer to struct iattr 448 * @attr: pointer to struct iattr
@@ -416,8 +468,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
416 } 468 }
417 if ((attr->ia_valid & ATTR_SIZE) != 0) { 469 if ((attr->ia_valid & ATTR_SIZE) != 0) {
418 nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); 470 nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC);
419 inode->i_size = attr->ia_size; 471 nfs_vmtruncate(inode, attr->ia_size);
420 vmtruncate(inode, attr->ia_size);
421 } 472 }
422} 473}
423 474
@@ -647,7 +698,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
647 inode->i_sb->s_id, (long long)NFS_FILEID(inode)); 698 inode->i_sb->s_id, (long long)NFS_FILEID(inode));
648 699
649 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); 700 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
650 lock_kernel();
651 if (is_bad_inode(inode)) 701 if (is_bad_inode(inode))
652 goto out_nowait; 702 goto out_nowait;
653 if (NFS_STALE(inode)) 703 if (NFS_STALE(inode))
@@ -696,7 +746,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
696 nfs_wake_up_inode(inode); 746 nfs_wake_up_inode(inode);
697 747
698 out_nowait: 748 out_nowait:
699 unlock_kernel();
700 return status; 749 return status;
701} 750}
702 751
@@ -831,9 +880,9 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
831 if (S_ISDIR(inode->i_mode)) 880 if (S_ISDIR(inode->i_mode))
832 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 881 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
833 } 882 }
834 if (inode->i_size == nfs_size_to_loff_t(fattr->pre_size) && 883 if (i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) &&
835 nfsi->npages == 0) 884 nfsi->npages == 0)
836 inode->i_size = nfs_size_to_loff_t(fattr->size); 885 i_size_write(inode, nfs_size_to_loff_t(fattr->size));
837 } 886 }
838} 887}
839 888
@@ -974,7 +1023,7 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
974 (fattr->valid & NFS_ATTR_WCC) == 0) { 1023 (fattr->valid & NFS_ATTR_WCC) == 0) {
975 memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime)); 1024 memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime));
976 memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime)); 1025 memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime));
977 fattr->pre_size = inode->i_size; 1026 fattr->pre_size = i_size_read(inode);
978 fattr->valid |= NFS_ATTR_WCC; 1027 fattr->valid |= NFS_ATTR_WCC;
979 } 1028 }
980 return nfs_post_op_update_inode(inode, fattr); 1029 return nfs_post_op_update_inode(inode, fattr);
@@ -1059,7 +1108,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1059 /* Do we perhaps have any outstanding writes, or has 1108 /* Do we perhaps have any outstanding writes, or has
1060 * the file grown beyond our last write? */ 1109 * the file grown beyond our last write? */
1061 if (nfsi->npages == 0 || new_isize > cur_isize) { 1110 if (nfsi->npages == 0 || new_isize > cur_isize) {
1062 inode->i_size = new_isize; 1111 i_size_write(inode, new_isize);
1063 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; 1112 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
1064 } 1113 }
1065 dprintk("NFS: isize change on server for file %s/%ld\n", 1114 dprintk("NFS: isize change on server for file %s/%ld\n",
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 04ae867dddba..24241fcbb98d 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -150,6 +150,7 @@ extern void nfs_clear_inode(struct inode *);
150#ifdef CONFIG_NFS_V4 150#ifdef CONFIG_NFS_V4
151extern void nfs4_clear_inode(struct inode *); 151extern void nfs4_clear_inode(struct inode *);
152#endif 152#endif
153void nfs_zap_acl_cache(struct inode *inode);
153 154
154/* super.c */ 155/* super.c */
155extern struct file_system_type nfs_xdev_fs_type; 156extern struct file_system_type nfs_xdev_fs_type;
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index 6350ecbde589..a36952810032 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -5,135 +5,41 @@
5 * 5 *
6 * Copyright (C) 2005, 2006 Chuck Lever <cel@netapp.com> 6 * Copyright (C) 2005, 2006 Chuck Lever <cel@netapp.com>
7 * 7 *
8 * NFS client per-mount statistics provide information about the health of
9 * the NFS client and the health of each NFS mount point. Generally these
10 * are not for detailed problem diagnosis, but simply to indicate that there
11 * is a problem.
12 *
13 * These counters are not meant to be human-readable, but are meant to be
14 * integrated into system monitoring tools such as "sar" and "iostat". As
15 * such, the counters are sampled by the tools over time, and are never
16 * zeroed after a file system is mounted. Moving averages can be computed
17 * by the tools by taking the difference between two instantaneous samples
18 * and dividing that by the time between the samples.
19 */ 8 */
20 9
21#ifndef _NFS_IOSTAT 10#ifndef _NFS_IOSTAT
22#define _NFS_IOSTAT 11#define _NFS_IOSTAT
23 12
24#define NFS_IOSTAT_VERS "1.0"
25
26/*
27 * NFS byte counters
28 *
29 * 1. SERVER - the number of payload bytes read from or written to the
30 * server by the NFS client via an NFS READ or WRITE request.
31 *
32 * 2. NORMAL - the number of bytes read or written by applications via
33 * the read(2) and write(2) system call interfaces.
34 *
35 * 3. DIRECT - the number of bytes read or written from files opened
36 * with the O_DIRECT flag.
37 *
38 * These counters give a view of the data throughput into and out of the NFS
39 * client. Comparing the number of bytes requested by an application with the
40 * number of bytes the client requests from the server can provide an
41 * indication of client efficiency (per-op, cache hits, etc).
42 *
43 * These counters can also help characterize which access methods are in
44 * use. DIRECT by itself shows whether there is any O_DIRECT traffic.
45 * NORMAL + DIRECT shows how much data is going through the system call
46 * interface. A large amount of SERVER traffic without much NORMAL or
47 * DIRECT traffic shows that applications are using mapped files.
48 *
49 * NFS page counters
50 *
51 * These count the number of pages read or written via nfs_readpage(),
52 * nfs_readpages(), or their write equivalents.
53 */
54enum nfs_stat_bytecounters {
55 NFSIOS_NORMALREADBYTES = 0,
56 NFSIOS_NORMALWRITTENBYTES,
57 NFSIOS_DIRECTREADBYTES,
58 NFSIOS_DIRECTWRITTENBYTES,
59 NFSIOS_SERVERREADBYTES,
60 NFSIOS_SERVERWRITTENBYTES,
61 NFSIOS_READPAGES,
62 NFSIOS_WRITEPAGES,
63 __NFSIOS_BYTESMAX,
64};
65
66/*
67 * NFS event counters
68 *
69 * These counters provide a low-overhead way of monitoring client activity
70 * without enabling NFS trace debugging. The counters show the rate at
71 * which VFS requests are made, and how often the client invalidates its
72 * data and attribute caches. This allows system administrators to monitor
73 * such things as how close-to-open is working, and answer questions such
74 * as "why are there so many GETATTR requests on the wire?"
75 *
76 * They also count anamolous events such as short reads and writes, silly
77 * renames due to close-after-delete, and operations that change the size
78 * of a file (such operations can often be the source of data corruption
79 * if applications aren't using file locking properly).
80 */
81enum nfs_stat_eventcounters {
82 NFSIOS_INODEREVALIDATE = 0,
83 NFSIOS_DENTRYREVALIDATE,
84 NFSIOS_DATAINVALIDATE,
85 NFSIOS_ATTRINVALIDATE,
86 NFSIOS_VFSOPEN,
87 NFSIOS_VFSLOOKUP,
88 NFSIOS_VFSACCESS,
89 NFSIOS_VFSUPDATEPAGE,
90 NFSIOS_VFSREADPAGE,
91 NFSIOS_VFSREADPAGES,
92 NFSIOS_VFSWRITEPAGE,
93 NFSIOS_VFSWRITEPAGES,
94 NFSIOS_VFSGETDENTS,
95 NFSIOS_VFSSETATTR,
96 NFSIOS_VFSFLUSH,
97 NFSIOS_VFSFSYNC,
98 NFSIOS_VFSLOCK,
99 NFSIOS_VFSRELEASE,
100 NFSIOS_CONGESTIONWAIT,
101 NFSIOS_SETATTRTRUNC,
102 NFSIOS_EXTENDWRITE,
103 NFSIOS_SILLYRENAME,
104 NFSIOS_SHORTREAD,
105 NFSIOS_SHORTWRITE,
106 NFSIOS_DELAY,
107 __NFSIOS_COUNTSMAX,
108};
109
110#ifdef __KERNEL__
111
112#include <linux/percpu.h> 13#include <linux/percpu.h>
113#include <linux/cache.h> 14#include <linux/cache.h>
15#include <linux/nfs_iostat.h>
114 16
115struct nfs_iostats { 17struct nfs_iostats {
116 unsigned long long bytes[__NFSIOS_BYTESMAX]; 18 unsigned long long bytes[__NFSIOS_BYTESMAX];
117 unsigned long events[__NFSIOS_COUNTSMAX]; 19 unsigned long events[__NFSIOS_COUNTSMAX];
118} ____cacheline_aligned; 20} ____cacheline_aligned;
119 21
120static inline void nfs_inc_server_stats(struct nfs_server *server, enum nfs_stat_eventcounters stat) 22static inline void nfs_inc_server_stats(const struct nfs_server *server,
23 enum nfs_stat_eventcounters stat)
121{ 24{
122 struct nfs_iostats *iostats; 25 struct nfs_iostats *iostats;
123 int cpu; 26 int cpu;
124 27
125 cpu = get_cpu(); 28 cpu = get_cpu();
126 iostats = per_cpu_ptr(server->io_stats, cpu); 29 iostats = per_cpu_ptr(server->io_stats, cpu);
127 iostats->events[stat] ++; 30 iostats->events[stat]++;
128 put_cpu_no_resched(); 31 put_cpu_no_resched();
129} 32}
130 33
131static inline void nfs_inc_stats(struct inode *inode, enum nfs_stat_eventcounters stat) 34static inline void nfs_inc_stats(const struct inode *inode,
35 enum nfs_stat_eventcounters stat)
132{ 36{
133 nfs_inc_server_stats(NFS_SERVER(inode), stat); 37 nfs_inc_server_stats(NFS_SERVER(inode), stat);
134} 38}
135 39
136static inline void nfs_add_server_stats(struct nfs_server *server, enum nfs_stat_bytecounters stat, unsigned long addend) 40static inline void nfs_add_server_stats(const struct nfs_server *server,
41 enum nfs_stat_bytecounters stat,
42 unsigned long addend)
137{ 43{
138 struct nfs_iostats *iostats; 44 struct nfs_iostats *iostats;
139 int cpu; 45 int cpu;
@@ -144,7 +50,9 @@ static inline void nfs_add_server_stats(struct nfs_server *server, enum nfs_stat
144 put_cpu_no_resched(); 50 put_cpu_no_resched();
145} 51}
146 52
147static inline void nfs_add_stats(struct inode *inode, enum nfs_stat_bytecounters stat, unsigned long addend) 53static inline void nfs_add_stats(const struct inode *inode,
54 enum nfs_stat_bytecounters stat,
55 unsigned long addend)
148{ 56{
149 nfs_add_server_stats(NFS_SERVER(inode), stat, addend); 57 nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
150} 58}
@@ -160,5 +68,4 @@ static inline void nfs_free_iostats(struct nfs_iostats *stats)
160 free_percpu(stats); 68 free_percpu(stats);
161} 69}
162 70
163#endif 71#endif /* _NFS_IOSTAT */
164#endif
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 9b7362565c0c..423842f51ac9 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -5,6 +5,8 @@
5#include <linux/posix_acl_xattr.h> 5#include <linux/posix_acl_xattr.h>
6#include <linux/nfsacl.h> 6#include <linux/nfsacl.h>
7 7
8#include "internal.h"
9
8#define NFSDBG_FACILITY NFSDBG_PROC 10#define NFSDBG_FACILITY NFSDBG_PROC
9 11
10ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size) 12ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size)
@@ -205,6 +207,8 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
205 status = nfs_revalidate_inode(server, inode); 207 status = nfs_revalidate_inode(server, inode);
206 if (status < 0) 208 if (status < 0)
207 return ERR_PTR(status); 209 return ERR_PTR(status);
210 if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
211 nfs_zap_acl_cache(inode);
208 acl = nfs3_get_cached_acl(inode, type); 212 acl = nfs3_get_cached_acl(inode, type);
209 if (acl != ERR_PTR(-EAGAIN)) 213 if (acl != ERR_PTR(-EAGAIN))
210 return acl; 214 return acl;
@@ -319,9 +323,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
319 dprintk("NFS call setacl\n"); 323 dprintk("NFS call setacl\n");
320 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; 324 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
321 status = rpc_call_sync(server->client_acl, &msg, 0); 325 status = rpc_call_sync(server->client_acl, &msg, 0);
322 spin_lock(&inode->i_lock); 326 nfs_access_zap_cache(inode);
323 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS; 327 nfs_zap_acl_cache(inode);
324 spin_unlock(&inode->i_lock);
325 dprintk("NFS reply setacl: %d\n", status); 328 dprintk("NFS reply setacl: %d\n", status);
326 329
327 /* pages may have been allocated at the xdr layer. */ 330 /* pages may have been allocated at the xdr layer. */
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index c3523ad03ed1..1e750e4574a9 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -129,6 +129,8 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
129 int status; 129 int status;
130 130
131 dprintk("NFS call setattr\n"); 131 dprintk("NFS call setattr\n");
132 if (sattr->ia_valid & ATTR_FILE)
133 msg.rpc_cred = nfs_file_cred(sattr->ia_file);
132 nfs_fattr_init(fattr); 134 nfs_fattr_init(fattr);
133 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 135 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
134 if (status == 0) 136 if (status == 0)
@@ -248,6 +250,53 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page,
248 return status; 250 return status;
249} 251}
250 252
253struct nfs3_createdata {
254 struct rpc_message msg;
255 union {
256 struct nfs3_createargs create;
257 struct nfs3_mkdirargs mkdir;
258 struct nfs3_symlinkargs symlink;
259 struct nfs3_mknodargs mknod;
260 } arg;
261 struct nfs3_diropres res;
262 struct nfs_fh fh;
263 struct nfs_fattr fattr;
264 struct nfs_fattr dir_attr;
265};
266
267static struct nfs3_createdata *nfs3_alloc_createdata(void)
268{
269 struct nfs3_createdata *data;
270
271 data = kzalloc(sizeof(*data), GFP_KERNEL);
272 if (data != NULL) {
273 data->msg.rpc_argp = &data->arg;
274 data->msg.rpc_resp = &data->res;
275 data->res.fh = &data->fh;
276 data->res.fattr = &data->fattr;
277 data->res.dir_attr = &data->dir_attr;
278 nfs_fattr_init(data->res.fattr);
279 nfs_fattr_init(data->res.dir_attr);
280 }
281 return data;
282}
283
284static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data)
285{
286 int status;
287
288 status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
289 nfs_post_op_update_inode(dir, data->res.dir_attr);
290 if (status == 0)
291 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
292 return status;
293}
294
295static void nfs3_free_createdata(struct nfs3_createdata *data)
296{
297 kfree(data);
298}
299
251/* 300/*
252 * Create a regular file. 301 * Create a regular file.
253 * For now, we don't implement O_EXCL. 302 * For now, we don't implement O_EXCL.
@@ -256,70 +305,60 @@ static int
256nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 305nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
257 int flags, struct nameidata *nd) 306 int flags, struct nameidata *nd)
258{ 307{
259 struct nfs_fh fhandle; 308 struct nfs3_createdata *data;
260 struct nfs_fattr fattr;
261 struct nfs_fattr dir_attr;
262 struct nfs3_createargs arg = {
263 .fh = NFS_FH(dir),
264 .name = dentry->d_name.name,
265 .len = dentry->d_name.len,
266 .sattr = sattr,
267 };
268 struct nfs3_diropres res = {
269 .dir_attr = &dir_attr,
270 .fh = &fhandle,
271 .fattr = &fattr
272 };
273 struct rpc_message msg = {
274 .rpc_proc = &nfs3_procedures[NFS3PROC_CREATE],
275 .rpc_argp = &arg,
276 .rpc_resp = &res,
277 };
278 mode_t mode = sattr->ia_mode; 309 mode_t mode = sattr->ia_mode;
279 int status; 310 int status = -ENOMEM;
280 311
281 dprintk("NFS call create %s\n", dentry->d_name.name); 312 dprintk("NFS call create %s\n", dentry->d_name.name);
282 arg.createmode = NFS3_CREATE_UNCHECKED; 313
314 data = nfs3_alloc_createdata();
315 if (data == NULL)
316 goto out;
317
318 data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_CREATE];
319 data->arg.create.fh = NFS_FH(dir);
320 data->arg.create.name = dentry->d_name.name;
321 data->arg.create.len = dentry->d_name.len;
322 data->arg.create.sattr = sattr;
323
324 data->arg.create.createmode = NFS3_CREATE_UNCHECKED;
283 if (flags & O_EXCL) { 325 if (flags & O_EXCL) {
284 arg.createmode = NFS3_CREATE_EXCLUSIVE; 326 data->arg.create.createmode = NFS3_CREATE_EXCLUSIVE;
285 arg.verifier[0] = jiffies; 327 data->arg.create.verifier[0] = jiffies;
286 arg.verifier[1] = current->pid; 328 data->arg.create.verifier[1] = current->pid;
287 } 329 }
288 330
289 sattr->ia_mode &= ~current->fs->umask; 331 sattr->ia_mode &= ~current->fs->umask;
290 332
291again: 333 for (;;) {
292 nfs_fattr_init(&dir_attr); 334 status = nfs3_do_create(dir, dentry, data);
293 nfs_fattr_init(&fattr);
294 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
295 nfs_refresh_inode(dir, &dir_attr);
296 335
297 /* If the server doesn't support the exclusive creation semantics, 336 if (status != -ENOTSUPP)
298 * try again with simple 'guarded' mode. */ 337 break;
299 if (status == -ENOTSUPP) { 338 /* If the server doesn't support the exclusive creation
300 switch (arg.createmode) { 339 * semantics, try again with simple 'guarded' mode. */
340 switch (data->arg.create.createmode) {
301 case NFS3_CREATE_EXCLUSIVE: 341 case NFS3_CREATE_EXCLUSIVE:
302 arg.createmode = NFS3_CREATE_GUARDED; 342 data->arg.create.createmode = NFS3_CREATE_GUARDED;
303 break; 343 break;
304 344
305 case NFS3_CREATE_GUARDED: 345 case NFS3_CREATE_GUARDED:
306 arg.createmode = NFS3_CREATE_UNCHECKED; 346 data->arg.create.createmode = NFS3_CREATE_UNCHECKED;
307 break; 347 break;
308 348
309 case NFS3_CREATE_UNCHECKED: 349 case NFS3_CREATE_UNCHECKED:
310 goto out; 350 goto out;
311 } 351 }
312 goto again; 352 nfs_fattr_init(data->res.dir_attr);
353 nfs_fattr_init(data->res.fattr);
313 } 354 }
314 355
315 if (status == 0)
316 status = nfs_instantiate(dentry, &fhandle, &fattr);
317 if (status != 0) 356 if (status != 0)
318 goto out; 357 goto out;
319 358
320 /* When we created the file with exclusive semantics, make 359 /* When we created the file with exclusive semantics, make
321 * sure we set the attributes afterwards. */ 360 * sure we set the attributes afterwards. */
322 if (arg.createmode == NFS3_CREATE_EXCLUSIVE) { 361 if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) {
323 dprintk("NFS call setattr (post-create)\n"); 362 dprintk("NFS call setattr (post-create)\n");
324 363
325 if (!(sattr->ia_valid & ATTR_ATIME_SET)) 364 if (!(sattr->ia_valid & ATTR_ATIME_SET))
@@ -330,14 +369,15 @@ again:
330 /* Note: we could use a guarded setattr here, but I'm 369 /* Note: we could use a guarded setattr here, but I'm
331 * not sure this buys us anything (and I'd have 370 * not sure this buys us anything (and I'd have
332 * to revamp the NFSv3 XDR code) */ 371 * to revamp the NFSv3 XDR code) */
333 status = nfs3_proc_setattr(dentry, &fattr, sattr); 372 status = nfs3_proc_setattr(dentry, data->res.fattr, sattr);
334 nfs_post_op_update_inode(dentry->d_inode, &fattr); 373 nfs_post_op_update_inode(dentry->d_inode, data->res.fattr);
335 dprintk("NFS reply setattr (post-create): %d\n", status); 374 dprintk("NFS reply setattr (post-create): %d\n", status);
375 if (status != 0)
376 goto out;
336 } 377 }
337 if (status != 0)
338 goto out;
339 status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); 378 status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode);
340out: 379out:
380 nfs3_free_createdata(data);
341 dprintk("NFS reply create: %d\n", status); 381 dprintk("NFS reply create: %d\n", status);
342 return status; 382 return status;
343} 383}
@@ -452,40 +492,28 @@ static int
452nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, 492nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
453 unsigned int len, struct iattr *sattr) 493 unsigned int len, struct iattr *sattr)
454{ 494{
455 struct nfs_fh fhandle; 495 struct nfs3_createdata *data;
456 struct nfs_fattr fattr, dir_attr; 496 int status = -ENOMEM;
457 struct nfs3_symlinkargs arg = {
458 .fromfh = NFS_FH(dir),
459 .fromname = dentry->d_name.name,
460 .fromlen = dentry->d_name.len,
461 .pages = &page,
462 .pathlen = len,
463 .sattr = sattr
464 };
465 struct nfs3_diropres res = {
466 .dir_attr = &dir_attr,
467 .fh = &fhandle,
468 .fattr = &fattr
469 };
470 struct rpc_message msg = {
471 .rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK],
472 .rpc_argp = &arg,
473 .rpc_resp = &res,
474 };
475 int status;
476 497
477 if (len > NFS3_MAXPATHLEN) 498 if (len > NFS3_MAXPATHLEN)
478 return -ENAMETOOLONG; 499 return -ENAMETOOLONG;
479 500
480 dprintk("NFS call symlink %s\n", dentry->d_name.name); 501 dprintk("NFS call symlink %s\n", dentry->d_name.name);
481 502
482 nfs_fattr_init(&dir_attr); 503 data = nfs3_alloc_createdata();
483 nfs_fattr_init(&fattr); 504 if (data == NULL)
484 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
485 nfs_post_op_update_inode(dir, &dir_attr);
486 if (status != 0)
487 goto out; 505 goto out;
488 status = nfs_instantiate(dentry, &fhandle, &fattr); 506 data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK];
507 data->arg.symlink.fromfh = NFS_FH(dir);
508 data->arg.symlink.fromname = dentry->d_name.name;
509 data->arg.symlink.fromlen = dentry->d_name.len;
510 data->arg.symlink.pages = &page;
511 data->arg.symlink.pathlen = len;
512 data->arg.symlink.sattr = sattr;
513
514 status = nfs3_do_create(dir, dentry, data);
515
516 nfs3_free_createdata(data);
489out: 517out:
490 dprintk("NFS reply symlink: %d\n", status); 518 dprintk("NFS reply symlink: %d\n", status);
491 return status; 519 return status;
@@ -494,42 +522,31 @@ out:
494static int 522static int
495nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) 523nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
496{ 524{
497 struct nfs_fh fhandle; 525 struct nfs3_createdata *data;
498 struct nfs_fattr fattr, dir_attr;
499 struct nfs3_mkdirargs arg = {
500 .fh = NFS_FH(dir),
501 .name = dentry->d_name.name,
502 .len = dentry->d_name.len,
503 .sattr = sattr
504 };
505 struct nfs3_diropres res = {
506 .dir_attr = &dir_attr,
507 .fh = &fhandle,
508 .fattr = &fattr
509 };
510 struct rpc_message msg = {
511 .rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR],
512 .rpc_argp = &arg,
513 .rpc_resp = &res,
514 };
515 int mode = sattr->ia_mode; 526 int mode = sattr->ia_mode;
516 int status; 527 int status = -ENOMEM;
517 528
518 dprintk("NFS call mkdir %s\n", dentry->d_name.name); 529 dprintk("NFS call mkdir %s\n", dentry->d_name.name);
519 530
520 sattr->ia_mode &= ~current->fs->umask; 531 sattr->ia_mode &= ~current->fs->umask;
521 532
522 nfs_fattr_init(&dir_attr); 533 data = nfs3_alloc_createdata();
523 nfs_fattr_init(&fattr); 534 if (data == NULL)
524 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
525 nfs_post_op_update_inode(dir, &dir_attr);
526 if (status != 0)
527 goto out; 535 goto out;
528 status = nfs_instantiate(dentry, &fhandle, &fattr); 536
537 data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR];
538 data->arg.mkdir.fh = NFS_FH(dir);
539 data->arg.mkdir.name = dentry->d_name.name;
540 data->arg.mkdir.len = dentry->d_name.len;
541 data->arg.mkdir.sattr = sattr;
542
543 status = nfs3_do_create(dir, dentry, data);
529 if (status != 0) 544 if (status != 0)
530 goto out; 545 goto out;
546
531 status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); 547 status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode);
532out: 548out:
549 nfs3_free_createdata(data);
533 dprintk("NFS reply mkdir: %d\n", status); 550 dprintk("NFS reply mkdir: %d\n", status);
534 return status; 551 return status;
535} 552}
@@ -615,52 +632,50 @@ static int
615nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 632nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
616 dev_t rdev) 633 dev_t rdev)
617{ 634{
618 struct nfs_fh fh; 635 struct nfs3_createdata *data;
619 struct nfs_fattr fattr, dir_attr;
620 struct nfs3_mknodargs arg = {
621 .fh = NFS_FH(dir),
622 .name = dentry->d_name.name,
623 .len = dentry->d_name.len,
624 .sattr = sattr,
625 .rdev = rdev
626 };
627 struct nfs3_diropres res = {
628 .dir_attr = &dir_attr,
629 .fh = &fh,
630 .fattr = &fattr
631 };
632 struct rpc_message msg = {
633 .rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD],
634 .rpc_argp = &arg,
635 .rpc_resp = &res,
636 };
637 mode_t mode = sattr->ia_mode; 636 mode_t mode = sattr->ia_mode;
638 int status; 637 int status = -ENOMEM;
639
640 switch (sattr->ia_mode & S_IFMT) {
641 case S_IFBLK: arg.type = NF3BLK; break;
642 case S_IFCHR: arg.type = NF3CHR; break;
643 case S_IFIFO: arg.type = NF3FIFO; break;
644 case S_IFSOCK: arg.type = NF3SOCK; break;
645 default: return -EINVAL;
646 }
647 638
648 dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, 639 dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name,
649 MAJOR(rdev), MINOR(rdev)); 640 MAJOR(rdev), MINOR(rdev));
650 641
651 sattr->ia_mode &= ~current->fs->umask; 642 sattr->ia_mode &= ~current->fs->umask;
652 643
653 nfs_fattr_init(&dir_attr); 644 data = nfs3_alloc_createdata();
654 nfs_fattr_init(&fattr); 645 if (data == NULL)
655 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
656 nfs_post_op_update_inode(dir, &dir_attr);
657 if (status != 0)
658 goto out; 646 goto out;
659 status = nfs_instantiate(dentry, &fh, &fattr); 647
648 data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD];
649 data->arg.mknod.fh = NFS_FH(dir);
650 data->arg.mknod.name = dentry->d_name.name;
651 data->arg.mknod.len = dentry->d_name.len;
652 data->arg.mknod.sattr = sattr;
653 data->arg.mknod.rdev = rdev;
654
655 switch (sattr->ia_mode & S_IFMT) {
656 case S_IFBLK:
657 data->arg.mknod.type = NF3BLK;
658 break;
659 case S_IFCHR:
660 data->arg.mknod.type = NF3CHR;
661 break;
662 case S_IFIFO:
663 data->arg.mknod.type = NF3FIFO;
664 break;
665 case S_IFSOCK:
666 data->arg.mknod.type = NF3SOCK;
667 break;
668 default:
669 status = -EINVAL;
670 goto out;
671 }
672
673 status = nfs3_do_create(dir, dentry, data);
660 if (status != 0) 674 if (status != 0)
661 goto out; 675 goto out;
662 status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); 676 status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode);
663out: 677out:
678 nfs3_free_createdata(data);
664 dprintk("NFS reply mknod: %d\n", status); 679 dprintk("NFS reply mknod: %d\n", status);
665 return status; 680 return status;
666} 681}
@@ -801,8 +816,6 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
801 .write_done = nfs3_write_done, 816 .write_done = nfs3_write_done,
802 .commit_setup = nfs3_proc_commit_setup, 817 .commit_setup = nfs3_proc_commit_setup,
803 .commit_done = nfs3_commit_done, 818 .commit_done = nfs3_commit_done,
804 .file_open = nfs_open,
805 .file_release = nfs_release,
806 .lock = nfs3_proc_lock, 819 .lock = nfs3_proc_lock,
807 .clear_acl_cache = nfs3_forget_cached_acls, 820 .clear_acl_cache = nfs3_forget_cached_acls,
808}; 821};
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1293e0acd82b..c910413eaeca 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -451,9 +451,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
451 /* Save the delegation */ 451 /* Save the delegation */
452 memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); 452 memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data));
453 rcu_read_unlock(); 453 rcu_read_unlock();
454 lock_kernel();
455 ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); 454 ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode);
456 unlock_kernel();
457 if (ret != 0) 455 if (ret != 0)
458 goto out; 456 goto out;
459 ret = -EAGAIN; 457 ret = -EAGAIN;
@@ -1139,8 +1137,9 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, int
1139 return res; 1137 return res;
1140} 1138}
1141 1139
1142static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, 1140static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
1143 struct iattr *sattr, struct nfs4_state *state) 1141 struct nfs_fattr *fattr, struct iattr *sattr,
1142 struct nfs4_state *state)
1144{ 1143{
1145 struct nfs_server *server = NFS_SERVER(inode); 1144 struct nfs_server *server = NFS_SERVER(inode);
1146 struct nfs_setattrargs arg = { 1145 struct nfs_setattrargs arg = {
@@ -1154,9 +1153,10 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
1154 .server = server, 1153 .server = server,
1155 }; 1154 };
1156 struct rpc_message msg = { 1155 struct rpc_message msg = {
1157 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], 1156 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
1158 .rpc_argp = &arg, 1157 .rpc_argp = &arg,
1159 .rpc_resp = &res, 1158 .rpc_resp = &res,
1159 .rpc_cred = cred,
1160 }; 1160 };
1161 unsigned long timestamp = jiffies; 1161 unsigned long timestamp = jiffies;
1162 int status; 1162 int status;
@@ -1166,7 +1166,6 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
1166 if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { 1166 if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) {
1167 /* Use that stateid */ 1167 /* Use that stateid */
1168 } else if (state != NULL) { 1168 } else if (state != NULL) {
1169 msg.rpc_cred = state->owner->so_cred;
1170 nfs4_copy_stateid(&arg.stateid, state, current->files); 1169 nfs4_copy_stateid(&arg.stateid, state, current->files);
1171 } else 1170 } else
1172 memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); 1171 memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
@@ -1177,15 +1176,16 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
1177 return status; 1176 return status;
1178} 1177}
1179 1178
1180static int nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, 1179static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
1181 struct iattr *sattr, struct nfs4_state *state) 1180 struct nfs_fattr *fattr, struct iattr *sattr,
1181 struct nfs4_state *state)
1182{ 1182{
1183 struct nfs_server *server = NFS_SERVER(inode); 1183 struct nfs_server *server = NFS_SERVER(inode);
1184 struct nfs4_exception exception = { }; 1184 struct nfs4_exception exception = { };
1185 int err; 1185 int err;
1186 do { 1186 do {
1187 err = nfs4_handle_exception(server, 1187 err = nfs4_handle_exception(server,
1188 _nfs4_do_setattr(inode, fattr, sattr, state), 1188 _nfs4_do_setattr(inode, cred, fattr, sattr, state),
1189 &exception); 1189 &exception);
1190 } while (exception.retry); 1190 } while (exception.retry);
1191 return err; 1191 return err;
@@ -1647,29 +1647,25 @@ static int
1647nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, 1647nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
1648 struct iattr *sattr) 1648 struct iattr *sattr)
1649{ 1649{
1650 struct rpc_cred *cred;
1651 struct inode *inode = dentry->d_inode; 1650 struct inode *inode = dentry->d_inode;
1652 struct nfs_open_context *ctx; 1651 struct rpc_cred *cred = NULL;
1653 struct nfs4_state *state = NULL; 1652 struct nfs4_state *state = NULL;
1654 int status; 1653 int status;
1655 1654
1656 nfs_fattr_init(fattr); 1655 nfs_fattr_init(fattr);
1657 1656
1658 cred = rpc_lookup_cred();
1659 if (IS_ERR(cred))
1660 return PTR_ERR(cred);
1661
1662 /* Search for an existing open(O_WRITE) file */ 1657 /* Search for an existing open(O_WRITE) file */
1663 ctx = nfs_find_open_context(inode, cred, FMODE_WRITE); 1658 if (sattr->ia_valid & ATTR_FILE) {
1664 if (ctx != NULL) 1659 struct nfs_open_context *ctx;
1660
1661 ctx = nfs_file_open_context(sattr->ia_file);
1662 cred = ctx->cred;
1665 state = ctx->state; 1663 state = ctx->state;
1664 }
1666 1665
1667 status = nfs4_do_setattr(inode, fattr, sattr, state); 1666 status = nfs4_do_setattr(inode, cred, fattr, sattr, state);
1668 if (status == 0) 1667 if (status == 0)
1669 nfs_setattr_update_inode(inode, sattr); 1668 nfs_setattr_update_inode(inode, sattr);
1670 if (ctx != NULL)
1671 put_nfs_open_context(ctx);
1672 put_rpccred(cred);
1673 return status; 1669 return status;
1674} 1670}
1675 1671
@@ -1897,17 +1893,16 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
1897 goto out; 1893 goto out;
1898 } 1894 }
1899 state = nfs4_do_open(dir, &path, flags, sattr, cred); 1895 state = nfs4_do_open(dir, &path, flags, sattr, cred);
1900 put_rpccred(cred);
1901 d_drop(dentry); 1896 d_drop(dentry);
1902 if (IS_ERR(state)) { 1897 if (IS_ERR(state)) {
1903 status = PTR_ERR(state); 1898 status = PTR_ERR(state);
1904 goto out; 1899 goto out_putcred;
1905 } 1900 }
1906 d_add(dentry, igrab(state->inode)); 1901 d_add(dentry, igrab(state->inode));
1907 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1902 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1908 if (flags & O_EXCL) { 1903 if (flags & O_EXCL) {
1909 struct nfs_fattr fattr; 1904 struct nfs_fattr fattr;
1910 status = nfs4_do_setattr(state->inode, &fattr, sattr, state); 1905 status = nfs4_do_setattr(state->inode, cred, &fattr, sattr, state);
1911 if (status == 0) 1906 if (status == 0)
1912 nfs_setattr_update_inode(state->inode, sattr); 1907 nfs_setattr_update_inode(state->inode, sattr);
1913 nfs_post_op_update_inode(state->inode, &fattr); 1908 nfs_post_op_update_inode(state->inode, &fattr);
@@ -1916,6 +1911,8 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
1916 status = nfs4_intent_set_file(nd, &path, state); 1911 status = nfs4_intent_set_file(nd, &path, state);
1917 else 1912 else
1918 nfs4_close_sync(&path, state, flags); 1913 nfs4_close_sync(&path, state, flags);
1914out_putcred:
1915 put_rpccred(cred);
1919out: 1916out:
1920 return status; 1917 return status;
1921} 1918}
@@ -2079,47 +2076,81 @@ static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *n
2079 return err; 2076 return err;
2080} 2077}
2081 2078
2079struct nfs4_createdata {
2080 struct rpc_message msg;
2081 struct nfs4_create_arg arg;
2082 struct nfs4_create_res res;
2083 struct nfs_fh fh;
2084 struct nfs_fattr fattr;
2085 struct nfs_fattr dir_fattr;
2086};
2087
2088static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
2089 struct qstr *name, struct iattr *sattr, u32 ftype)
2090{
2091 struct nfs4_createdata *data;
2092
2093 data = kzalloc(sizeof(*data), GFP_KERNEL);
2094 if (data != NULL) {
2095 struct nfs_server *server = NFS_SERVER(dir);
2096
2097 data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE];
2098 data->msg.rpc_argp = &data->arg;
2099 data->msg.rpc_resp = &data->res;
2100 data->arg.dir_fh = NFS_FH(dir);
2101 data->arg.server = server;
2102 data->arg.name = name;
2103 data->arg.attrs = sattr;
2104 data->arg.ftype = ftype;
2105 data->arg.bitmask = server->attr_bitmask;
2106 data->res.server = server;
2107 data->res.fh = &data->fh;
2108 data->res.fattr = &data->fattr;
2109 data->res.dir_fattr = &data->dir_fattr;
2110 nfs_fattr_init(data->res.fattr);
2111 nfs_fattr_init(data->res.dir_fattr);
2112 }
2113 return data;
2114}
2115
2116static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
2117{
2118 int status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
2119 if (status == 0) {
2120 update_changeattr(dir, &data->res.dir_cinfo);
2121 nfs_post_op_update_inode(dir, data->res.dir_fattr);
2122 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
2123 }
2124 return status;
2125}
2126
2127static void nfs4_free_createdata(struct nfs4_createdata *data)
2128{
2129 kfree(data);
2130}
2131
2082static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, 2132static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
2083 struct page *page, unsigned int len, struct iattr *sattr) 2133 struct page *page, unsigned int len, struct iattr *sattr)
2084{ 2134{
2085 struct nfs_server *server = NFS_SERVER(dir); 2135 struct nfs4_createdata *data;
2086 struct nfs_fh fhandle; 2136 int status = -ENAMETOOLONG;
2087 struct nfs_fattr fattr, dir_fattr;
2088 struct nfs4_create_arg arg = {
2089 .dir_fh = NFS_FH(dir),
2090 .server = server,
2091 .name = &dentry->d_name,
2092 .attrs = sattr,
2093 .ftype = NF4LNK,
2094 .bitmask = server->attr_bitmask,
2095 };
2096 struct nfs4_create_res res = {
2097 .server = server,
2098 .fh = &fhandle,
2099 .fattr = &fattr,
2100 .dir_fattr = &dir_fattr,
2101 };
2102 struct rpc_message msg = {
2103 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK],
2104 .rpc_argp = &arg,
2105 .rpc_resp = &res,
2106 };
2107 int status;
2108 2137
2109 if (len > NFS4_MAXPATHLEN) 2138 if (len > NFS4_MAXPATHLEN)
2110 return -ENAMETOOLONG; 2139 goto out;
2111 2140
2112 arg.u.symlink.pages = &page; 2141 status = -ENOMEM;
2113 arg.u.symlink.len = len; 2142 data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4LNK);
2114 nfs_fattr_init(&fattr); 2143 if (data == NULL)
2115 nfs_fattr_init(&dir_fattr); 2144 goto out;
2145
2146 data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK];
2147 data->arg.u.symlink.pages = &page;
2148 data->arg.u.symlink.len = len;
2116 2149
2117 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 2150 status = nfs4_do_create(dir, dentry, data);
2118 if (!status) { 2151
2119 update_changeattr(dir, &res.dir_cinfo); 2152 nfs4_free_createdata(data);
2120 nfs_post_op_update_inode(dir, res.dir_fattr); 2153out:
2121 status = nfs_instantiate(dentry, &fhandle, &fattr);
2122 }
2123 return status; 2154 return status;
2124} 2155}
2125 2156
@@ -2140,39 +2171,17 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
2140static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, 2171static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
2141 struct iattr *sattr) 2172 struct iattr *sattr)
2142{ 2173{
2143 struct nfs_server *server = NFS_SERVER(dir); 2174 struct nfs4_createdata *data;
2144 struct nfs_fh fhandle; 2175 int status = -ENOMEM;
2145 struct nfs_fattr fattr, dir_fattr;
2146 struct nfs4_create_arg arg = {
2147 .dir_fh = NFS_FH(dir),
2148 .server = server,
2149 .name = &dentry->d_name,
2150 .attrs = sattr,
2151 .ftype = NF4DIR,
2152 .bitmask = server->attr_bitmask,
2153 };
2154 struct nfs4_create_res res = {
2155 .server = server,
2156 .fh = &fhandle,
2157 .fattr = &fattr,
2158 .dir_fattr = &dir_fattr,
2159 };
2160 struct rpc_message msg = {
2161 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE],
2162 .rpc_argp = &arg,
2163 .rpc_resp = &res,
2164 };
2165 int status;
2166 2176
2167 nfs_fattr_init(&fattr); 2177 data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR);
2168 nfs_fattr_init(&dir_fattr); 2178 if (data == NULL)
2169 2179 goto out;
2170 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 2180
2171 if (!status) { 2181 status = nfs4_do_create(dir, dentry, data);
2172 update_changeattr(dir, &res.dir_cinfo); 2182
2173 nfs_post_op_update_inode(dir, res.dir_fattr); 2183 nfs4_free_createdata(data);
2174 status = nfs_instantiate(dentry, &fhandle, &fattr); 2184out:
2175 }
2176 return status; 2185 return status;
2177} 2186}
2178 2187
@@ -2242,56 +2251,34 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2242static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, 2251static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
2243 struct iattr *sattr, dev_t rdev) 2252 struct iattr *sattr, dev_t rdev)
2244{ 2253{
2245 struct nfs_server *server = NFS_SERVER(dir); 2254 struct nfs4_createdata *data;
2246 struct nfs_fh fh; 2255 int mode = sattr->ia_mode;
2247 struct nfs_fattr fattr, dir_fattr; 2256 int status = -ENOMEM;
2248 struct nfs4_create_arg arg = {
2249 .dir_fh = NFS_FH(dir),
2250 .server = server,
2251 .name = &dentry->d_name,
2252 .attrs = sattr,
2253 .bitmask = server->attr_bitmask,
2254 };
2255 struct nfs4_create_res res = {
2256 .server = server,
2257 .fh = &fh,
2258 .fattr = &fattr,
2259 .dir_fattr = &dir_fattr,
2260 };
2261 struct rpc_message msg = {
2262 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE],
2263 .rpc_argp = &arg,
2264 .rpc_resp = &res,
2265 };
2266 int status;
2267 int mode = sattr->ia_mode;
2268
2269 nfs_fattr_init(&fattr);
2270 nfs_fattr_init(&dir_fattr);
2271 2257
2272 BUG_ON(!(sattr->ia_valid & ATTR_MODE)); 2258 BUG_ON(!(sattr->ia_valid & ATTR_MODE));
2273 BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode)); 2259 BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode));
2260
2261 data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4SOCK);
2262 if (data == NULL)
2263 goto out;
2264
2274 if (S_ISFIFO(mode)) 2265 if (S_ISFIFO(mode))
2275 arg.ftype = NF4FIFO; 2266 data->arg.ftype = NF4FIFO;
2276 else if (S_ISBLK(mode)) { 2267 else if (S_ISBLK(mode)) {
2277 arg.ftype = NF4BLK; 2268 data->arg.ftype = NF4BLK;
2278 arg.u.device.specdata1 = MAJOR(rdev); 2269 data->arg.u.device.specdata1 = MAJOR(rdev);
2279 arg.u.device.specdata2 = MINOR(rdev); 2270 data->arg.u.device.specdata2 = MINOR(rdev);
2280 } 2271 }
2281 else if (S_ISCHR(mode)) { 2272 else if (S_ISCHR(mode)) {
2282 arg.ftype = NF4CHR; 2273 data->arg.ftype = NF4CHR;
2283 arg.u.device.specdata1 = MAJOR(rdev); 2274 data->arg.u.device.specdata1 = MAJOR(rdev);
2284 arg.u.device.specdata2 = MINOR(rdev); 2275 data->arg.u.device.specdata2 = MINOR(rdev);
2285 } 2276 }
2286 else
2287 arg.ftype = NF4SOCK;
2288 2277
2289 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 2278 status = nfs4_do_create(dir, dentry, data);
2290 if (status == 0) { 2279
2291 update_changeattr(dir, &res.dir_cinfo); 2280 nfs4_free_createdata(data);
2292 nfs_post_op_update_inode(dir, res.dir_fattr); 2281out:
2293 status = nfs_instantiate(dentry, &fh, &fattr);
2294 }
2295 return status; 2282 return status;
2296} 2283}
2297 2284
@@ -2706,6 +2693,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
2706 ret = nfs_revalidate_inode(server, inode); 2693 ret = nfs_revalidate_inode(server, inode);
2707 if (ret < 0) 2694 if (ret < 0)
2708 return ret; 2695 return ret;
2696 if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
2697 nfs_zap_acl_cache(inode);
2709 ret = nfs4_read_cached_acl(inode, buf, buflen); 2698 ret = nfs4_read_cached_acl(inode, buf, buflen);
2710 if (ret != -ENOENT) 2699 if (ret != -ENOENT)
2711 return ret; 2700 return ret;
@@ -2733,7 +2722,8 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
2733 nfs_inode_return_delegation(inode); 2722 nfs_inode_return_delegation(inode);
2734 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); 2723 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
2735 ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 2724 ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
2736 nfs_zap_caches(inode); 2725 nfs_access_zap_cache(inode);
2726 nfs_zap_acl_cache(inode);
2737 return ret; 2727 return ret;
2738} 2728}
2739 2729
@@ -2767,8 +2757,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server)
2767 task->tk_status = 0; 2757 task->tk_status = 0;
2768 return -EAGAIN; 2758 return -EAGAIN;
2769 case -NFS4ERR_DELAY: 2759 case -NFS4ERR_DELAY:
2770 nfs_inc_server_stats((struct nfs_server *) server, 2760 nfs_inc_server_stats(server, NFSIOS_DELAY);
2771 NFSIOS_DELAY);
2772 case -NFS4ERR_GRACE: 2761 case -NFS4ERR_GRACE:
2773 rpc_delay(task, NFS4_POLL_RETRY_MAX); 2762 rpc_delay(task, NFS4_POLL_RETRY_MAX);
2774 task->tk_status = 0; 2763 task->tk_status = 0;
@@ -2933,7 +2922,7 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cre
2933 2922
2934int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) 2923int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred)
2935{ 2924{
2936 long timeout; 2925 long timeout = 0;
2937 int err; 2926 int err;
2938 do { 2927 do {
2939 err = _nfs4_proc_setclientid_confirm(clp, cred); 2928 err = _nfs4_proc_setclientid_confirm(clp, cred);
@@ -3725,8 +3714,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
3725 .write_done = nfs4_write_done, 3714 .write_done = nfs4_write_done,
3726 .commit_setup = nfs4_proc_commit_setup, 3715 .commit_setup = nfs4_proc_commit_setup,
3727 .commit_done = nfs4_commit_done, 3716 .commit_done = nfs4_commit_done,
3728 .file_open = nfs_open,
3729 .file_release = nfs_release,
3730 .lock = nfs4_proc_lock, 3717 .lock = nfs4_proc_lock,
3731 .clear_acl_cache = nfs4_zap_acl_attr, 3718 .clear_acl_cache = nfs4_zap_acl_attr,
3732}; 3719};
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 856a8934f610..401ef8b28f97 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -940,7 +940,6 @@ static int reclaimer(void *ptr)
940 allow_signal(SIGKILL); 940 allow_signal(SIGKILL);
941 941
942 /* Ensure exclusive access to NFSv4 state */ 942 /* Ensure exclusive access to NFSv4 state */
943 lock_kernel();
944 down_write(&clp->cl_sem); 943 down_write(&clp->cl_sem);
945 /* Are there any NFS mounts out there? */ 944 /* Are there any NFS mounts out there? */
946 if (list_empty(&clp->cl_superblocks)) 945 if (list_empty(&clp->cl_superblocks))
@@ -1000,7 +999,6 @@ restart_loop:
1000 nfs_delegation_reap_unclaimed(clp); 999 nfs_delegation_reap_unclaimed(clp);
1001out: 1000out:
1002 up_write(&clp->cl_sem); 1001 up_write(&clp->cl_sem);
1003 unlock_kernel();
1004 if (status == -NFS4ERR_CB_PATH_DOWN) 1002 if (status == -NFS4ERR_CB_PATH_DOWN)
1005 nfs_handle_cb_pathdown(clp); 1003 nfs_handle_cb_pathdown(clp);
1006 nfs4_clear_recover_bit(clp); 1004 nfs4_clear_recover_bit(clp);
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 531379d36823..46763d1cd397 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * $Id: nfsroot.c,v 1.45 1998/03/07 10:44:46 mj Exp $
3 *
4 * Copyright (C) 1995, 1996 Gero Kuhlmann <gero@gkminix.han.de> 2 * Copyright (C) 1995, 1996 Gero Kuhlmann <gero@gkminix.han.de>
5 * 3 *
6 * Allow an NFS filesystem to be mounted as root. The way this works is: 4 * Allow an NFS filesystem to be mounted as root. The way this works is:
@@ -297,10 +295,10 @@ static int __init root_nfs_name(char *name)
297 nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */ 295 nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */
298 nfs_data.rsize = NFS_DEF_FILE_IO_SIZE; 296 nfs_data.rsize = NFS_DEF_FILE_IO_SIZE;
299 nfs_data.wsize = NFS_DEF_FILE_IO_SIZE; 297 nfs_data.wsize = NFS_DEF_FILE_IO_SIZE;
300 nfs_data.acregmin = 3; 298 nfs_data.acregmin = NFS_DEF_ACREGMIN;
301 nfs_data.acregmax = 60; 299 nfs_data.acregmax = NFS_DEF_ACREGMAX;
302 nfs_data.acdirmin = 30; 300 nfs_data.acdirmin = NFS_DEF_ACDIRMIN;
303 nfs_data.acdirmax = 60; 301 nfs_data.acdirmax = NFS_DEF_ACDIRMAX;
304 strcpy(buf, NFS_ROOT); 302 strcpy(buf, NFS_ROOT);
305 303
306 /* Process options received from the remote server */ 304 /* Process options received from the remote server */
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 03599bfe81cf..4dbb84df1b68 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -129,6 +129,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
129 sattr->ia_mode &= S_IALLUGO; 129 sattr->ia_mode &= S_IALLUGO;
130 130
131 dprintk("NFS call setattr\n"); 131 dprintk("NFS call setattr\n");
132 if (sattr->ia_valid & ATTR_FILE)
133 msg.rpc_cred = nfs_file_cred(sattr->ia_file);
132 nfs_fattr_init(fattr); 134 nfs_fattr_init(fattr);
133 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 135 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
134 if (status == 0) 136 if (status == 0)
@@ -598,6 +600,29 @@ nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
598 return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); 600 return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
599} 601}
600 602
603/* Helper functions for NFS lock bounds checking */
604#define NFS_LOCK32_OFFSET_MAX ((__s32)0x7fffffffUL)
605static int nfs_lock_check_bounds(const struct file_lock *fl)
606{
607 __s32 start, end;
608
609 start = (__s32)fl->fl_start;
610 if ((loff_t)start != fl->fl_start)
611 goto out_einval;
612
613 if (fl->fl_end != OFFSET_MAX) {
614 end = (__s32)fl->fl_end;
615 if ((loff_t)end != fl->fl_end)
616 goto out_einval;
617 } else
618 end = NFS_LOCK32_OFFSET_MAX;
619
620 if (start < 0 || start > end)
621 goto out_einval;
622 return 0;
623out_einval:
624 return -EINVAL;
625}
601 626
602const struct nfs_rpc_ops nfs_v2_clientops = { 627const struct nfs_rpc_ops nfs_v2_clientops = {
603 .version = 2, /* protocol version */ 628 .version = 2, /* protocol version */
@@ -630,7 +655,6 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
630 .write_setup = nfs_proc_write_setup, 655 .write_setup = nfs_proc_write_setup,
631 .write_done = nfs_write_done, 656 .write_done = nfs_write_done,
632 .commit_setup = nfs_proc_commit_setup, 657 .commit_setup = nfs_proc_commit_setup,
633 .file_open = nfs_open,
634 .file_release = nfs_release,
635 .lock = nfs_proc_lock, 658 .lock = nfs_proc_lock,
659 .lock_check_bounds = nfs_lock_check_bounds,
636}; 660};
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 614efeed5437..1b94e3650f5c 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -47,6 +47,7 @@
47#include <linux/inet.h> 47#include <linux/inet.h>
48#include <linux/in6.h> 48#include <linux/in6.h>
49#include <net/ipv6.h> 49#include <net/ipv6.h>
50#include <linux/netdevice.h>
50#include <linux/nfs_xdr.h> 51#include <linux/nfs_xdr.h>
51#include <linux/magic.h> 52#include <linux/magic.h>
52#include <linux/parser.h> 53#include <linux/parser.h>
@@ -65,7 +66,6 @@
65enum { 66enum {
66 /* Mount options that take no arguments */ 67 /* Mount options that take no arguments */
67 Opt_soft, Opt_hard, 68 Opt_soft, Opt_hard,
68 Opt_intr, Opt_nointr,
69 Opt_posix, Opt_noposix, 69 Opt_posix, Opt_noposix,
70 Opt_cto, Opt_nocto, 70 Opt_cto, Opt_nocto,
71 Opt_ac, Opt_noac, 71 Opt_ac, Opt_noac,
@@ -92,8 +92,8 @@ enum {
92 Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, 92 Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
93 Opt_addr, Opt_mountaddr, Opt_clientaddr, 93 Opt_addr, Opt_mountaddr, Opt_clientaddr,
94 94
95 /* Mount options that are ignored */ 95 /* Special mount options */
96 Opt_userspace, Opt_deprecated, 96 Opt_userspace, Opt_deprecated, Opt_sloppy,
97 97
98 Opt_err 98 Opt_err
99}; 99};
@@ -101,10 +101,14 @@ enum {
101static match_table_t nfs_mount_option_tokens = { 101static match_table_t nfs_mount_option_tokens = {
102 { Opt_userspace, "bg" }, 102 { Opt_userspace, "bg" },
103 { Opt_userspace, "fg" }, 103 { Opt_userspace, "fg" },
104 { Opt_userspace, "retry=%s" },
105
106 { Opt_sloppy, "sloppy" },
107
104 { Opt_soft, "soft" }, 108 { Opt_soft, "soft" },
105 { Opt_hard, "hard" }, 109 { Opt_hard, "hard" },
106 { Opt_intr, "intr" }, 110 { Opt_deprecated, "intr" },
107 { Opt_nointr, "nointr" }, 111 { Opt_deprecated, "nointr" },
108 { Opt_posix, "posix" }, 112 { Opt_posix, "posix" },
109 { Opt_noposix, "noposix" }, 113 { Opt_noposix, "noposix" },
110 { Opt_cto, "cto" }, 114 { Opt_cto, "cto" },
@@ -136,7 +140,6 @@ static match_table_t nfs_mount_option_tokens = {
136 { Opt_acdirmin, "acdirmin=%u" }, 140 { Opt_acdirmin, "acdirmin=%u" },
137 { Opt_acdirmax, "acdirmax=%u" }, 141 { Opt_acdirmax, "acdirmax=%u" },
138 { Opt_actimeo, "actimeo=%u" }, 142 { Opt_actimeo, "actimeo=%u" },
139 { Opt_userspace, "retry=%u" },
140 { Opt_namelen, "namlen=%u" }, 143 { Opt_namelen, "namlen=%u" },
141 { Opt_mountport, "mountport=%u" }, 144 { Opt_mountport, "mountport=%u" },
142 { Opt_mountvers, "mountvers=%u" }, 145 { Opt_mountvers, "mountvers=%u" },
@@ -207,6 +210,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type,
207 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 210 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
208static void nfs_kill_super(struct super_block *); 211static void nfs_kill_super(struct super_block *);
209static void nfs_put_super(struct super_block *); 212static void nfs_put_super(struct super_block *);
213static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
210 214
211static struct file_system_type nfs_fs_type = { 215static struct file_system_type nfs_fs_type = {
212 .owner = THIS_MODULE, 216 .owner = THIS_MODULE,
@@ -234,6 +238,7 @@ static const struct super_operations nfs_sops = {
234 .umount_begin = nfs_umount_begin, 238 .umount_begin = nfs_umount_begin,
235 .show_options = nfs_show_options, 239 .show_options = nfs_show_options,
236 .show_stats = nfs_show_stats, 240 .show_stats = nfs_show_stats,
241 .remount_fs = nfs_remount,
237}; 242};
238 243
239#ifdef CONFIG_NFS_V4 244#ifdef CONFIG_NFS_V4
@@ -278,6 +283,7 @@ static const struct super_operations nfs4_sops = {
278 .umount_begin = nfs_umount_begin, 283 .umount_begin = nfs_umount_begin,
279 .show_options = nfs_show_options, 284 .show_options = nfs_show_options,
280 .show_stats = nfs_show_stats, 285 .show_stats = nfs_show_stats,
286 .remount_fs = nfs_remount,
281}; 287};
282#endif 288#endif
283 289
@@ -368,8 +374,6 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
368 }; 374 };
369 int error; 375 int error;
370 376
371 lock_kernel();
372
373 error = server->nfs_client->rpc_ops->statfs(server, fh, &res); 377 error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
374 if (error < 0) 378 if (error < 0)
375 goto out_err; 379 goto out_err;
@@ -401,12 +405,10 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
401 405
402 buf->f_namelen = server->namelen; 406 buf->f_namelen = server->namelen;
403 407
404 unlock_kernel();
405 return 0; 408 return 0;
406 409
407 out_err: 410 out_err:
408 dprintk("%s: statfs error = %d\n", __func__, -error); 411 dprintk("%s: statfs error = %d\n", __func__, -error);
409 unlock_kernel();
410 return error; 412 return error;
411} 413}
412 414
@@ -514,13 +516,13 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
514 if (nfss->bsize != 0) 516 if (nfss->bsize != 0)
515 seq_printf(m, ",bsize=%u", nfss->bsize); 517 seq_printf(m, ",bsize=%u", nfss->bsize);
516 seq_printf(m, ",namlen=%u", nfss->namelen); 518 seq_printf(m, ",namlen=%u", nfss->namelen);
517 if (nfss->acregmin != 3*HZ || showdefaults) 519 if (nfss->acregmin != NFS_DEF_ACREGMIN*HZ || showdefaults)
518 seq_printf(m, ",acregmin=%u", nfss->acregmin/HZ); 520 seq_printf(m, ",acregmin=%u", nfss->acregmin/HZ);
519 if (nfss->acregmax != 60*HZ || showdefaults) 521 if (nfss->acregmax != NFS_DEF_ACREGMAX*HZ || showdefaults)
520 seq_printf(m, ",acregmax=%u", nfss->acregmax/HZ); 522 seq_printf(m, ",acregmax=%u", nfss->acregmax/HZ);
521 if (nfss->acdirmin != 30*HZ || showdefaults) 523 if (nfss->acdirmin != NFS_DEF_ACDIRMIN*HZ || showdefaults)
522 seq_printf(m, ",acdirmin=%u", nfss->acdirmin/HZ); 524 seq_printf(m, ",acdirmin=%u", nfss->acdirmin/HZ);
523 if (nfss->acdirmax != 60*HZ || showdefaults) 525 if (nfss->acdirmax != NFS_DEF_ACDIRMAX*HZ || showdefaults)
524 seq_printf(m, ",acdirmax=%u", nfss->acdirmax/HZ); 526 seq_printf(m, ",acdirmax=%u", nfss->acdirmax/HZ);
525 for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) { 527 for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
526 if (nfss->flags & nfs_infop->flag) 528 if (nfss->flags & nfs_infop->flag)
@@ -702,49 +704,233 @@ static int nfs_verify_server_address(struct sockaddr *addr)
702 return 0; 704 return 0;
703} 705}
704 706
707static void nfs_parse_ipv4_address(char *string, size_t str_len,
708 struct sockaddr *sap, size_t *addr_len)
709{
710 struct sockaddr_in *sin = (struct sockaddr_in *)sap;
711 u8 *addr = (u8 *)&sin->sin_addr.s_addr;
712
713 if (str_len <= INET_ADDRSTRLEN) {
714 dfprintk(MOUNT, "NFS: parsing IPv4 address %*s\n",
715 (int)str_len, string);
716
717 sin->sin_family = AF_INET;
718 *addr_len = sizeof(*sin);
719 if (in4_pton(string, str_len, addr, '\0', NULL))
720 return;
721 }
722
723 sap->sa_family = AF_UNSPEC;
724 *addr_len = 0;
725}
726
727#define IPV6_SCOPE_DELIMITER '%'
728
729#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
730static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
731 const char *delim,
732 struct sockaddr_in6 *sin6)
733{
734 char *p;
735 size_t len;
736
737 if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
738 return ;
739 if (*delim != IPV6_SCOPE_DELIMITER)
740 return;
741
742 len = (string + str_len) - delim - 1;
743 p = kstrndup(delim + 1, len, GFP_KERNEL);
744 if (p) {
745 unsigned long scope_id = 0;
746 struct net_device *dev;
747
748 dev = dev_get_by_name(&init_net, p);
749 if (dev != NULL) {
750 scope_id = dev->ifindex;
751 dev_put(dev);
752 } else {
753 /* scope_id is set to zero on error */
754 strict_strtoul(p, 10, &scope_id);
755 }
756
757 kfree(p);
758 sin6->sin6_scope_id = scope_id;
759 dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id);
760 }
761}
762
763static void nfs_parse_ipv6_address(char *string, size_t str_len,
764 struct sockaddr *sap, size_t *addr_len)
765{
766 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
767 u8 *addr = (u8 *)&sin6->sin6_addr.in6_u;
768 const char *delim;
769
770 if (str_len <= INET6_ADDRSTRLEN) {
771 dfprintk(MOUNT, "NFS: parsing IPv6 address %*s\n",
772 (int)str_len, string);
773
774 sin6->sin6_family = AF_INET6;
775 *addr_len = sizeof(*sin6);
776 if (in6_pton(string, str_len, addr, IPV6_SCOPE_DELIMITER, &delim)) {
777 nfs_parse_ipv6_scope_id(string, str_len, delim, sin6);
778 return;
779 }
780 }
781
782 sap->sa_family = AF_UNSPEC;
783 *addr_len = 0;
784}
785#else
786static void nfs_parse_ipv6_address(char *string, size_t str_len,
787 struct sockaddr *sap, size_t *addr_len)
788{
789 sap->sa_family = AF_UNSPEC;
790 *addr_len = 0;
791}
792#endif
793
705/* 794/*
706 * Parse string addresses passed in via a mount option, 795 * Construct a sockaddr based on the contents of a string that contains
707 * and construct a sockaddr based on the result. 796 * an IP address in presentation format.
708 * 797 *
709 * If address parsing fails, set the sockaddr's address 798 * If there is a problem constructing the new sockaddr, set the address
710 * family to AF_UNSPEC to force nfs_verify_server_address() 799 * family to AF_UNSPEC.
711 * to punt the mount.
712 */ 800 */
713static void nfs_parse_server_address(char *value, 801static void nfs_parse_ip_address(char *string, size_t str_len,
714 struct sockaddr *sap, 802 struct sockaddr *sap, size_t *addr_len)
715 size_t *len)
716{ 803{
717 if (strchr(value, ':')) { 804 unsigned int i, colons;
718 struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap;
719 u8 *addr = (u8 *)&ap->sin6_addr.in6_u;
720 805
721 ap->sin6_family = AF_INET6; 806 colons = 0;
722 *len = sizeof(*ap); 807 for (i = 0; i < str_len; i++)
723 if (in6_pton(value, -1, addr, '\0', NULL)) 808 if (string[i] == ':')
724 return; 809 colons++;
725 } else { 810
726 struct sockaddr_in *ap = (struct sockaddr_in *)sap; 811 if (colons >= 2)
727 u8 *addr = (u8 *)&ap->sin_addr.s_addr; 812 nfs_parse_ipv6_address(string, str_len, sap, addr_len);
813 else
814 nfs_parse_ipv4_address(string, str_len, sap, addr_len);
815}
816
817/*
818 * Sanity check the NFS transport protocol.
819 *
820 */
821static void nfs_validate_transport_protocol(struct nfs_parsed_mount_data *mnt)
822{
823 switch (mnt->nfs_server.protocol) {
824 case XPRT_TRANSPORT_UDP:
825 case XPRT_TRANSPORT_TCP:
826 case XPRT_TRANSPORT_RDMA:
827 break;
828 default:
829 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
830 }
831}
832
833/*
834 * For text based NFSv2/v3 mounts, the mount protocol transport default
835 * settings should depend upon the specified NFS transport.
836 */
837static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt)
838{
839 nfs_validate_transport_protocol(mnt);
728 840
729 ap->sin_family = AF_INET; 841 if (mnt->mount_server.protocol == XPRT_TRANSPORT_UDP ||
730 *len = sizeof(*ap); 842 mnt->mount_server.protocol == XPRT_TRANSPORT_TCP)
731 if (in4_pton(value, -1, addr, '\0', NULL))
732 return; 843 return;
844 switch (mnt->nfs_server.protocol) {
845 case XPRT_TRANSPORT_UDP:
846 mnt->mount_server.protocol = XPRT_TRANSPORT_UDP;
847 break;
848 case XPRT_TRANSPORT_TCP:
849 case XPRT_TRANSPORT_RDMA:
850 mnt->mount_server.protocol = XPRT_TRANSPORT_TCP;
733 } 851 }
852}
734 853
735 sap->sa_family = AF_UNSPEC; 854/*
736 *len = 0; 855 * Parse the value of the 'sec=' option.
856 *
857 * The flavor_len setting is for v4 mounts.
858 */
859static int nfs_parse_security_flavors(char *value,
860 struct nfs_parsed_mount_data *mnt)
861{
862 substring_t args[MAX_OPT_ARGS];
863
864 dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value);
865
866 switch (match_token(value, nfs_secflavor_tokens, args)) {
867 case Opt_sec_none:
868 mnt->auth_flavor_len = 0;
869 mnt->auth_flavors[0] = RPC_AUTH_NULL;
870 break;
871 case Opt_sec_sys:
872 mnt->auth_flavor_len = 0;
873 mnt->auth_flavors[0] = RPC_AUTH_UNIX;
874 break;
875 case Opt_sec_krb5:
876 mnt->auth_flavor_len = 1;
877 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5;
878 break;
879 case Opt_sec_krb5i:
880 mnt->auth_flavor_len = 1;
881 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I;
882 break;
883 case Opt_sec_krb5p:
884 mnt->auth_flavor_len = 1;
885 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P;
886 break;
887 case Opt_sec_lkey:
888 mnt->auth_flavor_len = 1;
889 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY;
890 break;
891 case Opt_sec_lkeyi:
892 mnt->auth_flavor_len = 1;
893 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI;
894 break;
895 case Opt_sec_lkeyp:
896 mnt->auth_flavor_len = 1;
897 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP;
898 break;
899 case Opt_sec_spkm:
900 mnt->auth_flavor_len = 1;
901 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM;
902 break;
903 case Opt_sec_spkmi:
904 mnt->auth_flavor_len = 1;
905 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI;
906 break;
907 case Opt_sec_spkmp:
908 mnt->auth_flavor_len = 1;
909 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP;
910 break;
911 default:
912 return 0;
913 }
914
915 return 1;
916}
917
918static void nfs_parse_invalid_value(const char *option)
919{
920 dfprintk(MOUNT, "NFS: bad value specified for %s option\n", option);
737} 921}
738 922
739/* 923/*
740 * Error-check and convert a string of mount options from user space into 924 * Error-check and convert a string of mount options from user space into
741 * a data structure 925 * a data structure. The whole mount string is processed; bad options are
926 * skipped as they are encountered. If there were no errors, return 1;
927 * otherwise return 0 (zero).
742 */ 928 */
743static int nfs_parse_mount_options(char *raw, 929static int nfs_parse_mount_options(char *raw,
744 struct nfs_parsed_mount_data *mnt) 930 struct nfs_parsed_mount_data *mnt)
745{ 931{
746 char *p, *string, *secdata; 932 char *p, *string, *secdata;
747 int rc; 933 int rc, sloppy = 0, errors = 0;
748 934
749 if (!raw) { 935 if (!raw) {
750 dfprintk(MOUNT, "NFS: mount options string was NULL.\n"); 936 dfprintk(MOUNT, "NFS: mount options string was NULL.\n");
@@ -777,15 +963,16 @@ static int nfs_parse_mount_options(char *raw,
777 963
778 token = match_token(p, nfs_mount_option_tokens, args); 964 token = match_token(p, nfs_mount_option_tokens, args);
779 switch (token) { 965 switch (token) {
966
967 /*
968 * boolean options: foo/nofoo
969 */
780 case Opt_soft: 970 case Opt_soft:
781 mnt->flags |= NFS_MOUNT_SOFT; 971 mnt->flags |= NFS_MOUNT_SOFT;
782 break; 972 break;
783 case Opt_hard: 973 case Opt_hard:
784 mnt->flags &= ~NFS_MOUNT_SOFT; 974 mnt->flags &= ~NFS_MOUNT_SOFT;
785 break; 975 break;
786 case Opt_intr:
787 case Opt_nointr:
788 break;
789 case Opt_posix: 976 case Opt_posix:
790 mnt->flags |= NFS_MOUNT_POSIX; 977 mnt->flags |= NFS_MOUNT_POSIX;
791 break; 978 break;
@@ -819,20 +1006,14 @@ static int nfs_parse_mount_options(char *raw,
819 case Opt_udp: 1006 case Opt_udp:
820 mnt->flags &= ~NFS_MOUNT_TCP; 1007 mnt->flags &= ~NFS_MOUNT_TCP;
821 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; 1008 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
822 mnt->timeo = 7;
823 mnt->retrans = 5;
824 break; 1009 break;
825 case Opt_tcp: 1010 case Opt_tcp:
826 mnt->flags |= NFS_MOUNT_TCP; 1011 mnt->flags |= NFS_MOUNT_TCP;
827 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; 1012 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
828 mnt->timeo = 600;
829 mnt->retrans = 2;
830 break; 1013 break;
831 case Opt_rdma: 1014 case Opt_rdma:
832 mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */ 1015 mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */
833 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; 1016 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
834 mnt->timeo = 600;
835 mnt->retrans = 2;
836 break; 1017 break;
837 case Opt_acl: 1018 case Opt_acl:
838 mnt->flags &= ~NFS_MOUNT_NOACL; 1019 mnt->flags &= ~NFS_MOUNT_NOACL;
@@ -853,165 +1034,144 @@ static int nfs_parse_mount_options(char *raw,
853 mnt->flags |= NFS_MOUNT_UNSHARED; 1034 mnt->flags |= NFS_MOUNT_UNSHARED;
854 break; 1035 break;
855 1036
1037 /*
1038 * options that take numeric values
1039 */
856 case Opt_port: 1040 case Opt_port:
857 if (match_int(args, &option)) 1041 if (match_int(args, &option) ||
858 return 0; 1042 option < 0 || option > USHORT_MAX) {
859 if (option < 0 || option > 65535) 1043 errors++;
860 return 0; 1044 nfs_parse_invalid_value("port");
861 mnt->nfs_server.port = option; 1045 } else
1046 mnt->nfs_server.port = option;
862 break; 1047 break;
863 case Opt_rsize: 1048 case Opt_rsize:
864 if (match_int(args, &mnt->rsize)) 1049 if (match_int(args, &option) || option < 0) {
865 return 0; 1050 errors++;
1051 nfs_parse_invalid_value("rsize");
1052 } else
1053 mnt->rsize = option;
866 break; 1054 break;
867 case Opt_wsize: 1055 case Opt_wsize:
868 if (match_int(args, &mnt->wsize)) 1056 if (match_int(args, &option) || option < 0) {
869 return 0; 1057 errors++;
1058 nfs_parse_invalid_value("wsize");
1059 } else
1060 mnt->wsize = option;
870 break; 1061 break;
871 case Opt_bsize: 1062 case Opt_bsize:
872 if (match_int(args, &option)) 1063 if (match_int(args, &option) || option < 0) {
873 return 0; 1064 errors++;
874 if (option < 0) 1065 nfs_parse_invalid_value("bsize");
875 return 0; 1066 } else
876 mnt->bsize = option; 1067 mnt->bsize = option;
877 break; 1068 break;
878 case Opt_timeo: 1069 case Opt_timeo:
879 if (match_int(args, &mnt->timeo)) 1070 if (match_int(args, &option) || option <= 0) {
880 return 0; 1071 errors++;
1072 nfs_parse_invalid_value("timeo");
1073 } else
1074 mnt->timeo = option;
881 break; 1075 break;
882 case Opt_retrans: 1076 case Opt_retrans:
883 if (match_int(args, &mnt->retrans)) 1077 if (match_int(args, &option) || option <= 0) {
884 return 0; 1078 errors++;
1079 nfs_parse_invalid_value("retrans");
1080 } else
1081 mnt->retrans = option;
885 break; 1082 break;
886 case Opt_acregmin: 1083 case Opt_acregmin:
887 if (match_int(args, &mnt->acregmin)) 1084 if (match_int(args, &option) || option < 0) {
888 return 0; 1085 errors++;
1086 nfs_parse_invalid_value("acregmin");
1087 } else
1088 mnt->acregmin = option;
889 break; 1089 break;
890 case Opt_acregmax: 1090 case Opt_acregmax:
891 if (match_int(args, &mnt->acregmax)) 1091 if (match_int(args, &option) || option < 0) {
892 return 0; 1092 errors++;
1093 nfs_parse_invalid_value("acregmax");
1094 } else
1095 mnt->acregmax = option;
893 break; 1096 break;
894 case Opt_acdirmin: 1097 case Opt_acdirmin:
895 if (match_int(args, &mnt->acdirmin)) 1098 if (match_int(args, &option) || option < 0) {
896 return 0; 1099 errors++;
1100 nfs_parse_invalid_value("acdirmin");
1101 } else
1102 mnt->acdirmin = option;
897 break; 1103 break;
898 case Opt_acdirmax: 1104 case Opt_acdirmax:
899 if (match_int(args, &mnt->acdirmax)) 1105 if (match_int(args, &option) || option < 0) {
900 return 0; 1106 errors++;
1107 nfs_parse_invalid_value("acdirmax");
1108 } else
1109 mnt->acdirmax = option;
901 break; 1110 break;
902 case Opt_actimeo: 1111 case Opt_actimeo:
903 if (match_int(args, &option)) 1112 if (match_int(args, &option) || option < 0) {
904 return 0; 1113 errors++;
905 if (option < 0) 1114 nfs_parse_invalid_value("actimeo");
906 return 0; 1115 } else
907 mnt->acregmin = 1116 mnt->acregmin = mnt->acregmax =
908 mnt->acregmax = 1117 mnt->acdirmin = mnt->acdirmax = option;
909 mnt->acdirmin =
910 mnt->acdirmax = option;
911 break; 1118 break;
912 case Opt_namelen: 1119 case Opt_namelen:
913 if (match_int(args, &mnt->namlen)) 1120 if (match_int(args, &option) || option < 0) {
914 return 0; 1121 errors++;
1122 nfs_parse_invalid_value("namlen");
1123 } else
1124 mnt->namlen = option;
915 break; 1125 break;
916 case Opt_mountport: 1126 case Opt_mountport:
917 if (match_int(args, &option)) 1127 if (match_int(args, &option) ||
918 return 0; 1128 option < 0 || option > USHORT_MAX) {
919 if (option < 0 || option > 65535) 1129 errors++;
920 return 0; 1130 nfs_parse_invalid_value("mountport");
921 mnt->mount_server.port = option; 1131 } else
1132 mnt->mount_server.port = option;
922 break; 1133 break;
923 case Opt_mountvers: 1134 case Opt_mountvers:
924 if (match_int(args, &option)) 1135 if (match_int(args, &option) ||
925 return 0; 1136 option < NFS_MNT_VERSION ||
926 if (option < 0) 1137 option > NFS_MNT3_VERSION) {
927 return 0; 1138 errors++;
928 mnt->mount_server.version = option; 1139 nfs_parse_invalid_value("mountvers");
1140 } else
1141 mnt->mount_server.version = option;
929 break; 1142 break;
930 case Opt_nfsvers: 1143 case Opt_nfsvers:
931 if (match_int(args, &option)) 1144 if (match_int(args, &option)) {
932 return 0; 1145 errors++;
1146 nfs_parse_invalid_value("nfsvers");
1147 break;
1148 }
933 switch (option) { 1149 switch (option) {
934 case 2: 1150 case NFS2_VERSION:
935 mnt->flags &= ~NFS_MOUNT_VER3; 1151 mnt->flags &= ~NFS_MOUNT_VER3;
936 break; 1152 break;
937 case 3: 1153 case NFS3_VERSION:
938 mnt->flags |= NFS_MOUNT_VER3; 1154 mnt->flags |= NFS_MOUNT_VER3;
939 break; 1155 break;
940 default: 1156 default:
941 goto out_unrec_vers; 1157 errors++;
1158 nfs_parse_invalid_value("nfsvers");
942 } 1159 }
943 break; 1160 break;
944 1161
1162 /*
1163 * options that take text values
1164 */
945 case Opt_sec: 1165 case Opt_sec:
946 string = match_strdup(args); 1166 string = match_strdup(args);
947 if (string == NULL) 1167 if (string == NULL)
948 goto out_nomem; 1168 goto out_nomem;
949 token = match_token(string, nfs_secflavor_tokens, args); 1169 rc = nfs_parse_security_flavors(string, mnt);
950 kfree(string); 1170 kfree(string);
951 1171 if (!rc) {
952 /* 1172 errors++;
953 * The flags setting is for v2/v3. The flavor_len 1173 dfprintk(MOUNT, "NFS: unrecognized "
954 * setting is for v4. v2/v3 also need to know the 1174 "security flavor\n");
955 * difference between NULL and UNIX.
956 */
957 switch (token) {
958 case Opt_sec_none:
959 mnt->flags &= ~NFS_MOUNT_SECFLAVOUR;
960 mnt->auth_flavor_len = 0;
961 mnt->auth_flavors[0] = RPC_AUTH_NULL;
962 break;
963 case Opt_sec_sys:
964 mnt->flags &= ~NFS_MOUNT_SECFLAVOUR;
965 mnt->auth_flavor_len = 0;
966 mnt->auth_flavors[0] = RPC_AUTH_UNIX;
967 break;
968 case Opt_sec_krb5:
969 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
970 mnt->auth_flavor_len = 1;
971 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5;
972 break;
973 case Opt_sec_krb5i:
974 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
975 mnt->auth_flavor_len = 1;
976 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I;
977 break;
978 case Opt_sec_krb5p:
979 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
980 mnt->auth_flavor_len = 1;
981 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P;
982 break;
983 case Opt_sec_lkey:
984 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
985 mnt->auth_flavor_len = 1;
986 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY;
987 break;
988 case Opt_sec_lkeyi:
989 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
990 mnt->auth_flavor_len = 1;
991 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI;
992 break;
993 case Opt_sec_lkeyp:
994 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
995 mnt->auth_flavor_len = 1;
996 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP;
997 break;
998 case Opt_sec_spkm:
999 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
1000 mnt->auth_flavor_len = 1;
1001 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM;
1002 break;
1003 case Opt_sec_spkmi:
1004 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
1005 mnt->auth_flavor_len = 1;
1006 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI;
1007 break;
1008 case Opt_sec_spkmp:
1009 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
1010 mnt->auth_flavor_len = 1;
1011 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP;
1012 break;
1013 default:
1014 goto out_unrec_sec;
1015 } 1175 }
1016 break; 1176 break;
1017 case Opt_proto: 1177 case Opt_proto:
@@ -1026,24 +1186,20 @@ static int nfs_parse_mount_options(char *raw,
1026 case Opt_xprt_udp: 1186 case Opt_xprt_udp:
1027 mnt->flags &= ~NFS_MOUNT_TCP; 1187 mnt->flags &= ~NFS_MOUNT_TCP;
1028 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; 1188 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
1029 mnt->timeo = 7;
1030 mnt->retrans = 5;
1031 break; 1189 break;
1032 case Opt_xprt_tcp: 1190 case Opt_xprt_tcp:
1033 mnt->flags |= NFS_MOUNT_TCP; 1191 mnt->flags |= NFS_MOUNT_TCP;
1034 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; 1192 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
1035 mnt->timeo = 600;
1036 mnt->retrans = 2;
1037 break; 1193 break;
1038 case Opt_xprt_rdma: 1194 case Opt_xprt_rdma:
1039 /* vector side protocols to TCP */ 1195 /* vector side protocols to TCP */
1040 mnt->flags |= NFS_MOUNT_TCP; 1196 mnt->flags |= NFS_MOUNT_TCP;
1041 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; 1197 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
1042 mnt->timeo = 600;
1043 mnt->retrans = 2;
1044 break; 1198 break;
1045 default: 1199 default:
1046 goto out_unrec_xprt; 1200 errors++;
1201 dfprintk(MOUNT, "NFS: unrecognized "
1202 "transport protocol\n");
1047 } 1203 }
1048 break; 1204 break;
1049 case Opt_mountproto: 1205 case Opt_mountproto:
@@ -1063,16 +1219,19 @@ static int nfs_parse_mount_options(char *raw,
1063 break; 1219 break;
1064 case Opt_xprt_rdma: /* not used for side protocols */ 1220 case Opt_xprt_rdma: /* not used for side protocols */
1065 default: 1221 default:
1066 goto out_unrec_xprt; 1222 errors++;
1223 dfprintk(MOUNT, "NFS: unrecognized "
1224 "transport protocol\n");
1067 } 1225 }
1068 break; 1226 break;
1069 case Opt_addr: 1227 case Opt_addr:
1070 string = match_strdup(args); 1228 string = match_strdup(args);
1071 if (string == NULL) 1229 if (string == NULL)
1072 goto out_nomem; 1230 goto out_nomem;
1073 nfs_parse_server_address(string, (struct sockaddr *) 1231 nfs_parse_ip_address(string, strlen(string),
1074 &mnt->nfs_server.address, 1232 (struct sockaddr *)
1075 &mnt->nfs_server.addrlen); 1233 &mnt->nfs_server.address,
1234 &mnt->nfs_server.addrlen);
1076 kfree(string); 1235 kfree(string);
1077 break; 1236 break;
1078 case Opt_clientaddr: 1237 case Opt_clientaddr:
@@ -1093,24 +1252,33 @@ static int nfs_parse_mount_options(char *raw,
1093 string = match_strdup(args); 1252 string = match_strdup(args);
1094 if (string == NULL) 1253 if (string == NULL)
1095 goto out_nomem; 1254 goto out_nomem;
1096 nfs_parse_server_address(string, (struct sockaddr *) 1255 nfs_parse_ip_address(string, strlen(string),
1097 &mnt->mount_server.address, 1256 (struct sockaddr *)
1098 &mnt->mount_server.addrlen); 1257 &mnt->mount_server.address,
1258 &mnt->mount_server.addrlen);
1099 kfree(string); 1259 kfree(string);
1100 break; 1260 break;
1101 1261
1262 /*
1263 * Special options
1264 */
1265 case Opt_sloppy:
1266 sloppy = 1;
1267 dfprintk(MOUNT, "NFS: relaxing parsing rules\n");
1268 break;
1102 case Opt_userspace: 1269 case Opt_userspace:
1103 case Opt_deprecated: 1270 case Opt_deprecated:
1271 dfprintk(MOUNT, "NFS: ignoring mount option "
1272 "'%s'\n", p);
1104 break; 1273 break;
1105 1274
1106 default: 1275 default:
1107 goto out_unknown; 1276 errors++;
1277 dfprintk(MOUNT, "NFS: unrecognized mount option "
1278 "'%s'\n", p);
1108 } 1279 }
1109 } 1280 }
1110 1281
1111 nfs_set_port((struct sockaddr *)&mnt->nfs_server.address,
1112 mnt->nfs_server.port);
1113
1114 return 1; 1282 return 1;
1115 1283
1116out_nomem: 1284out_nomem:
@@ -1120,21 +1288,6 @@ out_security_failure:
1120 free_secdata(secdata); 1288 free_secdata(secdata);
1121 printk(KERN_INFO "NFS: security options invalid: %d\n", rc); 1289 printk(KERN_INFO "NFS: security options invalid: %d\n", rc);
1122 return 0; 1290 return 0;
1123out_unrec_vers:
1124 printk(KERN_INFO "NFS: unrecognized NFS version number\n");
1125 return 0;
1126
1127out_unrec_xprt:
1128 printk(KERN_INFO "NFS: unrecognized transport protocol\n");
1129 return 0;
1130
1131out_unrec_sec:
1132 printk(KERN_INFO "NFS: unrecognized security flavor\n");
1133 return 0;
1134
1135out_unknown:
1136 printk(KERN_INFO "NFS: unknown mount option: %s\n", p);
1137 return 0;
1138} 1291}
1139 1292
1140/* 1293/*
@@ -1188,11 +1341,146 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1188 if (status == 0) 1341 if (status == 0)
1189 return 0; 1342 return 0;
1190 1343
1191 dfprintk(MOUNT, "NFS: unable to mount server %s, error %d", 1344 dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
1192 hostname, status); 1345 hostname, status);
1193 return status; 1346 return status;
1194} 1347}
1195 1348
1349static int nfs_parse_simple_hostname(const char *dev_name,
1350 char **hostname, size_t maxnamlen,
1351 char **export_path, size_t maxpathlen)
1352{
1353 size_t len;
1354 char *colon, *comma;
1355
1356 colon = strchr(dev_name, ':');
1357 if (colon == NULL)
1358 goto out_bad_devname;
1359
1360 len = colon - dev_name;
1361 if (len > maxnamlen)
1362 goto out_hostname;
1363
1364 /* N.B. caller will free nfs_server.hostname in all cases */
1365 *hostname = kstrndup(dev_name, len, GFP_KERNEL);
1366 if (!*hostname)
1367 goto out_nomem;
1368
1369 /* kill possible hostname list: not supported */
1370 comma = strchr(*hostname, ',');
1371 if (comma != NULL) {
1372 if (comma == *hostname)
1373 goto out_bad_devname;
1374 *comma = '\0';
1375 }
1376
1377 colon++;
1378 len = strlen(colon);
1379 if (len > maxpathlen)
1380 goto out_path;
1381 *export_path = kstrndup(colon, len, GFP_KERNEL);
1382 if (!*export_path)
1383 goto out_nomem;
1384
1385 dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path);
1386 return 0;
1387
1388out_bad_devname:
1389 dfprintk(MOUNT, "NFS: device name not in host:path format\n");
1390 return -EINVAL;
1391
1392out_nomem:
1393 dfprintk(MOUNT, "NFS: not enough memory to parse device name\n");
1394 return -ENOMEM;
1395
1396out_hostname:
1397 dfprintk(MOUNT, "NFS: server hostname too long\n");
1398 return -ENAMETOOLONG;
1399
1400out_path:
1401 dfprintk(MOUNT, "NFS: export pathname too long\n");
1402 return -ENAMETOOLONG;
1403}
1404
1405/*
1406 * Hostname has square brackets around it because it contains one or
1407 * more colons. We look for the first closing square bracket, and a
1408 * colon must follow it.
1409 */
1410static int nfs_parse_protected_hostname(const char *dev_name,
1411 char **hostname, size_t maxnamlen,
1412 char **export_path, size_t maxpathlen)
1413{
1414 size_t len;
1415 char *start, *end;
1416
1417 start = (char *)(dev_name + 1);
1418
1419 end = strchr(start, ']');
1420 if (end == NULL)
1421 goto out_bad_devname;
1422 if (*(end + 1) != ':')
1423 goto out_bad_devname;
1424
1425 len = end - start;
1426 if (len > maxnamlen)
1427 goto out_hostname;
1428
1429 /* N.B. caller will free nfs_server.hostname in all cases */
1430 *hostname = kstrndup(start, len, GFP_KERNEL);
1431 if (*hostname == NULL)
1432 goto out_nomem;
1433
1434 end += 2;
1435 len = strlen(end);
1436 if (len > maxpathlen)
1437 goto out_path;
1438 *export_path = kstrndup(end, len, GFP_KERNEL);
1439 if (!*export_path)
1440 goto out_nomem;
1441
1442 return 0;
1443
1444out_bad_devname:
1445 dfprintk(MOUNT, "NFS: device name not in host:path format\n");
1446 return -EINVAL;
1447
1448out_nomem:
1449 dfprintk(MOUNT, "NFS: not enough memory to parse device name\n");
1450 return -ENOMEM;
1451
1452out_hostname:
1453 dfprintk(MOUNT, "NFS: server hostname too long\n");
1454 return -ENAMETOOLONG;
1455
1456out_path:
1457 dfprintk(MOUNT, "NFS: export pathname too long\n");
1458 return -ENAMETOOLONG;
1459}
1460
1461/*
1462 * Split "dev_name" into "hostname:export_path".
1463 *
1464 * The leftmost colon demarks the split between the server's hostname
1465 * and the export path. If the hostname starts with a left square
1466 * bracket, then it may contain colons.
1467 *
1468 * Note: caller frees hostname and export path, even on error.
1469 */
1470static int nfs_parse_devname(const char *dev_name,
1471 char **hostname, size_t maxnamlen,
1472 char **export_path, size_t maxpathlen)
1473{
1474 if (*dev_name == '[')
1475 return nfs_parse_protected_hostname(dev_name,
1476 hostname, maxnamlen,
1477 export_path, maxpathlen);
1478
1479 return nfs_parse_simple_hostname(dev_name,
1480 hostname, maxnamlen,
1481 export_path, maxpathlen);
1482}
1483
1196/* 1484/*
1197 * Validate the NFS2/NFS3 mount data 1485 * Validate the NFS2/NFS3 mount data
1198 * - fills in the mount root filehandle 1486 * - fills in the mount root filehandle
@@ -1222,16 +1510,14 @@ static int nfs_validate_mount_data(void *options,
1222 args->flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP); 1510 args->flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP);
1223 args->rsize = NFS_MAX_FILE_IO_SIZE; 1511 args->rsize = NFS_MAX_FILE_IO_SIZE;
1224 args->wsize = NFS_MAX_FILE_IO_SIZE; 1512 args->wsize = NFS_MAX_FILE_IO_SIZE;
1225 args->timeo = 600; 1513 args->acregmin = NFS_DEF_ACREGMIN;
1226 args->retrans = 2; 1514 args->acregmax = NFS_DEF_ACREGMAX;
1227 args->acregmin = 3; 1515 args->acdirmin = NFS_DEF_ACDIRMIN;
1228 args->acregmax = 60; 1516 args->acdirmax = NFS_DEF_ACDIRMAX;
1229 args->acdirmin = 30;
1230 args->acdirmax = 60;
1231 args->mount_server.port = 0; /* autobind unless user sets port */ 1517 args->mount_server.port = 0; /* autobind unless user sets port */
1232 args->mount_server.protocol = XPRT_TRANSPORT_UDP;
1233 args->nfs_server.port = 0; /* autobind unless user sets port */ 1518 args->nfs_server.port = 0; /* autobind unless user sets port */
1234 args->nfs_server.protocol = XPRT_TRANSPORT_TCP; 1519 args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
1520 args->auth_flavors[0] = RPC_AUTH_UNIX;
1235 1521
1236 switch (data->version) { 1522 switch (data->version) {
1237 case 1: 1523 case 1:
@@ -1289,7 +1575,9 @@ static int nfs_validate_mount_data(void *options,
1289 args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL); 1575 args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL);
1290 args->namlen = data->namlen; 1576 args->namlen = data->namlen;
1291 args->bsize = data->bsize; 1577 args->bsize = data->bsize;
1292 args->auth_flavors[0] = data->pseudoflavor; 1578
1579 if (data->flags & NFS_MOUNT_SECFLAVOUR)
1580 args->auth_flavors[0] = data->pseudoflavor;
1293 if (!args->nfs_server.hostname) 1581 if (!args->nfs_server.hostname)
1294 goto out_nomem; 1582 goto out_nomem;
1295 1583
@@ -1321,8 +1609,6 @@ static int nfs_validate_mount_data(void *options,
1321 1609
1322 break; 1610 break;
1323 default: { 1611 default: {
1324 unsigned int len;
1325 char *c;
1326 int status; 1612 int status;
1327 1613
1328 if (nfs_parse_mount_options((char *)options, args) == 0) 1614 if (nfs_parse_mount_options((char *)options, args) == 0)
@@ -1332,21 +1618,22 @@ static int nfs_validate_mount_data(void *options,
1332 &args->nfs_server.address)) 1618 &args->nfs_server.address))
1333 goto out_no_address; 1619 goto out_no_address;
1334 1620
1335 c = strchr(dev_name, ':'); 1621 nfs_set_port((struct sockaddr *)&args->nfs_server.address,
1336 if (c == NULL) 1622 args->nfs_server.port);
1337 return -EINVAL;
1338 len = c - dev_name;
1339 /* N.B. caller will free nfs_server.hostname in all cases */
1340 args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL);
1341 if (!args->nfs_server.hostname)
1342 goto out_nomem;
1343 1623
1344 c++; 1624 nfs_set_mount_transport_protocol(args);
1345 if (strlen(c) > NFS_MAXPATHLEN) 1625
1346 return -ENAMETOOLONG; 1626 status = nfs_parse_devname(dev_name,
1347 args->nfs_server.export_path = c; 1627 &args->nfs_server.hostname,
1628 PAGE_SIZE,
1629 &args->nfs_server.export_path,
1630 NFS_MAXPATHLEN);
1631 if (!status)
1632 status = nfs_try_mount(args, mntfh);
1633
1634 kfree(args->nfs_server.export_path);
1635 args->nfs_server.export_path = NULL;
1348 1636
1349 status = nfs_try_mount(args, mntfh);
1350 if (status) 1637 if (status)
1351 return status; 1638 return status;
1352 1639
@@ -1354,9 +1641,6 @@ static int nfs_validate_mount_data(void *options,
1354 } 1641 }
1355 } 1642 }
1356 1643
1357 if (!(args->flags & NFS_MOUNT_SECFLAVOUR))
1358 args->auth_flavors[0] = RPC_AUTH_UNIX;
1359
1360#ifndef CONFIG_NFS_V3 1644#ifndef CONFIG_NFS_V3
1361 if (args->flags & NFS_MOUNT_VER3) 1645 if (args->flags & NFS_MOUNT_VER3)
1362 goto out_v3_not_compiled; 1646 goto out_v3_not_compiled;
@@ -1396,6 +1680,80 @@ out_invalid_fh:
1396 return -EINVAL; 1680 return -EINVAL;
1397} 1681}
1398 1682
1683static int
1684nfs_compare_remount_data(struct nfs_server *nfss,
1685 struct nfs_parsed_mount_data *data)
1686{
1687 if (data->flags != nfss->flags ||
1688 data->rsize != nfss->rsize ||
1689 data->wsize != nfss->wsize ||
1690 data->retrans != nfss->client->cl_timeout->to_retries ||
1691 data->auth_flavors[0] != nfss->client->cl_auth->au_flavor ||
1692 data->acregmin != nfss->acregmin / HZ ||
1693 data->acregmax != nfss->acregmax / HZ ||
1694 data->acdirmin != nfss->acdirmin / HZ ||
1695 data->acdirmax != nfss->acdirmax / HZ ||
1696 data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) ||
1697 data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen ||
1698 memcmp(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
1699 data->nfs_server.addrlen) != 0)
1700 return -EINVAL;
1701
1702 return 0;
1703}
1704
1705static int
1706nfs_remount(struct super_block *sb, int *flags, char *raw_data)
1707{
1708 int error;
1709 struct nfs_server *nfss = sb->s_fs_info;
1710 struct nfs_parsed_mount_data *data;
1711 struct nfs_mount_data *options = (struct nfs_mount_data *)raw_data;
1712 struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data;
1713 u32 nfsvers = nfss->nfs_client->rpc_ops->version;
1714
1715 /*
1716 * Userspace mount programs that send binary options generally send
1717 * them populated with default values. We have no way to know which
1718 * ones were explicitly specified. Fall back to legacy behavior and
1719 * just return success.
1720 */
1721 if ((nfsvers == 4 && options4->version == 1) ||
1722 (nfsvers <= 3 && options->version >= 1 &&
1723 options->version <= 6))
1724 return 0;
1725
1726 data = kzalloc(sizeof(*data), GFP_KERNEL);
1727 if (data == NULL)
1728 return -ENOMEM;
1729
1730 /* fill out struct with values from existing mount */
1731 data->flags = nfss->flags;
1732 data->rsize = nfss->rsize;
1733 data->wsize = nfss->wsize;
1734 data->retrans = nfss->client->cl_timeout->to_retries;
1735 data->auth_flavors[0] = nfss->client->cl_auth->au_flavor;
1736 data->acregmin = nfss->acregmin / HZ;
1737 data->acregmax = nfss->acregmax / HZ;
1738 data->acdirmin = nfss->acdirmin / HZ;
1739 data->acdirmax = nfss->acdirmax / HZ;
1740 data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ;
1741 data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen;
1742 memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
1743 data->nfs_server.addrlen);
1744
1745 /* overwrite those values with any that were specified */
1746 error = nfs_parse_mount_options((char *)options, data);
1747 if (error < 0)
1748 goto out;
1749
1750 /* compare new mount options with old ones */
1751 error = nfs_compare_remount_data(nfss, data);
1752out:
1753 kfree(data);
1754 return error;
1755}
1756
1399/* 1757/*
1400 * Initialise the common bits of the superblock 1758 * Initialise the common bits of the superblock
1401 */ 1759 */
@@ -1811,14 +2169,13 @@ static int nfs4_validate_mount_data(void *options,
1811 2169
1812 args->rsize = NFS_MAX_FILE_IO_SIZE; 2170 args->rsize = NFS_MAX_FILE_IO_SIZE;
1813 args->wsize = NFS_MAX_FILE_IO_SIZE; 2171 args->wsize = NFS_MAX_FILE_IO_SIZE;
1814 args->timeo = 600; 2172 args->acregmin = NFS_DEF_ACREGMIN;
1815 args->retrans = 2; 2173 args->acregmax = NFS_DEF_ACREGMAX;
1816 args->acregmin = 3; 2174 args->acdirmin = NFS_DEF_ACDIRMIN;
1817 args->acregmax = 60; 2175 args->acdirmax = NFS_DEF_ACDIRMAX;
1818 args->acdirmin = 30;
1819 args->acdirmax = 60;
1820 args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */ 2176 args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */
1821 args->nfs_server.protocol = XPRT_TRANSPORT_TCP; 2177 args->auth_flavors[0] = RPC_AUTH_UNIX;
2178 args->auth_flavor_len = 0;
1822 2179
1823 switch (data->version) { 2180 switch (data->version) {
1824 case 1: 2181 case 1:
@@ -1834,18 +2191,13 @@ static int nfs4_validate_mount_data(void *options,
1834 &args->nfs_server.address)) 2191 &args->nfs_server.address))
1835 goto out_no_address; 2192 goto out_no_address;
1836 2193
1837 switch (data->auth_flavourlen) { 2194 if (data->auth_flavourlen) {
1838 case 0: 2195 if (data->auth_flavourlen > 1)
1839 args->auth_flavors[0] = RPC_AUTH_UNIX; 2196 goto out_inval_auth;
1840 break;
1841 case 1:
1842 if (copy_from_user(&args->auth_flavors[0], 2197 if (copy_from_user(&args->auth_flavors[0],
1843 data->auth_flavours, 2198 data->auth_flavours,
1844 sizeof(args->auth_flavors[0]))) 2199 sizeof(args->auth_flavors[0])))
1845 return -EFAULT; 2200 return -EFAULT;
1846 break;
1847 default:
1848 goto out_inval_auth;
1849 } 2201 }
1850 2202
1851 c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); 2203 c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);
@@ -1879,10 +2231,11 @@ static int nfs4_validate_mount_data(void *options,
1879 args->acdirmin = data->acdirmin; 2231 args->acdirmin = data->acdirmin;
1880 args->acdirmax = data->acdirmax; 2232 args->acdirmax = data->acdirmax;
1881 args->nfs_server.protocol = data->proto; 2233 args->nfs_server.protocol = data->proto;
2234 nfs_validate_transport_protocol(args);
1882 2235
1883 break; 2236 break;
1884 default: { 2237 default: {
1885 unsigned int len; 2238 int status;
1886 2239
1887 if (nfs_parse_mount_options((char *)options, args) == 0) 2240 if (nfs_parse_mount_options((char *)options, args) == 0)
1888 return -EINVAL; 2241 return -EINVAL;
@@ -1891,44 +2244,25 @@ static int nfs4_validate_mount_data(void *options,
1891 &args->nfs_server.address)) 2244 &args->nfs_server.address))
1892 return -EINVAL; 2245 return -EINVAL;
1893 2246
1894 switch (args->auth_flavor_len) { 2247 nfs_set_port((struct sockaddr *)&args->nfs_server.address,
1895 case 0: 2248 args->nfs_server.port);
1896 args->auth_flavors[0] = RPC_AUTH_UNIX;
1897 break;
1898 case 1:
1899 break;
1900 default:
1901 goto out_inval_auth;
1902 }
1903 2249
1904 /* 2250 nfs_validate_transport_protocol(args);
1905 * Split "dev_name" into "hostname:mntpath".
1906 */
1907 c = strchr(dev_name, ':');
1908 if (c == NULL)
1909 return -EINVAL;
1910 /* while calculating len, pretend ':' is '\0' */
1911 len = c - dev_name;
1912 if (len > NFS4_MAXNAMLEN)
1913 return -ENAMETOOLONG;
1914 /* N.B. caller will free nfs_server.hostname in all cases */
1915 args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL);
1916 if (!args->nfs_server.hostname)
1917 goto out_nomem;
1918
1919 c++; /* step over the ':' */
1920 len = strlen(c);
1921 if (len > NFS4_MAXPATHLEN)
1922 return -ENAMETOOLONG;
1923 args->nfs_server.export_path = kstrndup(c, len, GFP_KERNEL);
1924 if (!args->nfs_server.export_path)
1925 goto out_nomem;
1926 2251
1927 dprintk("NFS: MNTPATH: '%s'\n", args->nfs_server.export_path); 2252 if (args->auth_flavor_len > 1)
2253 goto out_inval_auth;
1928 2254
1929 if (args->client_address == NULL) 2255 if (args->client_address == NULL)
1930 goto out_no_client_address; 2256 goto out_no_client_address;
1931 2257
2258 status = nfs_parse_devname(dev_name,
2259 &args->nfs_server.hostname,
2260 NFS4_MAXNAMLEN,
2261 &args->nfs_server.export_path,
2262 NFS4_MAXPATHLEN);
2263 if (status < 0)
2264 return status;
2265
1932 break; 2266 break;
1933 } 2267 }
1934 } 2268 }
@@ -1944,10 +2278,6 @@ out_inval_auth:
1944 data->auth_flavourlen); 2278 data->auth_flavourlen);
1945 return -EINVAL; 2279 return -EINVAL;
1946 2280
1947out_nomem:
1948 dfprintk(MOUNT, "NFS4: not enough memory to handle mount options\n");
1949 return -ENOMEM;
1950
1951out_no_address: 2281out_no_address:
1952 dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); 2282 dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n");
1953 return -EINVAL; 2283 return -EINVAL;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f333848fd3be..3229e217c773 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -34,9 +34,6 @@
34/* 34/*
35 * Local function declarations 35 * Local function declarations
36 */ 36 */
37static struct nfs_page * nfs_update_request(struct nfs_open_context*,
38 struct page *,
39 unsigned int, unsigned int);
40static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc, 37static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc,
41 struct inode *inode, int ioflags); 38 struct inode *inode, int ioflags);
42static void nfs_redirty_request(struct nfs_page *req); 39static void nfs_redirty_request(struct nfs_page *req);
@@ -136,16 +133,21 @@ static struct nfs_page *nfs_page_find_request(struct page *page)
136static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) 133static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count)
137{ 134{
138 struct inode *inode = page->mapping->host; 135 struct inode *inode = page->mapping->host;
139 loff_t end, i_size = i_size_read(inode); 136 loff_t end, i_size;
140 pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; 137 pgoff_t end_index;
141 138
139 spin_lock(&inode->i_lock);
140 i_size = i_size_read(inode);
141 end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
142 if (i_size > 0 && page->index < end_index) 142 if (i_size > 0 && page->index < end_index)
143 return; 143 goto out;
144 end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count); 144 end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count);
145 if (i_size >= end) 145 if (i_size >= end)
146 return; 146 goto out;
147 nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
148 i_size_write(inode, end); 147 i_size_write(inode, end);
148 nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
149out:
150 spin_unlock(&inode->i_lock);
149} 151}
150 152
151/* A writeback failed: mark the page as bad, and invalidate the page cache */ 153/* A writeback failed: mark the page as bad, and invalidate the page cache */
@@ -169,29 +171,6 @@ static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int
169 SetPageUptodate(page); 171 SetPageUptodate(page);
170} 172}
171 173
172static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
173 unsigned int offset, unsigned int count)
174{
175 struct nfs_page *req;
176 int ret;
177
178 for (;;) {
179 req = nfs_update_request(ctx, page, offset, count);
180 if (!IS_ERR(req))
181 break;
182 ret = PTR_ERR(req);
183 if (ret != -EBUSY)
184 return ret;
185 ret = nfs_wb_page(page->mapping->host, page);
186 if (ret != 0)
187 return ret;
188 }
189 /* Update file length */
190 nfs_grow_file(page, offset, count);
191 nfs_clear_page_tag_locked(req);
192 return 0;
193}
194
195static int wb_priority(struct writeback_control *wbc) 174static int wb_priority(struct writeback_control *wbc)
196{ 175{
197 if (wbc->for_reclaim) 176 if (wbc->for_reclaim)
@@ -268,12 +247,9 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
268 return ret; 247 return ret;
269 spin_lock(&inode->i_lock); 248 spin_lock(&inode->i_lock);
270 } 249 }
271 if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) { 250 if (test_bit(PG_CLEAN, &req->wb_flags)) {
272 /* This request is marked for commit */
273 spin_unlock(&inode->i_lock); 251 spin_unlock(&inode->i_lock);
274 nfs_clear_page_tag_locked(req); 252 BUG();
275 nfs_pageio_complete(pgio);
276 return 0;
277 } 253 }
278 if (nfs_set_page_writeback(page) != 0) { 254 if (nfs_set_page_writeback(page) != 0) {
279 spin_unlock(&inode->i_lock); 255 spin_unlock(&inode->i_lock);
@@ -355,11 +331,19 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
355/* 331/*
356 * Insert a write request into an inode 332 * Insert a write request into an inode
357 */ 333 */
358static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) 334static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
359{ 335{
360 struct nfs_inode *nfsi = NFS_I(inode); 336 struct nfs_inode *nfsi = NFS_I(inode);
361 int error; 337 int error;
362 338
339 error = radix_tree_preload(GFP_NOFS);
340 if (error != 0)
341 goto out;
342
343 /* Lock the request! */
344 nfs_lock_request_dontget(req);
345
346 spin_lock(&inode->i_lock);
363 error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); 347 error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req);
364 BUG_ON(error); 348 BUG_ON(error);
365 if (!nfsi->npages) { 349 if (!nfsi->npages) {
@@ -373,6 +357,10 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
373 kref_get(&req->wb_kref); 357 kref_get(&req->wb_kref);
374 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, 358 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
375 NFS_PAGE_TAG_LOCKED); 359 NFS_PAGE_TAG_LOCKED);
360 spin_unlock(&inode->i_lock);
361 radix_tree_preload_end();
362out:
363 return error;
376} 364}
377 365
378/* 366/*
@@ -405,19 +393,6 @@ nfs_mark_request_dirty(struct nfs_page *req)
405 __set_page_dirty_nobuffers(req->wb_page); 393 __set_page_dirty_nobuffers(req->wb_page);
406} 394}
407 395
408/*
409 * Check if a request is dirty
410 */
411static inline int
412nfs_dirty_request(struct nfs_page *req)
413{
414 struct page *page = req->wb_page;
415
416 if (page == NULL || test_bit(PG_NEED_COMMIT, &req->wb_flags))
417 return 0;
418 return !PageWriteback(page);
419}
420
421#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 396#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
422/* 397/*
423 * Add a request to the inode's commit list. 398 * Add a request to the inode's commit list.
@@ -430,7 +405,7 @@ nfs_mark_request_commit(struct nfs_page *req)
430 405
431 spin_lock(&inode->i_lock); 406 spin_lock(&inode->i_lock);
432 nfsi->ncommit++; 407 nfsi->ncommit++;
433 set_bit(PG_NEED_COMMIT, &(req)->wb_flags); 408 set_bit(PG_CLEAN, &(req)->wb_flags);
434 radix_tree_tag_set(&nfsi->nfs_page_tree, 409 radix_tree_tag_set(&nfsi->nfs_page_tree,
435 req->wb_index, 410 req->wb_index,
436 NFS_PAGE_TAG_COMMIT); 411 NFS_PAGE_TAG_COMMIT);
@@ -440,6 +415,19 @@ nfs_mark_request_commit(struct nfs_page *req)
440 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 415 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
441} 416}
442 417
418static int
419nfs_clear_request_commit(struct nfs_page *req)
420{
421 struct page *page = req->wb_page;
422
423 if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) {
424 dec_zone_page_state(page, NR_UNSTABLE_NFS);
425 dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
426 return 1;
427 }
428 return 0;
429}
430
443static inline 431static inline
444int nfs_write_need_commit(struct nfs_write_data *data) 432int nfs_write_need_commit(struct nfs_write_data *data)
445{ 433{
@@ -449,7 +437,7 @@ int nfs_write_need_commit(struct nfs_write_data *data)
449static inline 437static inline
450int nfs_reschedule_unstable_write(struct nfs_page *req) 438int nfs_reschedule_unstable_write(struct nfs_page *req)
451{ 439{
452 if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) { 440 if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) {
453 nfs_mark_request_commit(req); 441 nfs_mark_request_commit(req);
454 return 1; 442 return 1;
455 } 443 }
@@ -465,6 +453,12 @@ nfs_mark_request_commit(struct nfs_page *req)
465{ 453{
466} 454}
467 455
456static inline int
457nfs_clear_request_commit(struct nfs_page *req)
458{
459 return 0;
460}
461
468static inline 462static inline
469int nfs_write_need_commit(struct nfs_write_data *data) 463int nfs_write_need_commit(struct nfs_write_data *data)
470{ 464{
@@ -522,11 +516,8 @@ static void nfs_cancel_commit_list(struct list_head *head)
522 516
523 while(!list_empty(head)) { 517 while(!list_empty(head)) {
524 req = nfs_list_entry(head->next); 518 req = nfs_list_entry(head->next);
525 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
526 dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
527 BDI_RECLAIMABLE);
528 nfs_list_remove_request(req); 519 nfs_list_remove_request(req);
529 clear_bit(PG_NEED_COMMIT, &(req)->wb_flags); 520 nfs_clear_request_commit(req);
530 nfs_inode_remove_request(req); 521 nfs_inode_remove_request(req);
531 nfs_unlock_request(req); 522 nfs_unlock_request(req);
532 } 523 }
@@ -564,110 +555,124 @@ static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pg
564#endif 555#endif
565 556
566/* 557/*
567 * Try to update any existing write request, or create one if there is none. 558 * Search for an existing write request, and attempt to update
568 * In order to match, the request's credentials must match those of 559 * it to reflect a new dirty region on a given page.
569 * the calling process.
570 * 560 *
571 * Note: Should always be called with the Page Lock held! 561 * If the attempt fails, then the existing request is flushed out
562 * to disk.
572 */ 563 */
573static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx, 564static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
574 struct page *page, unsigned int offset, unsigned int bytes) 565 struct page *page,
566 unsigned int offset,
567 unsigned int bytes)
575{ 568{
576 struct address_space *mapping = page->mapping; 569 struct nfs_page *req;
577 struct inode *inode = mapping->host; 570 unsigned int rqend;
578 struct nfs_page *req, *new = NULL; 571 unsigned int end;
579 pgoff_t rqend, end; 572 int error;
573
574 if (!PagePrivate(page))
575 return NULL;
580 576
581 end = offset + bytes; 577 end = offset + bytes;
578 spin_lock(&inode->i_lock);
582 579
583 for (;;) { 580 for (;;) {
584 /* Loop over all inode entries and see if we find 581 req = nfs_page_find_request_locked(page);
585 * A request for the page we wish to update 582 if (req == NULL)
583 goto out_unlock;
584
585 rqend = req->wb_offset + req->wb_bytes;
586 /*
587 * Tell the caller to flush out the request if
588 * the offsets are non-contiguous.
589 * Note: nfs_flush_incompatible() will already
590 * have flushed out requests having wrong owners.
586 */ 591 */
587 if (new) { 592 if (offset > rqend
588 if (radix_tree_preload(GFP_NOFS)) { 593 || end < req->wb_offset)
589 nfs_release_request(new); 594 goto out_flushme;
590 return ERR_PTR(-ENOMEM);
591 }
592 }
593 595
594 spin_lock(&inode->i_lock); 596 if (nfs_set_page_tag_locked(req))
595 req = nfs_page_find_request_locked(page);
596 if (req) {
597 if (!nfs_set_page_tag_locked(req)) {
598 int error;
599
600 spin_unlock(&inode->i_lock);
601 error = nfs_wait_on_request(req);
602 nfs_release_request(req);
603 if (error < 0) {
604 if (new) {
605 radix_tree_preload_end();
606 nfs_release_request(new);
607 }
608 return ERR_PTR(error);
609 }
610 continue;
611 }
612 spin_unlock(&inode->i_lock);
613 if (new) {
614 radix_tree_preload_end();
615 nfs_release_request(new);
616 }
617 break; 597 break;
618 }
619 598
620 if (new) { 599 /* The request is locked, so wait and then retry */
621 nfs_lock_request_dontget(new);
622 nfs_inode_add_request(inode, new);
623 spin_unlock(&inode->i_lock);
624 radix_tree_preload_end();
625 req = new;
626 goto zero_page;
627 }
628 spin_unlock(&inode->i_lock); 600 spin_unlock(&inode->i_lock);
629 601 error = nfs_wait_on_request(req);
630 new = nfs_create_request(ctx, inode, page, offset, bytes); 602 nfs_release_request(req);
631 if (IS_ERR(new)) 603 if (error != 0)
632 return new; 604 goto out_err;
605 spin_lock(&inode->i_lock);
633 } 606 }
634 607
635 /* We have a request for our page. 608 if (nfs_clear_request_commit(req))
636 * If the creds don't match, or the 609 radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree,
637 * page addresses don't match, 610 req->wb_index, NFS_PAGE_TAG_COMMIT);
638 * tell the caller to wait on the conflicting
639 * request.
640 */
641 rqend = req->wb_offset + req->wb_bytes;
642 if (req->wb_context != ctx
643 || req->wb_page != page
644 || !nfs_dirty_request(req)
645 || offset > rqend || end < req->wb_offset) {
646 nfs_clear_page_tag_locked(req);
647 return ERR_PTR(-EBUSY);
648 }
649 611
650 /* Okay, the request matches. Update the region */ 612 /* Okay, the request matches. Update the region */
651 if (offset < req->wb_offset) { 613 if (offset < req->wb_offset) {
652 req->wb_offset = offset; 614 req->wb_offset = offset;
653 req->wb_pgbase = offset; 615 req->wb_pgbase = offset;
654 req->wb_bytes = max(end, rqend) - req->wb_offset;
655 goto zero_page;
656 } 616 }
657
658 if (end > rqend) 617 if (end > rqend)
659 req->wb_bytes = end - req->wb_offset; 618 req->wb_bytes = end - req->wb_offset;
660 619 else
620 req->wb_bytes = rqend - req->wb_offset;
621out_unlock:
622 spin_unlock(&inode->i_lock);
661 return req; 623 return req;
662zero_page: 624out_flushme:
663 /* If this page might potentially be marked as up to date, 625 spin_unlock(&inode->i_lock);
664 * then we need to zero any uninitalised data. */ 626 nfs_release_request(req);
665 if (req->wb_pgbase == 0 && req->wb_bytes != PAGE_CACHE_SIZE 627 error = nfs_wb_page(inode, page);
666 && !PageUptodate(req->wb_page)) 628out_err:
667 zero_user_segment(req->wb_page, req->wb_bytes, PAGE_CACHE_SIZE); 629 return ERR_PTR(error);
630}
631
632/*
633 * Try to update an existing write request, or create one if there is none.
634 *
635 * Note: Should always be called with the Page Lock held to prevent races
636 * if we have to add a new request. Also assumes that the caller has
637 * already called nfs_flush_incompatible() if necessary.
638 */
639static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
640 struct page *page, unsigned int offset, unsigned int bytes)
641{
642 struct inode *inode = page->mapping->host;
643 struct nfs_page *req;
644 int error;
645
646 req = nfs_try_to_update_request(inode, page, offset, bytes);
647 if (req != NULL)
648 goto out;
649 req = nfs_create_request(ctx, inode, page, offset, bytes);
650 if (IS_ERR(req))
651 goto out;
652 error = nfs_inode_add_request(inode, req);
653 if (error != 0) {
654 nfs_release_request(req);
655 req = ERR_PTR(error);
656 }
657out:
668 return req; 658 return req;
669} 659}
670 660
661static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
662 unsigned int offset, unsigned int count)
663{
664 struct nfs_page *req;
665
666 req = nfs_setup_write_request(ctx, page, offset, count);
667 if (IS_ERR(req))
668 return PTR_ERR(req);
669 /* Update file length */
670 nfs_grow_file(page, offset, count);
671 nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
672 nfs_clear_page_tag_locked(req);
673 return 0;
674}
675
671int nfs_flush_incompatible(struct file *file, struct page *page) 676int nfs_flush_incompatible(struct file *file, struct page *page)
672{ 677{
673 struct nfs_open_context *ctx = nfs_file_open_context(file); 678 struct nfs_open_context *ctx = nfs_file_open_context(file);
@@ -685,8 +690,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
685 req = nfs_page_find_request(page); 690 req = nfs_page_find_request(page);
686 if (req == NULL) 691 if (req == NULL)
687 return 0; 692 return 0;
688 do_flush = req->wb_page != page || req->wb_context != ctx 693 do_flush = req->wb_page != page || req->wb_context != ctx;
689 || !nfs_dirty_request(req);
690 nfs_release_request(req); 694 nfs_release_request(req);
691 if (!do_flush) 695 if (!do_flush)
692 return 0; 696 return 0;
@@ -721,10 +725,10 @@ int nfs_updatepage(struct file *file, struct page *page,
721 725
722 nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); 726 nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
723 727
724 dprintk("NFS: nfs_updatepage(%s/%s %d@%Ld)\n", 728 dprintk("NFS: nfs_updatepage(%s/%s %d@%lld)\n",
725 file->f_path.dentry->d_parent->d_name.name, 729 file->f_path.dentry->d_parent->d_name.name,
726 file->f_path.dentry->d_name.name, count, 730 file->f_path.dentry->d_name.name, count,
727 (long long)(page_offset(page) +offset)); 731 (long long)(page_offset(page) + offset));
728 732
729 /* If we're not using byte range locks, and we know the page 733 /* If we're not using byte range locks, and we know the page
730 * is up to date, it may be more efficient to extend the write 734 * is up to date, it may be more efficient to extend the write
@@ -744,7 +748,7 @@ int nfs_updatepage(struct file *file, struct page *page,
744 else 748 else
745 __set_page_dirty_nobuffers(page); 749 __set_page_dirty_nobuffers(page);
746 750
747 dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n", 751 dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
748 status, (long long)i_size_read(inode)); 752 status, (long long)i_size_read(inode));
749 return status; 753 return status;
750} 754}
@@ -752,12 +756,7 @@ int nfs_updatepage(struct file *file, struct page *page,
752static void nfs_writepage_release(struct nfs_page *req) 756static void nfs_writepage_release(struct nfs_page *req)
753{ 757{
754 758
755 if (PageError(req->wb_page)) { 759 if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) {
756 nfs_end_page_writeback(req->wb_page);
757 nfs_inode_remove_request(req);
758 } else if (!nfs_reschedule_unstable_write(req)) {
759 /* Set the PG_uptodate flag */
760 nfs_mark_uptodate(req->wb_page, req->wb_pgbase, req->wb_bytes);
761 nfs_end_page_writeback(req->wb_page); 760 nfs_end_page_writeback(req->wb_page);
762 nfs_inode_remove_request(req); 761 nfs_inode_remove_request(req);
763 } else 762 } else
@@ -834,7 +833,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
834 NFS_PROTO(inode)->write_setup(data, &msg); 833 NFS_PROTO(inode)->write_setup(data, &msg);
835 834
836 dprintk("NFS: %5u initiated write call " 835 dprintk("NFS: %5u initiated write call "
837 "(req %s/%Ld, %u bytes @ offset %Lu)\n", 836 "(req %s/%lld, %u bytes @ offset %llu)\n",
838 data->task.tk_pid, 837 data->task.tk_pid,
839 inode->i_sb->s_id, 838 inode->i_sb->s_id,
840 (long long)NFS_FILEID(inode), 839 (long long)NFS_FILEID(inode),
@@ -978,13 +977,13 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
978static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) 977static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
979{ 978{
980 struct nfs_write_data *data = calldata; 979 struct nfs_write_data *data = calldata;
981 struct nfs_page *req = data->req;
982 980
983 dprintk("NFS: write (%s/%Ld %d@%Ld)", 981 dprintk("NFS: %5u write(%s/%lld %d@%lld)",
984 req->wb_context->path.dentry->d_inode->i_sb->s_id, 982 task->tk_pid,
985 (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), 983 data->req->wb_context->path.dentry->d_inode->i_sb->s_id,
986 req->wb_bytes, 984 (long long)
987 (long long)req_offset(req)); 985 NFS_FILEID(data->req->wb_context->path.dentry->d_inode),
986 data->req->wb_bytes, (long long)req_offset(data->req));
988 987
989 nfs_writeback_done(task, data); 988 nfs_writeback_done(task, data);
990} 989}
@@ -1058,7 +1057,8 @@ static void nfs_writeback_release_full(void *calldata)
1058 1057
1059 nfs_list_remove_request(req); 1058 nfs_list_remove_request(req);
1060 1059
1061 dprintk("NFS: write (%s/%Ld %d@%Ld)", 1060 dprintk("NFS: %5u write (%s/%lld %d@%lld)",
1061 data->task.tk_pid,
1062 req->wb_context->path.dentry->d_inode->i_sb->s_id, 1062 req->wb_context->path.dentry->d_inode->i_sb->s_id,
1063 (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), 1063 (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
1064 req->wb_bytes, 1064 req->wb_bytes,
@@ -1078,8 +1078,6 @@ static void nfs_writeback_release_full(void *calldata)
1078 dprintk(" marked for commit\n"); 1078 dprintk(" marked for commit\n");
1079 goto next; 1079 goto next;
1080 } 1080 }
1081 /* Set the PG_uptodate flag? */
1082 nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
1083 dprintk(" OK\n"); 1081 dprintk(" OK\n");
1084remove_request: 1082remove_request:
1085 nfs_end_page_writeback(page); 1083 nfs_end_page_writeback(page);
@@ -1133,7 +1131,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1133 static unsigned long complain; 1131 static unsigned long complain;
1134 1132
1135 if (time_before(complain, jiffies)) { 1133 if (time_before(complain, jiffies)) {
1136 dprintk("NFS: faulty NFS server %s:" 1134 dprintk("NFS: faulty NFS server %s:"
1137 " (committed = %d) != (stable = %d)\n", 1135 " (committed = %d) != (stable = %d)\n",
1138 NFS_SERVER(data->inode)->nfs_client->cl_hostname, 1136 NFS_SERVER(data->inode)->nfs_client->cl_hostname,
1139 resp->verf->committed, argp->stable); 1137 resp->verf->committed, argp->stable);
@@ -1297,12 +1295,9 @@ static void nfs_commit_release(void *calldata)
1297 while (!list_empty(&data->pages)) { 1295 while (!list_empty(&data->pages)) {
1298 req = nfs_list_entry(data->pages.next); 1296 req = nfs_list_entry(data->pages.next);
1299 nfs_list_remove_request(req); 1297 nfs_list_remove_request(req);
1300 clear_bit(PG_NEED_COMMIT, &(req)->wb_flags); 1298 nfs_clear_request_commit(req);
1301 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
1302 dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
1303 BDI_RECLAIMABLE);
1304 1299
1305 dprintk("NFS: commit (%s/%Ld %d@%Ld)", 1300 dprintk("NFS: commit (%s/%lld %d@%lld)",
1306 req->wb_context->path.dentry->d_inode->i_sb->s_id, 1301 req->wb_context->path.dentry->d_inode->i_sb->s_id,
1307 (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), 1302 (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
1308 req->wb_bytes, 1303 req->wb_bytes,
@@ -1318,9 +1313,6 @@ static void nfs_commit_release(void *calldata)
1318 * returned by the server against all stored verfs. */ 1313 * returned by the server against all stored verfs. */
1319 if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) { 1314 if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) {
1320 /* We have a match */ 1315 /* We have a match */
1321 /* Set the PG_uptodate flag */
1322 nfs_mark_uptodate(req->wb_page, req->wb_pgbase,
1323 req->wb_bytes);
1324 nfs_inode_remove_request(req); 1316 nfs_inode_remove_request(req);
1325 dprintk(" OK\n"); 1317 dprintk(" OK\n");
1326 goto next; 1318 goto next;
@@ -1479,7 +1471,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
1479 req = nfs_page_find_request(page); 1471 req = nfs_page_find_request(page);
1480 if (req == NULL) 1472 if (req == NULL)
1481 goto out; 1473 goto out;
1482 if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) { 1474 if (test_bit(PG_CLEAN, &req->wb_flags)) {
1483 nfs_release_request(req); 1475 nfs_release_request(req);
1484 break; 1476 break;
1485 } 1477 }
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 4d4760e687c3..702fa577aa6e 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -381,7 +381,7 @@ static int do_probe_callback(void *data)
381 .program = &cb_program, 381 .program = &cb_program,
382 .version = nfs_cb_version[1]->number, 382 .version = nfs_cb_version[1]->number,
383 .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ 383 .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */
384 .flags = (RPC_CLNT_CREATE_NOPING), 384 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
385 }; 385 };
386 struct rpc_message msg = { 386 struct rpc_message msg = {
387 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], 387 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 17964c0505a9..1db080135c6d 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -174,10 +174,17 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
174 * need to use BH_New is when we're extending i_size on a file 174 * need to use BH_New is when we're extending i_size on a file
175 * system which doesn't support holes, in which case BH_New 175 * system which doesn't support holes, in which case BH_New
176 * allows block_prepare_write() to zero. 176 * allows block_prepare_write() to zero.
177 *
178 * If we see this on a sparse file system, then a truncate has
179 * raced us and removed the cluster. In this case, we clear
180 * the buffers dirty and uptodate bits and let the buffer code
181 * ignore it as a hole.
177 */ 182 */
178 mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb), 183 if (create && p_blkno == 0 && ocfs2_sparse_alloc(osb)) {
179 "ino %lu, iblock %llu\n", inode->i_ino, 184 clear_buffer_dirty(bh_result);
180 (unsigned long long)iblock); 185 clear_buffer_uptodate(bh_result);
186 goto bail;
187 }
181 188
182 /* Treat the unwritten extent as a hole for zeroing purposes. */ 189 /* Treat the unwritten extent as a hole for zeroing purposes. */
183 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) 190 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index f02ccb34604d..443d108211ab 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1489,25 +1489,28 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group
1489 : NULL; 1489 : NULL;
1490} 1490}
1491 1491
1492static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, 1492static int o2hb_heartbeat_group_make_item(struct config_group *group,
1493 const char *name) 1493 const char *name,
1494 struct config_item **new_item)
1494{ 1495{
1495 struct o2hb_region *reg = NULL; 1496 struct o2hb_region *reg = NULL;
1496 struct config_item *ret = NULL; 1497 int ret = 0;
1497 1498
1498 reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL); 1499 reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL);
1499 if (reg == NULL) 1500 if (reg == NULL) {
1500 goto out; /* ENOMEM */ 1501 ret = -ENOMEM;
1502 goto out;
1503 }
1501 1504
1502 config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type); 1505 config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
1503 1506
1504 ret = &reg->hr_item; 1507 *new_item = &reg->hr_item;
1505 1508
1506 spin_lock(&o2hb_live_lock); 1509 spin_lock(&o2hb_live_lock);
1507 list_add_tail(&reg->hr_all_item, &o2hb_all_regions); 1510 list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
1508 spin_unlock(&o2hb_live_lock); 1511 spin_unlock(&o2hb_live_lock);
1509out: 1512out:
1510 if (ret == NULL) 1513 if (ret)
1511 kfree(reg); 1514 kfree(reg);
1512 1515
1513 return ret; 1516 return ret;
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 7bf3c0ea7bd9..d8bfa0eb41b2 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -146,8 +146,10 @@ static int nst_seq_show(struct seq_file *seq, void *v)
146 nst->st_task->comm, nst->st_node, 146 nst->st_task->comm, nst->st_node,
147 nst->st_sc, nst->st_id, nst->st_msg_type, 147 nst->st_sc, nst->st_id, nst->st_msg_type,
148 nst->st_msg_key, 148 nst->st_msg_key,
149 nst->st_sock_time.tv_sec, nst->st_sock_time.tv_usec, 149 nst->st_sock_time.tv_sec,
150 nst->st_send_time.tv_sec, nst->st_send_time.tv_usec, 150 (unsigned long)nst->st_sock_time.tv_usec,
151 nst->st_send_time.tv_sec,
152 (unsigned long)nst->st_send_time.tv_usec,
151 nst->st_status_time.tv_sec, 153 nst->st_status_time.tv_sec,
152 nst->st_status_time.tv_usec); 154 nst->st_status_time.tv_usec);
153 } 155 }
@@ -274,7 +276,7 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
274 return sc; /* unused, just needs to be null when done */ 276 return sc; /* unused, just needs to be null when done */
275} 277}
276 278
277#define TV_SEC_USEC(TV) TV.tv_sec, TV.tv_usec 279#define TV_SEC_USEC(TV) TV.tv_sec, (unsigned long)TV.tv_usec
278 280
279static int sc_seq_show(struct seq_file *seq, void *v) 281static int sc_seq_show(struct seq_file *seq, void *v)
280{ 282{
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index cfdb08b484ed..b364b7052e46 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -644,27 +644,32 @@ out:
644 return ret; 644 return ret;
645} 645}
646 646
647static struct config_item *o2nm_node_group_make_item(struct config_group *group, 647static int o2nm_node_group_make_item(struct config_group *group,
648 const char *name) 648 const char *name,
649 struct config_item **new_item)
649{ 650{
650 struct o2nm_node *node = NULL; 651 struct o2nm_node *node = NULL;
651 struct config_item *ret = NULL; 652 int ret = 0;
652 653
653 if (strlen(name) > O2NM_MAX_NAME_LEN) 654 if (strlen(name) > O2NM_MAX_NAME_LEN) {
654 goto out; /* ENAMETOOLONG */ 655 ret = -ENAMETOOLONG;
656 goto out;
657 }
655 658
656 node = kzalloc(sizeof(struct o2nm_node), GFP_KERNEL); 659 node = kzalloc(sizeof(struct o2nm_node), GFP_KERNEL);
657 if (node == NULL) 660 if (node == NULL) {
658 goto out; /* ENOMEM */ 661 ret = -ENOMEM;
662 goto out;
663 }
659 664
660 strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */ 665 strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */
661 config_item_init_type_name(&node->nd_item, name, &o2nm_node_type); 666 config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
662 spin_lock_init(&node->nd_lock); 667 spin_lock_init(&node->nd_lock);
663 668
664 ret = &node->nd_item; 669 *new_item = &node->nd_item;
665 670
666out: 671out:
667 if (ret == NULL) 672 if (ret)
668 kfree(node); 673 kfree(node);
669 674
670 return ret; 675 return ret;
@@ -751,25 +756,31 @@ static struct o2nm_cluster_group *to_o2nm_cluster_group(struct config_group *gro
751} 756}
752#endif 757#endif
753 758
754static struct config_group *o2nm_cluster_group_make_group(struct config_group *group, 759static int o2nm_cluster_group_make_group(struct config_group *group,
755 const char *name) 760 const char *name,
761 struct config_group **new_group)
756{ 762{
757 struct o2nm_cluster *cluster = NULL; 763 struct o2nm_cluster *cluster = NULL;
758 struct o2nm_node_group *ns = NULL; 764 struct o2nm_node_group *ns = NULL;
759 struct config_group *o2hb_group = NULL, *ret = NULL; 765 struct config_group *o2hb_group = NULL;
760 void *defs = NULL; 766 void *defs = NULL;
767 int ret = 0;
761 768
762 /* this runs under the parent dir's i_mutex; there can be only 769 /* this runs under the parent dir's i_mutex; there can be only
763 * one caller in here at a time */ 770 * one caller in here at a time */
764 if (o2nm_single_cluster) 771 if (o2nm_single_cluster) {
765 goto out; /* ENOSPC */ 772 ret = -ENOSPC;
773 goto out;
774 }
766 775
767 cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL); 776 cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL);
768 ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL); 777 ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL);
769 defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); 778 defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
770 o2hb_group = o2hb_alloc_hb_set(); 779 o2hb_group = o2hb_alloc_hb_set();
771 if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) 780 if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) {
781 ret = -ENOMEM;
772 goto out; 782 goto out;
783 }
773 784
774 config_group_init_type_name(&cluster->cl_group, name, 785 config_group_init_type_name(&cluster->cl_group, name,
775 &o2nm_cluster_type); 786 &o2nm_cluster_type);
@@ -786,11 +797,11 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
786 cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT; 797 cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT;
787 cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT; 798 cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT;
788 799
789 ret = &cluster->cl_group; 800 *new_group = &cluster->cl_group;
790 o2nm_single_cluster = cluster; 801 o2nm_single_cluster = cluster;
791 802
792out: 803out:
793 if (ret == NULL) { 804 if (ret) {
794 kfree(cluster); 805 kfree(cluster);
795 kfree(ns); 806 kfree(ns);
796 o2hb_free_hb_set(o2hb_group); 807 o2hb_free_hb_set(o2hb_group);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 80e20d9f2780..eae3d643a5e4 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -31,6 +31,7 @@
31#include <linux/pagemap.h> 31#include <linux/pagemap.h>
32#include <linux/debugfs.h> 32#include <linux/debugfs.h>
33#include <linux/seq_file.h> 33#include <linux/seq_file.h>
34#include <linux/time.h>
34 35
35#define MLOG_MASK_PREFIX ML_DLM_GLUE 36#define MLOG_MASK_PREFIX ML_DLM_GLUE
36#include <cluster/masklog.h> 37#include <cluster/masklog.h>
@@ -59,6 +60,9 @@ struct ocfs2_mask_waiter {
59 struct completion mw_complete; 60 struct completion mw_complete;
60 unsigned long mw_mask; 61 unsigned long mw_mask;
61 unsigned long mw_goal; 62 unsigned long mw_goal;
63#ifdef CONFIG_OCFS2_FS_STATS
64 unsigned long long mw_lock_start;
65#endif
62}; 66};
63 67
64static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres); 68static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
@@ -366,6 +370,75 @@ static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
366 spin_unlock(&ocfs2_dlm_tracking_lock); 370 spin_unlock(&ocfs2_dlm_tracking_lock);
367} 371}
368 372
373#ifdef CONFIG_OCFS2_FS_STATS
374static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
375{
376 res->l_lock_num_prmode = 0;
377 res->l_lock_num_prmode_failed = 0;
378 res->l_lock_total_prmode = 0;
379 res->l_lock_max_prmode = 0;
380 res->l_lock_num_exmode = 0;
381 res->l_lock_num_exmode_failed = 0;
382 res->l_lock_total_exmode = 0;
383 res->l_lock_max_exmode = 0;
384 res->l_lock_refresh = 0;
385}
386
387static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level,
388 struct ocfs2_mask_waiter *mw, int ret)
389{
390 unsigned long long *num, *sum;
391 unsigned int *max, *failed;
392 struct timespec ts = current_kernel_time();
393 unsigned long long time = timespec_to_ns(&ts) - mw->mw_lock_start;
394
395 if (level == LKM_PRMODE) {
396 num = &res->l_lock_num_prmode;
397 sum = &res->l_lock_total_prmode;
398 max = &res->l_lock_max_prmode;
399 failed = &res->l_lock_num_prmode_failed;
400 } else if (level == LKM_EXMODE) {
401 num = &res->l_lock_num_exmode;
402 sum = &res->l_lock_total_exmode;
403 max = &res->l_lock_max_exmode;
404 failed = &res->l_lock_num_exmode_failed;
405 } else
406 return;
407
408 (*num)++;
409 (*sum) += time;
410 if (time > *max)
411 *max = time;
412 if (ret)
413 (*failed)++;
414}
415
416static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
417{
418 lockres->l_lock_refresh++;
419}
420
421static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
422{
423 struct timespec ts = current_kernel_time();
424 mw->mw_lock_start = timespec_to_ns(&ts);
425}
426#else
427static inline void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
428{
429}
430static inline void ocfs2_update_lock_stats(struct ocfs2_lock_res *res,
431 int level, struct ocfs2_mask_waiter *mw, int ret)
432{
433}
434static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
435{
436}
437static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
438{
439}
440#endif
441
369static void ocfs2_lock_res_init_common(struct ocfs2_super *osb, 442static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
370 struct ocfs2_lock_res *res, 443 struct ocfs2_lock_res *res,
371 enum ocfs2_lock_type type, 444 enum ocfs2_lock_type type,
@@ -385,6 +458,8 @@ static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
385 res->l_flags = OCFS2_LOCK_INITIALIZED; 458 res->l_flags = OCFS2_LOCK_INITIALIZED;
386 459
387 ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug); 460 ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
461
462 ocfs2_init_lock_stats(res);
388} 463}
389 464
390void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) 465void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
@@ -1048,6 +1123,7 @@ static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw)
1048{ 1123{
1049 INIT_LIST_HEAD(&mw->mw_item); 1124 INIT_LIST_HEAD(&mw->mw_item);
1050 init_completion(&mw->mw_complete); 1125 init_completion(&mw->mw_complete);
1126 ocfs2_init_start_time(mw);
1051} 1127}
1052 1128
1053static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw) 1129static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
@@ -1254,6 +1330,7 @@ out:
1254 goto again; 1330 goto again;
1255 mlog_errno(ret); 1331 mlog_errno(ret);
1256 } 1332 }
1333 ocfs2_update_lock_stats(lockres, level, &mw, ret);
1257 1334
1258 mlog_exit(ret); 1335 mlog_exit(ret);
1259 return ret; 1336 return ret;
@@ -1983,6 +2060,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
1983 le32_to_cpu(fe->i_flags)); 2060 le32_to_cpu(fe->i_flags));
1984 2061
1985 ocfs2_refresh_inode(inode, fe); 2062 ocfs2_refresh_inode(inode, fe);
2063 ocfs2_track_lock_refresh(lockres);
1986 } 2064 }
1987 2065
1988 status = 0; 2066 status = 0;
@@ -2267,6 +2345,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
2267 2345
2268 if (status < 0) 2346 if (status < 0)
2269 mlog_errno(status); 2347 mlog_errno(status);
2348 ocfs2_track_lock_refresh(lockres);
2270 } 2349 }
2271bail: 2350bail:
2272 mlog_exit(status); 2351 mlog_exit(status);
@@ -2461,7 +2540,7 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
2461} 2540}
2462 2541
2463/* So that debugfs.ocfs2 can determine which format is being used */ 2542/* So that debugfs.ocfs2 can determine which format is being used */
2464#define OCFS2_DLM_DEBUG_STR_VERSION 1 2543#define OCFS2_DLM_DEBUG_STR_VERSION 2
2465static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) 2544static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
2466{ 2545{
2467 int i; 2546 int i;
@@ -2502,6 +2581,47 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
2502 for(i = 0; i < DLM_LVB_LEN; i++) 2581 for(i = 0; i < DLM_LVB_LEN; i++)
2503 seq_printf(m, "0x%x\t", lvb[i]); 2582 seq_printf(m, "0x%x\t", lvb[i]);
2504 2583
2584#ifdef CONFIG_OCFS2_FS_STATS
2585# define lock_num_prmode(_l) (_l)->l_lock_num_prmode
2586# define lock_num_exmode(_l) (_l)->l_lock_num_exmode
2587# define lock_num_prmode_failed(_l) (_l)->l_lock_num_prmode_failed
2588# define lock_num_exmode_failed(_l) (_l)->l_lock_num_exmode_failed
2589# define lock_total_prmode(_l) (_l)->l_lock_total_prmode
2590# define lock_total_exmode(_l) (_l)->l_lock_total_exmode
2591# define lock_max_prmode(_l) (_l)->l_lock_max_prmode
2592# define lock_max_exmode(_l) (_l)->l_lock_max_exmode
2593# define lock_refresh(_l) (_l)->l_lock_refresh
2594#else
2595# define lock_num_prmode(_l) (0ULL)
2596# define lock_num_exmode(_l) (0ULL)
2597# define lock_num_prmode_failed(_l) (0)
2598# define lock_num_exmode_failed(_l) (0)
2599# define lock_total_prmode(_l) (0ULL)
2600# define lock_total_exmode(_l) (0ULL)
2601# define lock_max_prmode(_l) (0)
2602# define lock_max_exmode(_l) (0)
2603# define lock_refresh(_l) (0)
2604#endif
2605 /* The following seq_print was added in version 2 of this output */
2606 seq_printf(m, "%llu\t"
2607 "%llu\t"
2608 "%u\t"
2609 "%u\t"
2610 "%llu\t"
2611 "%llu\t"
2612 "%u\t"
2613 "%u\t"
2614 "%u\t",
2615 lock_num_prmode(lockres),
2616 lock_num_exmode(lockres),
2617 lock_num_prmode_failed(lockres),
2618 lock_num_exmode_failed(lockres),
2619 lock_total_prmode(lockres),
2620 lock_total_exmode(lockres),
2621 lock_max_prmode(lockres),
2622 lock_max_exmode(lockres),
2623 lock_refresh(lockres));
2624
2505 /* End the line */ 2625 /* End the line */
2506 seq_printf(m, "\n"); 2626 seq_printf(m, "\n");
2507 return 0; 2627 return 0;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 57e0d30cde98..e8514e8b6ce8 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2202,7 +2202,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2202 2202
2203 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); 2203 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
2204 if (ret == -EINVAL) 2204 if (ret == -EINVAL)
2205 mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); 2205 mlog(0, "generic_file_aio_read returned -EINVAL\n");
2206 2206
2207 /* buffered aio wouldn't have proper lock coverage today */ 2207 /* buffered aio wouldn't have proper lock coverage today */
2208 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 2208 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 9698338adc39..a8c19cb3cfdd 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -329,7 +329,7 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
329 329
330 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks); 330 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
331 331
332#ifdef OCFS2_DEBUG_FS 332#ifdef CONFIG_OCFS2_DEBUG_FS
333 status = 1; 333 status = 1;
334#else 334#else
335 status = journal_extend(handle, nblocks); 335 status = journal_extend(handle, nblocks);
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index be774bdc8b36..28e492e4ec88 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -498,7 +498,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
498 498
499 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; 499 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
500 500
501#ifdef OCFS2_DEBUG_FS 501#ifdef CONFIG_OCFS2_DEBUG_FS
502 if (le32_to_cpu(alloc->id1.bitmap1.i_used) != 502 if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
503 ocfs2_local_alloc_count_bits(alloc)) { 503 ocfs2_local_alloc_count_bits(alloc)) {
504 ocfs2_error(osb->sb, "local alloc inode %llu says it has " 504 ocfs2_error(osb->sb, "local alloc inode %llu says it has "
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 31692379c170..1cb814be8ef1 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -132,6 +132,18 @@ struct ocfs2_lock_res {
132 wait_queue_head_t l_event; 132 wait_queue_head_t l_event;
133 133
134 struct list_head l_debug_list; 134 struct list_head l_debug_list;
135
136#ifdef CONFIG_OCFS2_FS_STATS
137 unsigned long long l_lock_num_prmode; /* PR acquires */
138 unsigned long long l_lock_num_exmode; /* EX acquires */
139 unsigned int l_lock_num_prmode_failed; /* Failed PR gets */
140 unsigned int l_lock_num_exmode_failed; /* Failed EX gets */
141 unsigned long long l_lock_total_prmode; /* Tot wait for PR */
142 unsigned long long l_lock_total_exmode; /* Tot wait for EX */
143 unsigned int l_lock_max_prmode; /* Max wait for PR */
144 unsigned int l_lock_max_exmode; /* Max wait for EX */
145 unsigned int l_lock_refresh; /* Disk refreshes */
146#endif
135}; 147};
136 148
137struct ocfs2_dlm_debug { 149struct ocfs2_dlm_debug {
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 52c426665154..3f1945177629 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -901,7 +901,7 @@ static inline int ocfs2_sprintf_system_inode_name(char *buf, int len,
901 * list has a copy per slot. 901 * list has a copy per slot.
902 */ 902 */
903 if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE) 903 if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)
904 chars = snprintf(buf, len, 904 chars = snprintf(buf, len, "%s",
905 ocfs2_system_inodes[type].si_name); 905 ocfs2_system_inodes[type].si_name);
906 else 906 else
907 chars = snprintf(buf, len, 907 chars = snprintf(buf, len,
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index bd7e0f3acfc7..353fc35c6748 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -550,26 +550,17 @@ static ssize_t ocfs2_control_read(struct file *file,
550 size_t count, 550 size_t count,
551 loff_t *ppos) 551 loff_t *ppos)
552{ 552{
553 char *proto_string = OCFS2_CONTROL_PROTO; 553 ssize_t ret;
554 size_t to_write = 0;
555
556 if (*ppos >= OCFS2_CONTROL_PROTO_LEN)
557 return 0;
558
559 to_write = OCFS2_CONTROL_PROTO_LEN - *ppos;
560 if (to_write > count)
561 to_write = count;
562 if (copy_to_user(buf, proto_string + *ppos, to_write))
563 return -EFAULT;
564 554
565 *ppos += to_write; 555 ret = simple_read_from_buffer(buf, count, ppos,
556 OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN);
566 557
567 /* Have we read the whole protocol list? */ 558 /* Have we read the whole protocol list? */
568 if (*ppos >= OCFS2_CONTROL_PROTO_LEN) 559 if (ret > 0 && *ppos >= OCFS2_CONTROL_PROTO_LEN)
569 ocfs2_control_set_handshake_state(file, 560 ocfs2_control_set_handshake_state(file,
570 OCFS2_CONTROL_HANDSHAKE_READ); 561 OCFS2_CONTROL_HANDSHAKE_READ);
571 562
572 return to_write; 563 return ret;
573} 564}
574 565
575static int ocfs2_control_release(struct inode *inode, struct file *file) 566static int ocfs2_control_release(struct inode *inode, struct file *file)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index df63ba20ae90..ccecfe5094fa 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1703,7 +1703,11 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
1703 local = ocfs2_mount_local(osb); 1703 local = ocfs2_mount_local(osb);
1704 1704
1705 /* will play back anything left in the journal. */ 1705 /* will play back anything left in the journal. */
1706 ocfs2_journal_load(osb->journal, local); 1706 status = ocfs2_journal_load(osb->journal, local);
1707 if (status < 0) {
1708 mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status);
1709 goto finally;
1710 }
1707 1711
1708 if (dirty) { 1712 if (dirty) {
1709 /* recover my local alloc if we didn't unmount cleanly. */ 1713 /* recover my local alloc if we didn't unmount cleanly. */
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
new file mode 100644
index 000000000000..91ceeda7e5bf
--- /dev/null
+++ b/fs/ubifs/Kconfig
@@ -0,0 +1,72 @@
1config UBIFS_FS
2 tristate "UBIFS file system support"
3 select CRC16
4 select CRC32
5 select CRYPTO if UBIFS_FS_ADVANCED_COMPR
6 select CRYPTO if UBIFS_FS_LZO
7 select CRYPTO if UBIFS_FS_ZLIB
8 select CRYPTO_LZO if UBIFS_FS_LZO
9 select CRYPTO_DEFLATE if UBIFS_FS_ZLIB
10 depends on MTD_UBI
11 help
12 UBIFS is a file system for flash devices which works on top of UBI.
13
14config UBIFS_FS_XATTR
15 bool "Extended attributes support"
16 depends on UBIFS_FS
17 help
18 This option enables support of extended attributes.
19
20config UBIFS_FS_ADVANCED_COMPR
21 bool "Advanced compression options"
22 depends on UBIFS_FS
23 help
24 This option allows to explicitly choose which compressions, if any,
25 are enabled in UBIFS. Removing compressors means inability to read
26 existing file systems.
27
28 If unsure, say 'N'.
29
30config UBIFS_FS_LZO
31 bool "LZO compression support" if UBIFS_FS_ADVANCED_COMPR
32 depends on UBIFS_FS
33 default y
34 help
35 LZO compressor is generally faster than zlib but compresses worse.
36 Say 'Y' if unsure.
37
38config UBIFS_FS_ZLIB
39 bool "ZLIB compression support" if UBIFS_FS_ADVANCED_COMPR
40 depends on UBIFS_FS
41 default y
42 help
43 Zlib compresses better than LZO but it is slower. Say 'Y' if unsure.
44
45# Debugging-related stuff
46config UBIFS_FS_DEBUG
47 bool "Enable debugging"
48 depends on UBIFS_FS
49 select DEBUG_FS
50 select KALLSYMS_ALL
51 help
52 This option enables UBIFS debugging.
53
54config UBIFS_FS_DEBUG_MSG_LVL
55 int "Default message level (0 = no extra messages, 3 = lots)"
56 depends on UBIFS_FS_DEBUG
57 default "0"
58 help
59 This controls the amount of debugging messages produced by UBIFS.
60 If reporting bugs, please try to have available a full dump of the
61 messages at level 1 while the misbehaviour was occurring. Level 2
62 may become necessary if level 1 messages were not enough to find the
63 bug. Generally Level 3 should be avoided.
64
65config UBIFS_FS_DEBUG_CHKS
66 bool "Enable extra checks"
67 depends on UBIFS_FS_DEBUG
68 help
69 If extra checks are enabled UBIFS will check the consistency of its
70 internal data structures during operation. However, UBIFS performance
71 is dramatically slower when this option is selected especially if the
72 file system is large.
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile
new file mode 100644
index 000000000000..80e93c35e496
--- /dev/null
+++ b/fs/ubifs/Makefile
@@ -0,0 +1,9 @@
1obj-$(CONFIG_UBIFS_FS) += ubifs.o
2
3ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o
4ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o
5ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o
6ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o
7
8ubifs-$(CONFIG_UBIFS_FS_DEBUG) += debug.o
9ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
new file mode 100644
index 000000000000..d81fb9ed2b8e
--- /dev/null
+++ b/fs/ubifs/budget.c
@@ -0,0 +1,731 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Adrian Hunter
20 * Artem Bityutskiy (Битюцкий Артём)
21 */
22
23/*
24 * This file implements the budgeting sub-system which is responsible for UBIFS
25 * space management.
26 *
27 * Factors such as compression, wasted space at the ends of LEBs, space in other
28 * journal heads, the effect of updates on the index, and so on, make it
29 * impossible to accurately predict the amount of space needed. Consequently
30 * approximations are used.
31 */
32
33#include "ubifs.h"
34#include <linux/writeback.h>
35#include <asm/div64.h>
36
37/*
38 * When pessimistic budget calculations say that there is not enough space,
39 * UBIFS starts writing back dirty inodes and pages, doing garbage collection,
40 * or committing. The below constants define maximum number of times UBIFS
41 * repeats the operations.
42 */
43#define MAX_SHRINK_RETRIES 8
44#define MAX_GC_RETRIES 4
45#define MAX_CMT_RETRIES 2
46#define MAX_NOSPC_RETRIES 1
47
48/*
49 * The below constant defines amount of dirty pages which should be written
50 * back when trying to shrink the liability.
51 */
52#define NR_TO_WRITE 16
53
54/**
55 * struct retries_info - information about re-tries while making free space.
56 * @prev_liability: previous liability
57 * @shrink_cnt: how many times the liability was shrunk
58 * @shrink_retries: count of liability shrink re-tries (increased when
59 * liability does not shrink)
60 * @try_gc: GC should be tried first
61 * @gc_retries: how many times GC was run
62 * @cmt_retries: how many times commit has been done
63 * @nospc_retries: how many times GC returned %-ENOSPC
64 *
65 * Since we consider budgeting to be the fast-path, and this structure has to
66 * be allocated on stack and zeroed out, we make it smaller using bit-fields.
67 */
68struct retries_info {
69 long long prev_liability;
70 unsigned int shrink_cnt;
71 unsigned int shrink_retries:5;
72 unsigned int try_gc:1;
73 unsigned int gc_retries:4;
74 unsigned int cmt_retries:3;
75 unsigned int nospc_retries:1;
76};
77
78/**
79 * shrink_liability - write-back some dirty pages/inodes.
80 * @c: UBIFS file-system description object
81 * @nr_to_write: how many dirty pages to write-back
82 *
83 * This function shrinks UBIFS liability by means of writing back some amount
84 * of dirty inodes and their pages. Returns the amount of pages which were
85 * written back. The returned value does not include dirty inodes which were
86 * synchronized.
87 *
88 * Note, this function synchronizes even VFS inodes which are locked
89 * (@i_mutex) by the caller of the budgeting function, because write-back does
90 * not touch @i_mutex.
91 */
92static int shrink_liability(struct ubifs_info *c, int nr_to_write)
93{
94 int nr_written;
95 struct writeback_control wbc = {
96 .sync_mode = WB_SYNC_NONE,
97 .range_end = LLONG_MAX,
98 .nr_to_write = nr_to_write,
99 };
100
101 generic_sync_sb_inodes(c->vfs_sb, &wbc);
102 nr_written = nr_to_write - wbc.nr_to_write;
103
104 if (!nr_written) {
105 /*
106 * Re-try again but wait on pages/inodes which are being
107 * written-back concurrently (e.g., by pdflush).
108 */
109 memset(&wbc, 0, sizeof(struct writeback_control));
110 wbc.sync_mode = WB_SYNC_ALL;
111 wbc.range_end = LLONG_MAX;
112 wbc.nr_to_write = nr_to_write;
113 generic_sync_sb_inodes(c->vfs_sb, &wbc);
114 nr_written = nr_to_write - wbc.nr_to_write;
115 }
116
117 dbg_budg("%d pages were written back", nr_written);
118 return nr_written;
119}
120
121
122/**
123 * run_gc - run garbage collector.
124 * @c: UBIFS file-system description object
125 *
126 * This function runs garbage collector to make some more free space. Returns
127 * zero if a free LEB has been produced, %-EAGAIN if commit is required, and a
128 * negative error code in case of failure.
129 */
130static int run_gc(struct ubifs_info *c)
131{
132 int err, lnum;
133
134 /* Make some free space by garbage-collecting dirty space */
135 down_read(&c->commit_sem);
136 lnum = ubifs_garbage_collect(c, 1);
137 up_read(&c->commit_sem);
138 if (lnum < 0)
139 return lnum;
140
141 /* GC freed one LEB, return it to lprops */
142 dbg_budg("GC freed LEB %d", lnum);
143 err = ubifs_return_leb(c, lnum);
144 if (err)
145 return err;
146 return 0;
147}
148
149/**
150 * make_free_space - make more free space on the file-system.
151 * @c: UBIFS file-system description object
152 * @ri: information about previous invocations of this function
153 *
154 * This function is called when an operation cannot be budgeted because there
155 * is supposedly no free space. But in most cases there is some free space:
156 * o budgeting is pessimistic, so it always budgets more then it is actually
157 * needed, so shrinking the liability is one way to make free space - the
158 * cached data will take less space then it was budgeted for;
159 * o GC may turn some dark space into free space (budgeting treats dark space
160 * as not available);
161 * o commit may free some LEB, i.e., turn freeable LEBs into free LEBs.
162 *
163 * So this function tries to do the above. Returns %-EAGAIN if some free space
164 * was presumably made and the caller has to re-try budgeting the operation.
165 * Returns %-ENOSPC if it couldn't do more free space, and other negative error
166 * codes on failures.
167 */
168static int make_free_space(struct ubifs_info *c, struct retries_info *ri)
169{
170 int err;
171
172 /*
173 * If we have some dirty pages and inodes (liability), try to write
174 * them back unless this was tried too many times without effect
175 * already.
176 */
177 if (ri->shrink_retries < MAX_SHRINK_RETRIES && !ri->try_gc) {
178 long long liability;
179
180 spin_lock(&c->space_lock);
181 liability = c->budg_idx_growth + c->budg_data_growth +
182 c->budg_dd_growth;
183 spin_unlock(&c->space_lock);
184
185 if (ri->prev_liability >= liability) {
186 /* Liability does not shrink, next time try GC then */
187 ri->shrink_retries += 1;
188 if (ri->gc_retries < MAX_GC_RETRIES)
189 ri->try_gc = 1;
190 dbg_budg("liability did not shrink: retries %d of %d",
191 ri->shrink_retries, MAX_SHRINK_RETRIES);
192 }
193
194 dbg_budg("force write-back (count %d)", ri->shrink_cnt);
195 shrink_liability(c, NR_TO_WRITE + ri->shrink_cnt);
196
197 ri->prev_liability = liability;
198 ri->shrink_cnt += 1;
199 return -EAGAIN;
200 }
201
202 /*
203 * Try to run garbage collector unless it was already tried too many
204 * times.
205 */
206 if (ri->gc_retries < MAX_GC_RETRIES) {
207 ri->gc_retries += 1;
208 dbg_budg("run GC, retries %d of %d",
209 ri->gc_retries, MAX_GC_RETRIES);
210
211 ri->try_gc = 0;
212 err = run_gc(c);
213 if (!err)
214 return -EAGAIN;
215
216 if (err == -EAGAIN) {
217 dbg_budg("GC asked to commit");
218 err = ubifs_run_commit(c);
219 if (err)
220 return err;
221 return -EAGAIN;
222 }
223
224 if (err != -ENOSPC)
225 return err;
226
227 /*
228 * GC could not make any progress. If this is the first time,
229 * then it makes sense to try to commit, because it might make
230 * some dirty space.
231 */
232 dbg_budg("GC returned -ENOSPC, retries %d",
233 ri->nospc_retries);
234 if (ri->nospc_retries >= MAX_NOSPC_RETRIES)
235 return err;
236 ri->nospc_retries += 1;
237 }
238
239 /* Neither GC nor write-back helped, try to commit */
240 if (ri->cmt_retries < MAX_CMT_RETRIES) {
241 ri->cmt_retries += 1;
242 dbg_budg("run commit, retries %d of %d",
243 ri->cmt_retries, MAX_CMT_RETRIES);
244 err = ubifs_run_commit(c);
245 if (err)
246 return err;
247 return -EAGAIN;
248 }
249 return -ENOSPC;
250}
251
252/**
253 * ubifs_calc_min_idx_lebs - calculate amount of eraseblocks for the index.
254 * @c: UBIFS file-system description object
255 *
256 * This function calculates and returns the number of eraseblocks which should
257 * be kept for index usage.
258 */
259int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
260{
261 int ret;
262 uint64_t idx_size;
263
264 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
265
266 /* And make sure we have twice the index size of space reserved */
267 idx_size <<= 1;
268
269 /*
270 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes'
271 * pair, nor similarly the two variables for the new index size, so we
272 * have to do this costly 64-bit division on fast-path.
273 */
274 if (do_div(idx_size, c->leb_size - c->max_idx_node_sz))
275 ret = idx_size + 1;
276 else
277 ret = idx_size;
278 /*
279 * The index head is not available for the in-the-gaps method, so add an
280 * extra LEB to compensate.
281 */
282 ret += 1;
283 /*
284 * At present the index needs at least 2 LEBs: one for the index head
285 * and one for in-the-gaps method (which currently does not cater for
286 * the index head and so excludes it from consideration).
287 */
288 if (ret < 2)
289 ret = 2;
290 return ret;
291}
292
293/**
294 * ubifs_calc_available - calculate available FS space.
295 * @c: UBIFS file-system description object
296 * @min_idx_lebs: minimum number of LEBs reserved for the index
297 *
298 * This function calculates and returns amount of FS space available for use.
299 */
300long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs)
301{
302 int subtract_lebs;
303 long long available;
304
305 /*
306 * Force the amount available to the total size reported if the used
307 * space is zero.
308 */
309 if (c->lst.total_used <= UBIFS_INO_NODE_SZ &&
310 c->budg_data_growth + c->budg_dd_growth == 0) {
311 /* Do the same calculation as for c->block_cnt */
312 available = c->main_lebs - 2;
313 available *= c->leb_size - c->dark_wm;
314 return available;
315 }
316
317 available = c->main_bytes - c->lst.total_used;
318
319 /*
320 * Now 'available' contains theoretically available flash space
321 * assuming there is no index, so we have to subtract the space which
322 * is reserved for the index.
323 */
324 subtract_lebs = min_idx_lebs;
325
326 /* Take into account that GC reserves one LEB for its own needs */
327 subtract_lebs += 1;
328
329 /*
330 * The GC journal head LEB is not really accessible. And since
331 * different write types go to different heads, we may count only on
332 * one head's space.
333 */
334 subtract_lebs += c->jhead_cnt - 1;
335
336 /* We also reserve one LEB for deletions, which bypass budgeting */
337 subtract_lebs += 1;
338
339 available -= (long long)subtract_lebs * c->leb_size;
340
341 /* Subtract the dead space which is not available for use */
342 available -= c->lst.total_dead;
343
344 /*
345 * Subtract dark space, which might or might not be usable - it depends
346 * on the data which we have on the media and which will be written. If
347 * this is a lot of uncompressed or not-compressible data, the dark
348 * space cannot be used.
349 */
350 available -= c->lst.total_dark;
351
352 /*
353 * However, there is more dark space. The index may be bigger than
354 * @min_idx_lebs. Those extra LEBs are assumed to be available, but
355 * their dark space is not included in total_dark, so it is subtracted
356 * here.
357 */
358 if (c->lst.idx_lebs > min_idx_lebs) {
359 subtract_lebs = c->lst.idx_lebs - min_idx_lebs;
360 available -= subtract_lebs * c->dark_wm;
361 }
362
363 /* The calculations are rough and may end up with a negative number */
364 return available > 0 ? available : 0;
365}
366
367/**
368 * can_use_rp - check whether the user is allowed to use reserved pool.
369 * @c: UBIFS file-system description object
370 *
371 * UBIFS has so-called "reserved pool" which is flash space reserved
372 * for the superuser and for uses whose UID/GID is recorded in UBIFS superblock.
373 * This function checks whether current user is allowed to use reserved pool.
374 * Returns %1 current user is allowed to use reserved pool and %0 otherwise.
375 */
376static int can_use_rp(struct ubifs_info *c)
377{
378 if (current->fsuid == c->rp_uid || capable(CAP_SYS_RESOURCE) ||
379 (c->rp_gid != 0 && in_group_p(c->rp_gid)))
380 return 1;
381 return 0;
382}
383
384/**
385 * do_budget_space - reserve flash space for index and data growth.
386 * @c: UBIFS file-system description object
387 *
388 * This function makes sure UBIFS has enough free eraseblocks for index growth
389 * and data.
390 *
391 * When budgeting index space, UBIFS reserves twice as more LEBs as the index
392 * would take if it was consolidated and written to the flash. This guarantees
393 * that the "in-the-gaps" commit method always succeeds and UBIFS will always
394 * be able to commit dirty index. So this function basically adds amount of
395 * budgeted index space to the size of the current index, multiplies this by 2,
396 * and makes sure this does not exceed the amount of free eraseblocks.
397 *
398 * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables:
399 * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might
400 * be large, because UBIFS does not do any index consolidation as long as
401 * there is free space. IOW, the index may take a lot of LEBs, but the LEBs
402 * will contain a lot of dirt.
403 * o @c->min_idx_lebs is the the index presumably takes. IOW, the index may be
404 * consolidated to take up to @c->min_idx_lebs LEBs.
405 *
406 * This function returns zero in case of success, and %-ENOSPC in case of
407 * failure.
408 */
409static int do_budget_space(struct ubifs_info *c)
410{
411 long long outstanding, available;
412 int lebs, rsvd_idx_lebs, min_idx_lebs;
413
414 /* First budget index space */
415 min_idx_lebs = ubifs_calc_min_idx_lebs(c);
416
417 /* Now 'min_idx_lebs' contains number of LEBs to reserve */
418 if (min_idx_lebs > c->lst.idx_lebs)
419 rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs;
420 else
421 rsvd_idx_lebs = 0;
422
423 /*
424 * The number of LEBs that are available to be used by the index is:
425 *
426 * @c->lst.empty_lebs + @c->freeable_cnt + @c->idx_gc_cnt -
427 * @c->lst.taken_empty_lebs
428 *
429 * @empty_lebs are available because they are empty. @freeable_cnt are
430 * available because they contain only free and dirty space and the
431 * index allocation always occurs after wbufs are synch'ed.
432 * @idx_gc_cnt are available because they are index LEBs that have been
433 * garbage collected (including trivial GC) and are awaiting the commit
434 * before they can be unmapped - note that the in-the-gaps method will
435 * grab these if it needs them. @taken_empty_lebs are empty_lebs that
436 * have already been allocated for some purpose (also includes those
437 * LEBs on the @idx_gc list).
438 *
439 * Note, @taken_empty_lebs may temporarily be higher by one because of
440 * the way we serialize LEB allocations and budgeting. See a comment in
441 * 'ubifs_find_free_space()'.
442 */
443 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
444 c->lst.taken_empty_lebs;
445 if (unlikely(rsvd_idx_lebs > lebs)) {
446 dbg_budg("out of indexing space: min_idx_lebs %d (old %d), "
447 "rsvd_idx_lebs %d", min_idx_lebs, c->min_idx_lebs,
448 rsvd_idx_lebs);
449 return -ENOSPC;
450 }
451
452 available = ubifs_calc_available(c, min_idx_lebs);
453 outstanding = c->budg_data_growth + c->budg_dd_growth;
454
455 if (unlikely(available < outstanding)) {
456 dbg_budg("out of data space: available %lld, outstanding %lld",
457 available, outstanding);
458 return -ENOSPC;
459 }
460
461 if (available - outstanding <= c->rp_size && !can_use_rp(c))
462 return -ENOSPC;
463
464 c->min_idx_lebs = min_idx_lebs;
465 return 0;
466}
467
468/**
469 * calc_idx_growth - calculate approximate index growth from budgeting request.
470 * @c: UBIFS file-system description object
471 * @req: budgeting request
472 *
473 * For now we assume each new node adds one znode. But this is rather poor
474 * approximation, though.
475 */
476static int calc_idx_growth(const struct ubifs_info *c,
477 const struct ubifs_budget_req *req)
478{
479 int znodes;
480
481 znodes = req->new_ino + (req->new_page << UBIFS_BLOCKS_PER_PAGE_SHIFT) +
482 req->new_dent;
483 return znodes * c->max_idx_node_sz;
484}
485
486/**
487 * calc_data_growth - calculate approximate amount of new data from budgeting
488 * request.
489 * @c: UBIFS file-system description object
490 * @req: budgeting request
491 */
492static int calc_data_growth(const struct ubifs_info *c,
493 const struct ubifs_budget_req *req)
494{
495 int data_growth;
496
497 data_growth = req->new_ino ? c->inode_budget : 0;
498 if (req->new_page)
499 data_growth += c->page_budget;
500 if (req->new_dent)
501 data_growth += c->dent_budget;
502 data_growth += req->new_ino_d;
503 return data_growth;
504}
505
506/**
507 * calc_dd_growth - calculate approximate amount of data which makes other data
508 * dirty from budgeting request.
509 * @c: UBIFS file-system description object
510 * @req: budgeting request
511 */
512static int calc_dd_growth(const struct ubifs_info *c,
513 const struct ubifs_budget_req *req)
514{
515 int dd_growth;
516
517 dd_growth = req->dirtied_page ? c->page_budget : 0;
518
519 if (req->dirtied_ino)
520 dd_growth += c->inode_budget << (req->dirtied_ino - 1);
521 if (req->mod_dent)
522 dd_growth += c->dent_budget;
523 dd_growth += req->dirtied_ino_d;
524 return dd_growth;
525}
526
527/**
528 * ubifs_budget_space - ensure there is enough space to complete an operation.
529 * @c: UBIFS file-system description object
530 * @req: budget request
531 *
532 * This function allocates budget for an operation. It uses pessimistic
533 * approximation of how much flash space the operation needs. The goal of this
534 * function is to make sure UBIFS always has flash space to flush all dirty
535 * pages, dirty inodes, and dirty znodes (liability). This function may force
536 * commit, garbage-collection or write-back. Returns zero in case of success,
537 * %-ENOSPC if there is no free space and other negative error codes in case of
538 * failures.
539 */
540int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
541{
542 int uninitialized_var(cmt_retries), uninitialized_var(wb_retries);
543 int err, idx_growth, data_growth, dd_growth;
544 struct retries_info ri;
545
546 ubifs_assert(req->dirtied_ino <= 4);
547 ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4);
548
549 data_growth = calc_data_growth(c, req);
550 dd_growth = calc_dd_growth(c, req);
551 if (!data_growth && !dd_growth)
552 return 0;
553 idx_growth = calc_idx_growth(c, req);
554 memset(&ri, 0, sizeof(struct retries_info));
555
556again:
557 spin_lock(&c->space_lock);
558 ubifs_assert(c->budg_idx_growth >= 0);
559 ubifs_assert(c->budg_data_growth >= 0);
560 ubifs_assert(c->budg_dd_growth >= 0);
561
562 if (unlikely(c->nospace) && (c->nospace_rp || !can_use_rp(c))) {
563 dbg_budg("no space");
564 spin_unlock(&c->space_lock);
565 return -ENOSPC;
566 }
567
568 c->budg_idx_growth += idx_growth;
569 c->budg_data_growth += data_growth;
570 c->budg_dd_growth += dd_growth;
571
572 err = do_budget_space(c);
573 if (likely(!err)) {
574 req->idx_growth = idx_growth;
575 req->data_growth = data_growth;
576 req->dd_growth = dd_growth;
577 spin_unlock(&c->space_lock);
578 return 0;
579 }
580
581 /* Restore the old values */
582 c->budg_idx_growth -= idx_growth;
583 c->budg_data_growth -= data_growth;
584 c->budg_dd_growth -= dd_growth;
585 spin_unlock(&c->space_lock);
586
587 if (req->fast) {
588 dbg_budg("no space for fast budgeting");
589 return err;
590 }
591
592 err = make_free_space(c, &ri);
593 if (err == -EAGAIN) {
594 dbg_budg("try again");
595 cond_resched();
596 goto again;
597 } else if (err == -ENOSPC) {
598 dbg_budg("FS is full, -ENOSPC");
599 c->nospace = 1;
600 if (can_use_rp(c) || c->rp_size == 0)
601 c->nospace_rp = 1;
602 smp_wmb();
603 } else
604 ubifs_err("cannot budget space, error %d", err);
605 return err;
606}
607
608/**
609 * ubifs_release_budget - release budgeted free space.
610 * @c: UBIFS file-system description object
611 * @req: budget request
612 *
613 * This function releases the space budgeted by 'ubifs_budget_space()'. Note,
614 * since the index changes (which were budgeted for in @req->idx_growth) will
615 * only be written to the media on commit, this function moves the index budget
616 * from @c->budg_idx_growth to @c->budg_uncommitted_idx. The latter will be
617 * zeroed by the commit operation.
618 */
619void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
620{
621 ubifs_assert(req->dirtied_ino <= 4);
622 ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4);
623 if (!req->recalculate) {
624 ubifs_assert(req->idx_growth >= 0);
625 ubifs_assert(req->data_growth >= 0);
626 ubifs_assert(req->dd_growth >= 0);
627 }
628
629 if (req->recalculate) {
630 req->data_growth = calc_data_growth(c, req);
631 req->dd_growth = calc_dd_growth(c, req);
632 req->idx_growth = calc_idx_growth(c, req);
633 }
634
635 if (!req->data_growth && !req->dd_growth)
636 return;
637
638 c->nospace = c->nospace_rp = 0;
639 smp_wmb();
640
641 spin_lock(&c->space_lock);
642 c->budg_idx_growth -= req->idx_growth;
643 c->budg_uncommitted_idx += req->idx_growth;
644 c->budg_data_growth -= req->data_growth;
645 c->budg_dd_growth -= req->dd_growth;
646 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
647
648 ubifs_assert(c->budg_idx_growth >= 0);
649 ubifs_assert(c->budg_data_growth >= 0);
650 ubifs_assert(c->min_idx_lebs < c->main_lebs);
651 spin_unlock(&c->space_lock);
652}
653
654/**
655 * ubifs_convert_page_budget - convert budget of a new page.
656 * @c: UBIFS file-system description object
657 *
658 * This function converts budget which was allocated for a new page of data to
659 * the budget of changing an existing page of data. The latter is smaller then
660 * the former, so this function only does simple re-calculation and does not
661 * involve any write-back.
662 */
663void ubifs_convert_page_budget(struct ubifs_info *c)
664{
665 spin_lock(&c->space_lock);
666 /* Release the index growth reservation */
667 c->budg_idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT;
668 /* Release the data growth reservation */
669 c->budg_data_growth -= c->page_budget;
670 /* Increase the dirty data growth reservation instead */
671 c->budg_dd_growth += c->page_budget;
672 /* And re-calculate the indexing space reservation */
673 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
674 spin_unlock(&c->space_lock);
675}
676
677/**
678 * ubifs_release_dirty_inode_budget - release dirty inode budget.
679 * @c: UBIFS file-system description object
680 * @ui: UBIFS inode to release the budget for
681 *
682 * This function releases budget corresponding to a dirty inode. It is usually
683 * called when after the inode has been written to the media and marked as
684 * clean.
685 */
686void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
687 struct ubifs_inode *ui)
688{
689 struct ubifs_budget_req req = {.dd_growth = c->inode_budget,
690 .dirtied_ino_d = ui->data_len};
691
692 ubifs_release_budget(c, &req);
693}
694
695/**
696 * ubifs_budg_get_free_space - return amount of free space.
697 * @c: UBIFS file-system description object
698 *
699 * This function returns amount of free space on the file-system.
700 */
701long long ubifs_budg_get_free_space(struct ubifs_info *c)
702{
703 int min_idx_lebs, rsvd_idx_lebs;
704 long long available, outstanding, free;
705
706 /* Do exactly the same calculations as in 'do_budget_space()' */
707 spin_lock(&c->space_lock);
708 min_idx_lebs = ubifs_calc_min_idx_lebs(c);
709
710 if (min_idx_lebs > c->lst.idx_lebs)
711 rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs;
712 else
713 rsvd_idx_lebs = 0;
714
715 if (rsvd_idx_lebs > c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt
716 - c->lst.taken_empty_lebs) {
717 spin_unlock(&c->space_lock);
718 return 0;
719 }
720
721 available = ubifs_calc_available(c, min_idx_lebs);
722 outstanding = c->budg_data_growth + c->budg_dd_growth;
723 c->min_idx_lebs = min_idx_lebs;
724 spin_unlock(&c->space_lock);
725
726 if (available > outstanding)
727 free = ubifs_reported_space(c, available - outstanding);
728 else
729 free = 0;
730 return free;
731}
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
new file mode 100644
index 000000000000..3b516316c9b3
--- /dev/null
+++ b/fs/ubifs/commit.c
@@ -0,0 +1,677 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Adrian Hunter
20 * Artem Bityutskiy (Битюцкий Артём)
21 */
22
23/*
24 * This file implements functions that manage the running of the commit process.
25 * Each affected module has its own functions to accomplish their part in the
26 * commit and those functions are called here.
27 *
28 * The commit is the process whereby all updates to the index and LEB properties
29 * are written out together and the journal becomes empty. This keeps the
30 * file system consistent - at all times the state can be recreated by reading
31 * the index and LEB properties and then replaying the journal.
32 *
33 * The commit is split into two parts named "commit start" and "commit end".
34 * During commit start, the commit process has exclusive access to the journal
35 * by holding the commit semaphore down for writing. As few I/O operations as
36 * possible are performed during commit start, instead the nodes that are to be
37 * written are merely identified. During commit end, the commit semaphore is no
38 * longer held and the journal is again in operation, allowing users to continue
39 * to use the file system while the bulk of the commit I/O is performed. The
40 * purpose of this two-step approach is to prevent the commit from causing any
41 * latency blips. Note that in any case, the commit does not prevent lookups
42 * (as permitted by the TNC mutex), or access to VFS data structures e.g. page
43 * cache.
44 */
45
46#include <linux/freezer.h>
47#include <linux/kthread.h>
48#include "ubifs.h"
49
50/**
51 * do_commit - commit the journal.
52 * @c: UBIFS file-system description object
53 *
54 * This function implements UBIFS commit. It has to be called with commit lock
55 * locked. Returns zero in case of success and a negative error code in case of
56 * failure.
57 */
58static int do_commit(struct ubifs_info *c)
59{
60 int err, new_ltail_lnum, old_ltail_lnum, i;
61 struct ubifs_zbranch zroot;
62 struct ubifs_lp_stats lst;
63
64 dbg_cmt("start");
65 if (c->ro_media) {
66 err = -EROFS;
67 goto out_up;
68 }
69
70 /* Sync all write buffers (necessary for recovery) */
71 for (i = 0; i < c->jhead_cnt; i++) {
72 err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
73 if (err)
74 goto out_up;
75 }
76
77 err = ubifs_gc_start_commit(c);
78 if (err)
79 goto out_up;
80 err = dbg_check_lprops(c);
81 if (err)
82 goto out_up;
83 err = ubifs_log_start_commit(c, &new_ltail_lnum);
84 if (err)
85 goto out_up;
86 err = ubifs_tnc_start_commit(c, &zroot);
87 if (err)
88 goto out_up;
89 err = ubifs_lpt_start_commit(c);
90 if (err)
91 goto out_up;
92 err = ubifs_orphan_start_commit(c);
93 if (err)
94 goto out_up;
95
96 ubifs_get_lp_stats(c, &lst);
97
98 up_write(&c->commit_sem);
99
100 err = ubifs_tnc_end_commit(c);
101 if (err)
102 goto out;
103 err = ubifs_lpt_end_commit(c);
104 if (err)
105 goto out;
106 err = ubifs_orphan_end_commit(c);
107 if (err)
108 goto out;
109 old_ltail_lnum = c->ltail_lnum;
110 err = ubifs_log_end_commit(c, new_ltail_lnum);
111 if (err)
112 goto out;
113 err = dbg_check_old_index(c, &zroot);
114 if (err)
115 goto out;
116
117 mutex_lock(&c->mst_mutex);
118 c->mst_node->cmt_no = cpu_to_le64(++c->cmt_no);
119 c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum);
120 c->mst_node->root_lnum = cpu_to_le32(zroot.lnum);
121 c->mst_node->root_offs = cpu_to_le32(zroot.offs);
122 c->mst_node->root_len = cpu_to_le32(zroot.len);
123 c->mst_node->ihead_lnum = cpu_to_le32(c->ihead_lnum);
124 c->mst_node->ihead_offs = cpu_to_le32(c->ihead_offs);
125 c->mst_node->index_size = cpu_to_le64(c->old_idx_sz);
126 c->mst_node->lpt_lnum = cpu_to_le32(c->lpt_lnum);
127 c->mst_node->lpt_offs = cpu_to_le32(c->lpt_offs);
128 c->mst_node->nhead_lnum = cpu_to_le32(c->nhead_lnum);
129 c->mst_node->nhead_offs = cpu_to_le32(c->nhead_offs);
130 c->mst_node->ltab_lnum = cpu_to_le32(c->ltab_lnum);
131 c->mst_node->ltab_offs = cpu_to_le32(c->ltab_offs);
132 c->mst_node->lsave_lnum = cpu_to_le32(c->lsave_lnum);
133 c->mst_node->lsave_offs = cpu_to_le32(c->lsave_offs);
134 c->mst_node->lscan_lnum = cpu_to_le32(c->lscan_lnum);
135 c->mst_node->empty_lebs = cpu_to_le32(lst.empty_lebs);
136 c->mst_node->idx_lebs = cpu_to_le32(lst.idx_lebs);
137 c->mst_node->total_free = cpu_to_le64(lst.total_free);
138 c->mst_node->total_dirty = cpu_to_le64(lst.total_dirty);
139 c->mst_node->total_used = cpu_to_le64(lst.total_used);
140 c->mst_node->total_dead = cpu_to_le64(lst.total_dead);
141 c->mst_node->total_dark = cpu_to_le64(lst.total_dark);
142 if (c->no_orphs)
143 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
144 else
145 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_NO_ORPHS);
146 err = ubifs_write_master(c);
147 mutex_unlock(&c->mst_mutex);
148 if (err)
149 goto out;
150
151 err = ubifs_log_post_commit(c, old_ltail_lnum);
152 if (err)
153 goto out;
154 err = ubifs_gc_end_commit(c);
155 if (err)
156 goto out;
157 err = ubifs_lpt_post_commit(c);
158 if (err)
159 goto out;
160
161 spin_lock(&c->cs_lock);
162 c->cmt_state = COMMIT_RESTING;
163 wake_up(&c->cmt_wq);
164 dbg_cmt("commit end");
165 spin_unlock(&c->cs_lock);
166
167 return 0;
168
169out_up:
170 up_write(&c->commit_sem);
171out:
172 ubifs_err("commit failed, error %d", err);
173 spin_lock(&c->cs_lock);
174 c->cmt_state = COMMIT_BROKEN;
175 wake_up(&c->cmt_wq);
176 spin_unlock(&c->cs_lock);
177 ubifs_ro_mode(c, err);
178 return err;
179}
180
181/**
182 * run_bg_commit - run background commit if it is needed.
183 * @c: UBIFS file-system description object
184 *
185 * This function runs background commit if it is needed. Returns zero in case
186 * of success and a negative error code in case of failure.
187 */
188static int run_bg_commit(struct ubifs_info *c)
189{
190 spin_lock(&c->cs_lock);
191 /*
192 * Run background commit only if background commit was requested or if
193 * commit is required.
194 */
195 if (c->cmt_state != COMMIT_BACKGROUND &&
196 c->cmt_state != COMMIT_REQUIRED)
197 goto out;
198 spin_unlock(&c->cs_lock);
199
200 down_write(&c->commit_sem);
201 spin_lock(&c->cs_lock);
202 if (c->cmt_state == COMMIT_REQUIRED)
203 c->cmt_state = COMMIT_RUNNING_REQUIRED;
204 else if (c->cmt_state == COMMIT_BACKGROUND)
205 c->cmt_state = COMMIT_RUNNING_BACKGROUND;
206 else
207 goto out_cmt_unlock;
208 spin_unlock(&c->cs_lock);
209
210 return do_commit(c);
211
212out_cmt_unlock:
213 up_write(&c->commit_sem);
214out:
215 spin_unlock(&c->cs_lock);
216 return 0;
217}
218
219/**
220 * ubifs_bg_thread - UBIFS background thread function.
221 * @info: points to the file-system description object
222 *
223 * This function implements various file-system background activities:
224 * o when a write-buffer timer expires it synchronizes the appropriate
225 * write-buffer;
226 * o when the journal is about to be full, it starts in-advance commit.
227 *
228 * Note, other stuff like background garbage collection may be added here in
229 * future.
230 */
231int ubifs_bg_thread(void *info)
232{
233 int err;
234 struct ubifs_info *c = info;
235
236 ubifs_msg("background thread \"%s\" started, PID %d",
237 c->bgt_name, current->pid);
238 set_freezable();
239
240 while (1) {
241 if (kthread_should_stop())
242 break;
243
244 if (try_to_freeze())
245 continue;
246
247 set_current_state(TASK_INTERRUPTIBLE);
248 /* Check if there is something to do */
249 if (!c->need_bgt) {
250 /*
251 * Nothing prevents us from going sleep now and
252 * be never woken up and block the task which
253 * could wait in 'kthread_stop()' forever.
254 */
255 if (kthread_should_stop())
256 break;
257 schedule();
258 continue;
259 } else
260 __set_current_state(TASK_RUNNING);
261
262 c->need_bgt = 0;
263 err = ubifs_bg_wbufs_sync(c);
264 if (err)
265 ubifs_ro_mode(c, err);
266
267 run_bg_commit(c);
268 cond_resched();
269 }
270
271 dbg_msg("background thread \"%s\" stops", c->bgt_name);
272 return 0;
273}
274
275/**
276 * ubifs_commit_required - set commit state to "required".
277 * @c: UBIFS file-system description object
278 *
279 * This function is called if a commit is required but cannot be done from the
280 * calling function, so it is just flagged instead.
281 */
282void ubifs_commit_required(struct ubifs_info *c)
283{
284 spin_lock(&c->cs_lock);
285 switch (c->cmt_state) {
286 case COMMIT_RESTING:
287 case COMMIT_BACKGROUND:
288 dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state),
289 dbg_cstate(COMMIT_REQUIRED));
290 c->cmt_state = COMMIT_REQUIRED;
291 break;
292 case COMMIT_RUNNING_BACKGROUND:
293 dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state),
294 dbg_cstate(COMMIT_RUNNING_REQUIRED));
295 c->cmt_state = COMMIT_RUNNING_REQUIRED;
296 break;
297 case COMMIT_REQUIRED:
298 case COMMIT_RUNNING_REQUIRED:
299 case COMMIT_BROKEN:
300 break;
301 }
302 spin_unlock(&c->cs_lock);
303}
304
305/**
306 * ubifs_request_bg_commit - notify the background thread to do a commit.
307 * @c: UBIFS file-system description object
308 *
309 * This function is called if the journal is full enough to make a commit
310 * worthwhile, so background thread is kicked to start it.
311 */
312void ubifs_request_bg_commit(struct ubifs_info *c)
313{
314 spin_lock(&c->cs_lock);
315 if (c->cmt_state == COMMIT_RESTING) {
316 dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state),
317 dbg_cstate(COMMIT_BACKGROUND));
318 c->cmt_state = COMMIT_BACKGROUND;
319 spin_unlock(&c->cs_lock);
320 ubifs_wake_up_bgt(c);
321 } else
322 spin_unlock(&c->cs_lock);
323}
324
325/**
326 * wait_for_commit - wait for commit.
327 * @c: UBIFS file-system description object
328 *
329 * This function sleeps until the commit operation is no longer running.
330 */
331static int wait_for_commit(struct ubifs_info *c)
332{
333 dbg_cmt("pid %d goes sleep", current->pid);
334
335 /*
336 * The following sleeps if the condition is false, and will be woken
337 * when the commit ends. It is possible, although very unlikely, that we
338 * will wake up and see the subsequent commit running, rather than the
339 * one we were waiting for, and go back to sleep. However, we will be
340 * woken again, so there is no danger of sleeping forever.
341 */
342 wait_event(c->cmt_wq, c->cmt_state != COMMIT_RUNNING_BACKGROUND &&
343 c->cmt_state != COMMIT_RUNNING_REQUIRED);
344 dbg_cmt("commit finished, pid %d woke up", current->pid);
345 return 0;
346}
347
348/**
349 * ubifs_run_commit - run or wait for commit.
350 * @c: UBIFS file-system description object
351 *
352 * This function runs commit and returns zero in case of success and a negative
353 * error code in case of failure.
354 */
355int ubifs_run_commit(struct ubifs_info *c)
356{
357 int err = 0;
358
359 spin_lock(&c->cs_lock);
360 if (c->cmt_state == COMMIT_BROKEN) {
361 err = -EINVAL;
362 goto out;
363 }
364
365 if (c->cmt_state == COMMIT_RUNNING_BACKGROUND)
366 /*
367 * We set the commit state to 'running required' to indicate
368 * that we want it to complete as quickly as possible.
369 */
370 c->cmt_state = COMMIT_RUNNING_REQUIRED;
371
372 if (c->cmt_state == COMMIT_RUNNING_REQUIRED) {
373 spin_unlock(&c->cs_lock);
374 return wait_for_commit(c);
375 }
376 spin_unlock(&c->cs_lock);
377
378 /* Ok, the commit is indeed needed */
379
380 down_write(&c->commit_sem);
381 spin_lock(&c->cs_lock);
382 /*
383 * Since we unlocked 'c->cs_lock', the state may have changed, so
384 * re-check it.
385 */
386 if (c->cmt_state == COMMIT_BROKEN) {
387 err = -EINVAL;
388 goto out_cmt_unlock;
389 }
390
391 if (c->cmt_state == COMMIT_RUNNING_BACKGROUND)
392 c->cmt_state = COMMIT_RUNNING_REQUIRED;
393
394 if (c->cmt_state == COMMIT_RUNNING_REQUIRED) {
395 up_write(&c->commit_sem);
396 spin_unlock(&c->cs_lock);
397 return wait_for_commit(c);
398 }
399 c->cmt_state = COMMIT_RUNNING_REQUIRED;
400 spin_unlock(&c->cs_lock);
401
402 err = do_commit(c);
403 return err;
404
405out_cmt_unlock:
406 up_write(&c->commit_sem);
407out:
408 spin_unlock(&c->cs_lock);
409 return err;
410}
411
412/**
413 * ubifs_gc_should_commit - determine if it is time for GC to run commit.
414 * @c: UBIFS file-system description object
415 *
416 * This function is called by garbage collection to determine if commit should
417 * be run. If commit state is @COMMIT_BACKGROUND, which means that the journal
418 * is full enough to start commit, this function returns true. It is not
419 * absolutely necessary to commit yet, but it feels like this should be better
420 * then to keep doing GC. This function returns %1 if GC has to initiate commit
421 * and %0 if not.
422 */
423int ubifs_gc_should_commit(struct ubifs_info *c)
424{
425 int ret = 0;
426
427 spin_lock(&c->cs_lock);
428 if (c->cmt_state == COMMIT_BACKGROUND) {
429 dbg_cmt("commit required now");
430 c->cmt_state = COMMIT_REQUIRED;
431 } else
432 dbg_cmt("commit not requested");
433 if (c->cmt_state == COMMIT_REQUIRED)
434 ret = 1;
435 spin_unlock(&c->cs_lock);
436 return ret;
437}
438
439#ifdef CONFIG_UBIFS_FS_DEBUG
440
441/**
442 * struct idx_node - hold index nodes during index tree traversal.
443 * @list: list
444 * @iip: index in parent (slot number of this indexing node in the parent
445 * indexing node)
446 * @upper_key: all keys in this indexing node have to be less or equivalent to
447 * this key
448 * @idx: index node (8-byte aligned because all node structures must be 8-byte
449 * aligned)
450 */
451struct idx_node {
452 struct list_head list;
453 int iip;
454 union ubifs_key upper_key;
455 struct ubifs_idx_node idx __attribute__((aligned(8)));
456};
457
458/**
459 * dbg_old_index_check_init - get information for the next old index check.
460 * @c: UBIFS file-system description object
461 * @zroot: root of the index
462 *
463 * This function records information about the index that will be needed for the
464 * next old index check i.e. 'dbg_check_old_index()'.
465 *
466 * This function returns %0 on success and a negative error code on failure.
467 */
468int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot)
469{
470 struct ubifs_idx_node *idx;
471 int lnum, offs, len, err = 0;
472
473 c->old_zroot = *zroot;
474
475 lnum = c->old_zroot.lnum;
476 offs = c->old_zroot.offs;
477 len = c->old_zroot.len;
478
479 idx = kmalloc(c->max_idx_node_sz, GFP_NOFS);
480 if (!idx)
481 return -ENOMEM;
482
483 err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs);
484 if (err)
485 goto out;
486
487 c->old_zroot_level = le16_to_cpu(idx->level);
488 c->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum);
489out:
490 kfree(idx);
491 return err;
492}
493
494/**
495 * dbg_check_old_index - check the old copy of the index.
496 * @c: UBIFS file-system description object
497 * @zroot: root of the new index
498 *
499 * In order to be able to recover from an unclean unmount, a complete copy of
500 * the index must exist on flash. This is the "old" index. The commit process
501 * must write the "new" index to flash without overwriting or destroying any
502 * part of the old index. This function is run at commit end in order to check
503 * that the old index does indeed exist completely intact.
504 *
505 * This function returns %0 on success and a negative error code on failure.
506 */
507int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
508{
509 int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt;
510 int first = 1, iip;
511 union ubifs_key lower_key, upper_key, l_key, u_key;
512 unsigned long long uninitialized_var(last_sqnum);
513 struct ubifs_idx_node *idx;
514 struct list_head list;
515 struct idx_node *i;
516 size_t sz;
517
518 if (!(ubifs_chk_flags & UBIFS_CHK_OLD_IDX))
519 goto out;
520
521 INIT_LIST_HEAD(&list);
522
523 sz = sizeof(struct idx_node) + ubifs_idx_node_sz(c, c->fanout) -
524 UBIFS_IDX_NODE_SZ;
525
526 /* Start at the old zroot */
527 lnum = c->old_zroot.lnum;
528 offs = c->old_zroot.offs;
529 len = c->old_zroot.len;
530 iip = 0;
531
532 /*
533 * Traverse the index tree preorder depth-first i.e. do a node and then
534 * its subtrees from left to right.
535 */
536 while (1) {
537 struct ubifs_branch *br;
538
539 /* Get the next index node */
540 i = kmalloc(sz, GFP_NOFS);
541 if (!i) {
542 err = -ENOMEM;
543 goto out_free;
544 }
545 i->iip = iip;
546 /* Keep the index nodes on our path in a linked list */
547 list_add_tail(&i->list, &list);
548 /* Read the index node */
549 idx = &i->idx;
550 err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs);
551 if (err)
552 goto out_free;
553 /* Validate index node */
554 child_cnt = le16_to_cpu(idx->child_cnt);
555 if (child_cnt < 1 || child_cnt > c->fanout) {
556 err = 1;
557 goto out_dump;
558 }
559 if (first) {
560 first = 0;
561 /* Check root level and sqnum */
562 if (le16_to_cpu(idx->level) != c->old_zroot_level) {
563 err = 2;
564 goto out_dump;
565 }
566 if (le64_to_cpu(idx->ch.sqnum) != c->old_zroot_sqnum) {
567 err = 3;
568 goto out_dump;
569 }
570 /* Set last values as though root had a parent */
571 last_level = le16_to_cpu(idx->level) + 1;
572 last_sqnum = le64_to_cpu(idx->ch.sqnum) + 1;
573 key_read(c, ubifs_idx_key(c, idx), &lower_key);
574 highest_ino_key(c, &upper_key, INUM_WATERMARK);
575 }
576 key_copy(c, &upper_key, &i->upper_key);
577 if (le16_to_cpu(idx->level) != last_level - 1) {
578 err = 3;
579 goto out_dump;
580 }
581 /*
582 * The index is always written bottom up hence a child's sqnum
583 * is always less than the parents.
584 */
585 if (le64_to_cpu(idx->ch.sqnum) >= last_sqnum) {
586 err = 4;
587 goto out_dump;
588 }
589 /* Check key range */
590 key_read(c, ubifs_idx_key(c, idx), &l_key);
591 br = ubifs_idx_branch(c, idx, child_cnt - 1);
592 key_read(c, &br->key, &u_key);
593 if (keys_cmp(c, &lower_key, &l_key) > 0) {
594 err = 5;
595 goto out_dump;
596 }
597 if (keys_cmp(c, &upper_key, &u_key) < 0) {
598 err = 6;
599 goto out_dump;
600 }
601 if (keys_cmp(c, &upper_key, &u_key) == 0)
602 if (!is_hash_key(c, &u_key)) {
603 err = 7;
604 goto out_dump;
605 }
606 /* Go to next index node */
607 if (le16_to_cpu(idx->level) == 0) {
608 /* At the bottom, so go up until can go right */
609 while (1) {
610 /* Drop the bottom of the list */
611 list_del(&i->list);
612 kfree(i);
613 /* No more list means we are done */
614 if (list_empty(&list))
615 goto out;
616 /* Look at the new bottom */
617 i = list_entry(list.prev, struct idx_node,
618 list);
619 idx = &i->idx;
620 /* Can we go right */
621 if (iip + 1 < le16_to_cpu(idx->child_cnt)) {
622 iip = iip + 1;
623 break;
624 } else
625 /* Nope, so go up again */
626 iip = i->iip;
627 }
628 } else
629 /* Go down left */
630 iip = 0;
631 /*
632 * We have the parent in 'idx' and now we set up for reading the
633 * child pointed to by slot 'iip'.
634 */
635 last_level = le16_to_cpu(idx->level);
636 last_sqnum = le64_to_cpu(idx->ch.sqnum);
637 br = ubifs_idx_branch(c, idx, iip);
638 lnum = le32_to_cpu(br->lnum);
639 offs = le32_to_cpu(br->offs);
640 len = le32_to_cpu(br->len);
641 key_read(c, &br->key, &lower_key);
642 if (iip + 1 < le16_to_cpu(idx->child_cnt)) {
643 br = ubifs_idx_branch(c, idx, iip + 1);
644 key_read(c, &br->key, &upper_key);
645 } else
646 key_copy(c, &i->upper_key, &upper_key);
647 }
648out:
649 err = dbg_old_index_check_init(c, zroot);
650 if (err)
651 goto out_free;
652
653 return 0;
654
655out_dump:
656 dbg_err("dumping index node (iip=%d)", i->iip);
657 dbg_dump_node(c, idx);
658 list_del(&i->list);
659 kfree(i);
660 if (!list_empty(&list)) {
661 i = list_entry(list.prev, struct idx_node, list);
662 dbg_err("dumping parent index node");
663 dbg_dump_node(c, &i->idx);
664 }
665out_free:
666 while (!list_empty(&list)) {
667 i = list_entry(list.next, struct idx_node, list);
668 list_del(&i->list);
669 kfree(i);
670 }
671 ubifs_err("failed, error %d", err);
672 if (err > 0)
673 err = -EINVAL;
674 return err;
675}
676
677#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c
new file mode 100644
index 000000000000..5bb51dac3c16
--- /dev/null
+++ b/fs/ubifs/compress.c
@@ -0,0 +1,253 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 * Copyright (C) 2006, 2007 University of Szeged, Hungary
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License version 2 as published by
9 * the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc., 51
18 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Authors: Adrian Hunter
21 * Artem Bityutskiy (Битюцкий Артём)
22 * Zoltan Sogor
23 */
24
25/*
26 * This file provides a single place to access compression and
27 * decompression.
28 */
29
30#include <linux/crypto.h>
31#include "ubifs.h"
32
33/* Fake description object for the "none" compressor */
34static struct ubifs_compressor none_compr = {
35 .compr_type = UBIFS_COMPR_NONE,
36 .name = "no compression",
37 .capi_name = "",
38};
39
40#ifdef CONFIG_UBIFS_FS_LZO
41static DEFINE_MUTEX(lzo_mutex);
42
43static struct ubifs_compressor lzo_compr = {
44 .compr_type = UBIFS_COMPR_LZO,
45 .comp_mutex = &lzo_mutex,
46 .name = "LZO",
47 .capi_name = "lzo",
48};
49#else
50static struct ubifs_compressor lzo_compr = {
51 .compr_type = UBIFS_COMPR_LZO,
52 .name = "LZO",
53};
54#endif
55
56#ifdef CONFIG_UBIFS_FS_ZLIB
57static DEFINE_MUTEX(deflate_mutex);
58static DEFINE_MUTEX(inflate_mutex);
59
60static struct ubifs_compressor zlib_compr = {
61 .compr_type = UBIFS_COMPR_ZLIB,
62 .comp_mutex = &deflate_mutex,
63 .decomp_mutex = &inflate_mutex,
64 .name = "zlib",
65 .capi_name = "deflate",
66};
67#else
68static struct ubifs_compressor zlib_compr = {
69 .compr_type = UBIFS_COMPR_ZLIB,
70 .name = "zlib",
71};
72#endif
73
74/* All UBIFS compressors */
75struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
76
77/**
78 * ubifs_compress - compress data.
79 * @in_buf: data to compress
80 * @in_len: length of the data to compress
81 * @out_buf: output buffer where compressed data should be stored
82 * @out_len: output buffer length is returned here
83 * @compr_type: type of compression to use on enter, actually used compression
84 * type on exit
85 *
86 * This function compresses input buffer @in_buf of length @in_len and stores
87 * the result in the output buffer @out_buf and the resulting length in
88 * @out_len. If the input buffer does not compress, it is just copied to the
89 * @out_buf. The same happens if @compr_type is %UBIFS_COMPR_NONE or if
90 * compression error occurred.
91 *
92 * Note, if the input buffer was not compressed, it is copied to the output
93 * buffer and %UBIFS_COMPR_NONE is returned in @compr_type.
94 *
95 * This functions returns %0 on success or a negative error code on failure.
96 */
97void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
98 int *compr_type)
99{
100 int err;
101 struct ubifs_compressor *compr = ubifs_compressors[*compr_type];
102
103 if (*compr_type == UBIFS_COMPR_NONE)
104 goto no_compr;
105
106 /* If the input data is small, do not even try to compress it */
107 if (in_len < UBIFS_MIN_COMPR_LEN)
108 goto no_compr;
109
110 if (compr->comp_mutex)
111 mutex_lock(compr->comp_mutex);
112 err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf,
113 out_len);
114 if (compr->comp_mutex)
115 mutex_unlock(compr->comp_mutex);
116 if (unlikely(err)) {
117 ubifs_warn("cannot compress %d bytes, compressor %s, "
118 "error %d, leave data uncompressed",
119 in_len, compr->name, err);
120 goto no_compr;
121 }
122
123 /*
124 * Presently, we just require that compression results in less data,
125 * rather than any defined minimum compression ratio or amount.
126 */
127 if (ALIGN(*out_len, 8) >= ALIGN(in_len, 8))
128 goto no_compr;
129
130 return;
131
132no_compr:
133 memcpy(out_buf, in_buf, in_len);
134 *out_len = in_len;
135 *compr_type = UBIFS_COMPR_NONE;
136}
137
138/**
139 * ubifs_decompress - decompress data.
140 * @in_buf: data to decompress
141 * @in_len: length of the data to decompress
142 * @out_buf: output buffer where decompressed data should
143 * @out_len: output length is returned here
144 * @compr_type: type of compression
145 *
146 * This function decompresses data from buffer @in_buf into buffer @out_buf.
147 * The length of the uncompressed data is returned in @out_len. This functions
148 * returns %0 on success or a negative error code on failure.
149 */
150int ubifs_decompress(const void *in_buf, int in_len, void *out_buf,
151 int *out_len, int compr_type)
152{
153 int err;
154 struct ubifs_compressor *compr;
155
156 if (unlikely(compr_type < 0 || compr_type >= UBIFS_COMPR_TYPES_CNT)) {
157 ubifs_err("invalid compression type %d", compr_type);
158 return -EINVAL;
159 }
160
161 compr = ubifs_compressors[compr_type];
162
163 if (unlikely(!compr->capi_name)) {
164 ubifs_err("%s compression is not compiled in", compr->name);
165 return -EINVAL;
166 }
167
168 if (compr_type == UBIFS_COMPR_NONE) {
169 memcpy(out_buf, in_buf, in_len);
170 *out_len = in_len;
171 return 0;
172 }
173
174 if (compr->decomp_mutex)
175 mutex_lock(compr->decomp_mutex);
176 err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf,
177 out_len);
178 if (compr->decomp_mutex)
179 mutex_unlock(compr->decomp_mutex);
180 if (err)
181 ubifs_err("cannot decompress %d bytes, compressor %s, "
182 "error %d", in_len, compr->name, err);
183
184 return err;
185}
186
187/**
188 * compr_init - initialize a compressor.
189 * @compr: compressor description object
190 *
191 * This function initializes the requested compressor and returns zero in case
192 * of success or a negative error code in case of failure.
193 */
194static int __init compr_init(struct ubifs_compressor *compr)
195{
196 if (compr->capi_name) {
197 compr->cc = crypto_alloc_comp(compr->capi_name, 0, 0);
198 if (IS_ERR(compr->cc)) {
199 ubifs_err("cannot initialize compressor %s, error %ld",
200 compr->name, PTR_ERR(compr->cc));
201 return PTR_ERR(compr->cc);
202 }
203 }
204
205 ubifs_compressors[compr->compr_type] = compr;
206 return 0;
207}
208
209/**
210 * compr_exit - de-initialize a compressor.
211 * @compr: compressor description object
212 */
213static void compr_exit(struct ubifs_compressor *compr)
214{
215 if (compr->capi_name)
216 crypto_free_comp(compr->cc);
217 return;
218}
219
220/**
221 * ubifs_compressors_init - initialize UBIFS compressors.
222 *
223 * This function initializes the compressor which were compiled in. Returns
224 * zero in case of success and a negative error code in case of failure.
225 */
226int __init ubifs_compressors_init(void)
227{
228 int err;
229
230 err = compr_init(&lzo_compr);
231 if (err)
232 return err;
233
234 err = compr_init(&zlib_compr);
235 if (err)
236 goto out_lzo;
237
238 ubifs_compressors[UBIFS_COMPR_NONE] = &none_compr;
239 return 0;
240
241out_lzo:
242 compr_exit(&lzo_compr);
243 return err;
244}
245
246/**
247 * ubifs_compressors_exit - de-initialize UBIFS compressors.
248 */
249void __exit ubifs_compressors_exit(void)
250{
251 compr_exit(&lzo_compr);
252 compr_exit(&zlib_compr);
253}
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
new file mode 100644
index 000000000000..4e3aaeba4eca
--- /dev/null
+++ b/fs/ubifs/debug.c
@@ -0,0 +1,2289 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/*
24 * This file implements most of the debugging stuff which is compiled in only
25 * when it is enabled. But some debugging check functions are implemented in
26 * corresponding subsystem, just because they are closely related and utilize
27 * various local functions of those subsystems.
28 */
29
30#define UBIFS_DBG_PRESERVE_UBI
31
32#include "ubifs.h"
33#include <linux/module.h>
34#include <linux/moduleparam.h>
35
36#ifdef CONFIG_UBIFS_FS_DEBUG
37
38DEFINE_SPINLOCK(dbg_lock);
39
40static char dbg_key_buf0[128];
41static char dbg_key_buf1[128];
42
43unsigned int ubifs_msg_flags = UBIFS_MSG_FLAGS_DEFAULT;
44unsigned int ubifs_chk_flags = UBIFS_CHK_FLAGS_DEFAULT;
45unsigned int ubifs_tst_flags;
46
47module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR);
48module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR);
49module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR);
50
51MODULE_PARM_DESC(debug_msgs, "Debug message type flags");
52MODULE_PARM_DESC(debug_chks, "Debug check flags");
53MODULE_PARM_DESC(debug_tsts, "Debug special test flags");
54
55static const char *get_key_fmt(int fmt)
56{
57 switch (fmt) {
58 case UBIFS_SIMPLE_KEY_FMT:
59 return "simple";
60 default:
61 return "unknown/invalid format";
62 }
63}
64
65static const char *get_key_hash(int hash)
66{
67 switch (hash) {
68 case UBIFS_KEY_HASH_R5:
69 return "R5";
70 case UBIFS_KEY_HASH_TEST:
71 return "test";
72 default:
73 return "unknown/invalid name hash";
74 }
75}
76
77static const char *get_key_type(int type)
78{
79 switch (type) {
80 case UBIFS_INO_KEY:
81 return "inode";
82 case UBIFS_DENT_KEY:
83 return "direntry";
84 case UBIFS_XENT_KEY:
85 return "xentry";
86 case UBIFS_DATA_KEY:
87 return "data";
88 case UBIFS_TRUN_KEY:
89 return "truncate";
90 default:
91 return "unknown/invalid key";
92 }
93}
94
95static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key,
96 char *buffer)
97{
98 char *p = buffer;
99 int type = key_type(c, key);
100
101 if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) {
102 switch (type) {
103 case UBIFS_INO_KEY:
104 sprintf(p, "(%lu, %s)", key_inum(c, key),
105 get_key_type(type));
106 break;
107 case UBIFS_DENT_KEY:
108 case UBIFS_XENT_KEY:
109 sprintf(p, "(%lu, %s, %#08x)", key_inum(c, key),
110 get_key_type(type), key_hash(c, key));
111 break;
112 case UBIFS_DATA_KEY:
113 sprintf(p, "(%lu, %s, %u)", key_inum(c, key),
114 get_key_type(type), key_block(c, key));
115 break;
116 case UBIFS_TRUN_KEY:
117 sprintf(p, "(%lu, %s)",
118 key_inum(c, key), get_key_type(type));
119 break;
120 default:
121 sprintf(p, "(bad key type: %#08x, %#08x)",
122 key->u32[0], key->u32[1]);
123 }
124 } else
125 sprintf(p, "bad key format %d", c->key_fmt);
126}
127
128const char *dbg_key_str0(const struct ubifs_info *c, const union ubifs_key *key)
129{
130 /* dbg_lock must be held */
131 sprintf_key(c, key, dbg_key_buf0);
132 return dbg_key_buf0;
133}
134
135const char *dbg_key_str1(const struct ubifs_info *c, const union ubifs_key *key)
136{
137 /* dbg_lock must be held */
138 sprintf_key(c, key, dbg_key_buf1);
139 return dbg_key_buf1;
140}
141
142const char *dbg_ntype(int type)
143{
144 switch (type) {
145 case UBIFS_PAD_NODE:
146 return "padding node";
147 case UBIFS_SB_NODE:
148 return "superblock node";
149 case UBIFS_MST_NODE:
150 return "master node";
151 case UBIFS_REF_NODE:
152 return "reference node";
153 case UBIFS_INO_NODE:
154 return "inode node";
155 case UBIFS_DENT_NODE:
156 return "direntry node";
157 case UBIFS_XENT_NODE:
158 return "xentry node";
159 case UBIFS_DATA_NODE:
160 return "data node";
161 case UBIFS_TRUN_NODE:
162 return "truncate node";
163 case UBIFS_IDX_NODE:
164 return "indexing node";
165 case UBIFS_CS_NODE:
166 return "commit start node";
167 case UBIFS_ORPH_NODE:
168 return "orphan node";
169 default:
170 return "unknown node";
171 }
172}
173
174static const char *dbg_gtype(int type)
175{
176 switch (type) {
177 case UBIFS_NO_NODE_GROUP:
178 return "no node group";
179 case UBIFS_IN_NODE_GROUP:
180 return "in node group";
181 case UBIFS_LAST_OF_NODE_GROUP:
182 return "last of node group";
183 default:
184 return "unknown";
185 }
186}
187
188const char *dbg_cstate(int cmt_state)
189{
190 switch (cmt_state) {
191 case COMMIT_RESTING:
192 return "commit resting";
193 case COMMIT_BACKGROUND:
194 return "background commit requested";
195 case COMMIT_REQUIRED:
196 return "commit required";
197 case COMMIT_RUNNING_BACKGROUND:
198 return "BACKGROUND commit running";
199 case COMMIT_RUNNING_REQUIRED:
200 return "commit running and required";
201 case COMMIT_BROKEN:
202 return "broken commit";
203 default:
204 return "unknown commit state";
205 }
206}
207
208static void dump_ch(const struct ubifs_ch *ch)
209{
210 printk(KERN_DEBUG "\tmagic %#x\n", le32_to_cpu(ch->magic));
211 printk(KERN_DEBUG "\tcrc %#x\n", le32_to_cpu(ch->crc));
212 printk(KERN_DEBUG "\tnode_type %d (%s)\n", ch->node_type,
213 dbg_ntype(ch->node_type));
214 printk(KERN_DEBUG "\tgroup_type %d (%s)\n", ch->group_type,
215 dbg_gtype(ch->group_type));
216 printk(KERN_DEBUG "\tsqnum %llu\n",
217 (unsigned long long)le64_to_cpu(ch->sqnum));
218 printk(KERN_DEBUG "\tlen %u\n", le32_to_cpu(ch->len));
219}
220
221void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode)
222{
223 const struct ubifs_inode *ui = ubifs_inode(inode);
224
225 printk(KERN_DEBUG "inode %lu\n", inode->i_ino);
226 printk(KERN_DEBUG "size %llu\n",
227 (unsigned long long)i_size_read(inode));
228 printk(KERN_DEBUG "nlink %u\n", inode->i_nlink);
229 printk(KERN_DEBUG "uid %u\n", (unsigned int)inode->i_uid);
230 printk(KERN_DEBUG "gid %u\n", (unsigned int)inode->i_gid);
231 printk(KERN_DEBUG "atime %u.%u\n",
232 (unsigned int)inode->i_atime.tv_sec,
233 (unsigned int)inode->i_atime.tv_nsec);
234 printk(KERN_DEBUG "mtime %u.%u\n",
235 (unsigned int)inode->i_mtime.tv_sec,
236 (unsigned int)inode->i_mtime.tv_nsec);
237 printk(KERN_DEBUG "ctime %u.%u\n",
238 (unsigned int)inode->i_ctime.tv_sec,
239 (unsigned int)inode->i_ctime.tv_nsec);
240 printk(KERN_DEBUG "creat_sqnum %llu\n", ui->creat_sqnum);
241 printk(KERN_DEBUG "xattr_size %u\n", ui->xattr_size);
242 printk(KERN_DEBUG "xattr_cnt %u\n", ui->xattr_cnt);
243 printk(KERN_DEBUG "xattr_names %u\n", ui->xattr_names);
244 printk(KERN_DEBUG "dirty %u\n", ui->dirty);
245 printk(KERN_DEBUG "xattr %u\n", ui->xattr);
246 printk(KERN_DEBUG "flags %d\n", ui->flags);
247 printk(KERN_DEBUG "compr_type %d\n", ui->compr_type);
248 printk(KERN_DEBUG "data_len %d\n", ui->data_len);
249}
250
251void dbg_dump_node(const struct ubifs_info *c, const void *node)
252{
253 int i, n;
254 union ubifs_key key;
255 const struct ubifs_ch *ch = node;
256
257 if (dbg_failure_mode)
258 return;
259
260 /* If the magic is incorrect, just hexdump the first bytes */
261 if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) {
262 printk(KERN_DEBUG "Not a node, first %zu bytes:", UBIFS_CH_SZ);
263 print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1,
264 (void *)node, UBIFS_CH_SZ, 1);
265 return;
266 }
267
268 spin_lock(&dbg_lock);
269 dump_ch(node);
270
271 switch (ch->node_type) {
272 case UBIFS_PAD_NODE:
273 {
274 const struct ubifs_pad_node *pad = node;
275
276 printk(KERN_DEBUG "\tpad_len %u\n",
277 le32_to_cpu(pad->pad_len));
278 break;
279 }
280 case UBIFS_SB_NODE:
281 {
282 const struct ubifs_sb_node *sup = node;
283 unsigned int sup_flags = le32_to_cpu(sup->flags);
284
285 printk(KERN_DEBUG "\tkey_hash %d (%s)\n",
286 (int)sup->key_hash, get_key_hash(sup->key_hash));
287 printk(KERN_DEBUG "\tkey_fmt %d (%s)\n",
288 (int)sup->key_fmt, get_key_fmt(sup->key_fmt));
289 printk(KERN_DEBUG "\tflags %#x\n", sup_flags);
290 printk(KERN_DEBUG "\t big_lpt %u\n",
291 !!(sup_flags & UBIFS_FLG_BIGLPT));
292 printk(KERN_DEBUG "\tmin_io_size %u\n",
293 le32_to_cpu(sup->min_io_size));
294 printk(KERN_DEBUG "\tleb_size %u\n",
295 le32_to_cpu(sup->leb_size));
296 printk(KERN_DEBUG "\tleb_cnt %u\n",
297 le32_to_cpu(sup->leb_cnt));
298 printk(KERN_DEBUG "\tmax_leb_cnt %u\n",
299 le32_to_cpu(sup->max_leb_cnt));
300 printk(KERN_DEBUG "\tmax_bud_bytes %llu\n",
301 (unsigned long long)le64_to_cpu(sup->max_bud_bytes));
302 printk(KERN_DEBUG "\tlog_lebs %u\n",
303 le32_to_cpu(sup->log_lebs));
304 printk(KERN_DEBUG "\tlpt_lebs %u\n",
305 le32_to_cpu(sup->lpt_lebs));
306 printk(KERN_DEBUG "\torph_lebs %u\n",
307 le32_to_cpu(sup->orph_lebs));
308 printk(KERN_DEBUG "\tjhead_cnt %u\n",
309 le32_to_cpu(sup->jhead_cnt));
310 printk(KERN_DEBUG "\tfanout %u\n",
311 le32_to_cpu(sup->fanout));
312 printk(KERN_DEBUG "\tlsave_cnt %u\n",
313 le32_to_cpu(sup->lsave_cnt));
314 printk(KERN_DEBUG "\tdefault_compr %u\n",
315 (int)le16_to_cpu(sup->default_compr));
316 printk(KERN_DEBUG "\trp_size %llu\n",
317 (unsigned long long)le64_to_cpu(sup->rp_size));
318 printk(KERN_DEBUG "\trp_uid %u\n",
319 le32_to_cpu(sup->rp_uid));
320 printk(KERN_DEBUG "\trp_gid %u\n",
321 le32_to_cpu(sup->rp_gid));
322 printk(KERN_DEBUG "\tfmt_version %u\n",
323 le32_to_cpu(sup->fmt_version));
324 printk(KERN_DEBUG "\ttime_gran %u\n",
325 le32_to_cpu(sup->time_gran));
326 printk(KERN_DEBUG "\tUUID %02X%02X%02X%02X-%02X%02X"
327 "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X\n",
328 sup->uuid[0], sup->uuid[1], sup->uuid[2], sup->uuid[3],
329 sup->uuid[4], sup->uuid[5], sup->uuid[6], sup->uuid[7],
330 sup->uuid[8], sup->uuid[9], sup->uuid[10], sup->uuid[11],
331 sup->uuid[12], sup->uuid[13], sup->uuid[14],
332 sup->uuid[15]);
333 break;
334 }
335 case UBIFS_MST_NODE:
336 {
337 const struct ubifs_mst_node *mst = node;
338
339 printk(KERN_DEBUG "\thighest_inum %llu\n",
340 (unsigned long long)le64_to_cpu(mst->highest_inum));
341 printk(KERN_DEBUG "\tcommit number %llu\n",
342 (unsigned long long)le64_to_cpu(mst->cmt_no));
343 printk(KERN_DEBUG "\tflags %#x\n",
344 le32_to_cpu(mst->flags));
345 printk(KERN_DEBUG "\tlog_lnum %u\n",
346 le32_to_cpu(mst->log_lnum));
347 printk(KERN_DEBUG "\troot_lnum %u\n",
348 le32_to_cpu(mst->root_lnum));
349 printk(KERN_DEBUG "\troot_offs %u\n",
350 le32_to_cpu(mst->root_offs));
351 printk(KERN_DEBUG "\troot_len %u\n",
352 le32_to_cpu(mst->root_len));
353 printk(KERN_DEBUG "\tgc_lnum %u\n",
354 le32_to_cpu(mst->gc_lnum));
355 printk(KERN_DEBUG "\tihead_lnum %u\n",
356 le32_to_cpu(mst->ihead_lnum));
357 printk(KERN_DEBUG "\tihead_offs %u\n",
358 le32_to_cpu(mst->ihead_offs));
359 printk(KERN_DEBUG "\tindex_size %u\n",
360 le32_to_cpu(mst->index_size));
361 printk(KERN_DEBUG "\tlpt_lnum %u\n",
362 le32_to_cpu(mst->lpt_lnum));
363 printk(KERN_DEBUG "\tlpt_offs %u\n",
364 le32_to_cpu(mst->lpt_offs));
365 printk(KERN_DEBUG "\tnhead_lnum %u\n",
366 le32_to_cpu(mst->nhead_lnum));
367 printk(KERN_DEBUG "\tnhead_offs %u\n",
368 le32_to_cpu(mst->nhead_offs));
369 printk(KERN_DEBUG "\tltab_lnum %u\n",
370 le32_to_cpu(mst->ltab_lnum));
371 printk(KERN_DEBUG "\tltab_offs %u\n",
372 le32_to_cpu(mst->ltab_offs));
373 printk(KERN_DEBUG "\tlsave_lnum %u\n",
374 le32_to_cpu(mst->lsave_lnum));
375 printk(KERN_DEBUG "\tlsave_offs %u\n",
376 le32_to_cpu(mst->lsave_offs));
377 printk(KERN_DEBUG "\tlscan_lnum %u\n",
378 le32_to_cpu(mst->lscan_lnum));
379 printk(KERN_DEBUG "\tleb_cnt %u\n",
380 le32_to_cpu(mst->leb_cnt));
381 printk(KERN_DEBUG "\tempty_lebs %u\n",
382 le32_to_cpu(mst->empty_lebs));
383 printk(KERN_DEBUG "\tidx_lebs %u\n",
384 le32_to_cpu(mst->idx_lebs));
385 printk(KERN_DEBUG "\ttotal_free %llu\n",
386 (unsigned long long)le64_to_cpu(mst->total_free));
387 printk(KERN_DEBUG "\ttotal_dirty %llu\n",
388 (unsigned long long)le64_to_cpu(mst->total_dirty));
389 printk(KERN_DEBUG "\ttotal_used %llu\n",
390 (unsigned long long)le64_to_cpu(mst->total_used));
391 printk(KERN_DEBUG "\ttotal_dead %llu\n",
392 (unsigned long long)le64_to_cpu(mst->total_dead));
393 printk(KERN_DEBUG "\ttotal_dark %llu\n",
394 (unsigned long long)le64_to_cpu(mst->total_dark));
395 break;
396 }
397 case UBIFS_REF_NODE:
398 {
399 const struct ubifs_ref_node *ref = node;
400
401 printk(KERN_DEBUG "\tlnum %u\n",
402 le32_to_cpu(ref->lnum));
403 printk(KERN_DEBUG "\toffs %u\n",
404 le32_to_cpu(ref->offs));
405 printk(KERN_DEBUG "\tjhead %u\n",
406 le32_to_cpu(ref->jhead));
407 break;
408 }
409 case UBIFS_INO_NODE:
410 {
411 const struct ubifs_ino_node *ino = node;
412
413 key_read(c, &ino->key, &key);
414 printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key));
415 printk(KERN_DEBUG "\tcreat_sqnum %llu\n",
416 (unsigned long long)le64_to_cpu(ino->creat_sqnum));
417 printk(KERN_DEBUG "\tsize %llu\n",
418 (unsigned long long)le64_to_cpu(ino->size));
419 printk(KERN_DEBUG "\tnlink %u\n",
420 le32_to_cpu(ino->nlink));
421 printk(KERN_DEBUG "\tatime %lld.%u\n",
422 (long long)le64_to_cpu(ino->atime_sec),
423 le32_to_cpu(ino->atime_nsec));
424 printk(KERN_DEBUG "\tmtime %lld.%u\n",
425 (long long)le64_to_cpu(ino->mtime_sec),
426 le32_to_cpu(ino->mtime_nsec));
427 printk(KERN_DEBUG "\tctime %lld.%u\n",
428 (long long)le64_to_cpu(ino->ctime_sec),
429 le32_to_cpu(ino->ctime_nsec));
430 printk(KERN_DEBUG "\tuid %u\n",
431 le32_to_cpu(ino->uid));
432 printk(KERN_DEBUG "\tgid %u\n",
433 le32_to_cpu(ino->gid));
434 printk(KERN_DEBUG "\tmode %u\n",
435 le32_to_cpu(ino->mode));
436 printk(KERN_DEBUG "\tflags %#x\n",
437 le32_to_cpu(ino->flags));
438 printk(KERN_DEBUG "\txattr_cnt %u\n",
439 le32_to_cpu(ino->xattr_cnt));
440 printk(KERN_DEBUG "\txattr_size %u\n",
441 le32_to_cpu(ino->xattr_size));
442 printk(KERN_DEBUG "\txattr_names %u\n",
443 le32_to_cpu(ino->xattr_names));
444 printk(KERN_DEBUG "\tcompr_type %#x\n",
445 (int)le16_to_cpu(ino->compr_type));
446 printk(KERN_DEBUG "\tdata len %u\n",
447 le32_to_cpu(ino->data_len));
448 break;
449 }
450 case UBIFS_DENT_NODE:
451 case UBIFS_XENT_NODE:
452 {
453 const struct ubifs_dent_node *dent = node;
454 int nlen = le16_to_cpu(dent->nlen);
455
456 key_read(c, &dent->key, &key);
457 printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key));
458 printk(KERN_DEBUG "\tinum %llu\n",
459 (unsigned long long)le64_to_cpu(dent->inum));
460 printk(KERN_DEBUG "\ttype %d\n", (int)dent->type);
461 printk(KERN_DEBUG "\tnlen %d\n", nlen);
462 printk(KERN_DEBUG "\tname ");
463
464 if (nlen > UBIFS_MAX_NLEN)
465 printk(KERN_DEBUG "(bad name length, not printing, "
466 "bad or corrupted node)");
467 else {
468 for (i = 0; i < nlen && dent->name[i]; i++)
469 printk("%c", dent->name[i]);
470 }
471 printk("\n");
472
473 break;
474 }
475 case UBIFS_DATA_NODE:
476 {
477 const struct ubifs_data_node *dn = node;
478 int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ;
479
480 key_read(c, &dn->key, &key);
481 printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key));
482 printk(KERN_DEBUG "\tsize %u\n",
483 le32_to_cpu(dn->size));
484 printk(KERN_DEBUG "\tcompr_typ %d\n",
485 (int)le16_to_cpu(dn->compr_type));
486 printk(KERN_DEBUG "\tdata size %d\n",
487 dlen);
488 printk(KERN_DEBUG "\tdata:\n");
489 print_hex_dump(KERN_DEBUG, "\t", DUMP_PREFIX_OFFSET, 32, 1,
490 (void *)&dn->data, dlen, 0);
491 break;
492 }
493 case UBIFS_TRUN_NODE:
494 {
495 const struct ubifs_trun_node *trun = node;
496
497 printk(KERN_DEBUG "\tinum %u\n",
498 le32_to_cpu(trun->inum));
499 printk(KERN_DEBUG "\told_size %llu\n",
500 (unsigned long long)le64_to_cpu(trun->old_size));
501 printk(KERN_DEBUG "\tnew_size %llu\n",
502 (unsigned long long)le64_to_cpu(trun->new_size));
503 break;
504 }
505 case UBIFS_IDX_NODE:
506 {
507 const struct ubifs_idx_node *idx = node;
508
509 n = le16_to_cpu(idx->child_cnt);
510 printk(KERN_DEBUG "\tchild_cnt %d\n", n);
511 printk(KERN_DEBUG "\tlevel %d\n",
512 (int)le16_to_cpu(idx->level));
513 printk(KERN_DEBUG "\tBranches:\n");
514
515 for (i = 0; i < n && i < c->fanout - 1; i++) {
516 const struct ubifs_branch *br;
517
518 br = ubifs_idx_branch(c, idx, i);
519 key_read(c, &br->key, &key);
520 printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n",
521 i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs),
522 le32_to_cpu(br->len), DBGKEY(&key));
523 }
524 break;
525 }
526 case UBIFS_CS_NODE:
527 break;
528 case UBIFS_ORPH_NODE:
529 {
530 const struct ubifs_orph_node *orph = node;
531
532 printk(KERN_DEBUG "\tcommit number %llu\n",
533 (unsigned long long)
534 le64_to_cpu(orph->cmt_no) & LLONG_MAX);
535 printk(KERN_DEBUG "\tlast node flag %llu\n",
536 (unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63);
537 n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3;
538 printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n);
539 for (i = 0; i < n; i++)
540 printk(KERN_DEBUG "\t ino %llu\n",
541 le64_to_cpu(orph->inos[i]));
542 break;
543 }
544 default:
545 printk(KERN_DEBUG "node type %d was not recognized\n",
546 (int)ch->node_type);
547 }
548 spin_unlock(&dbg_lock);
549}
550
551void dbg_dump_budget_req(const struct ubifs_budget_req *req)
552{
553 spin_lock(&dbg_lock);
554 printk(KERN_DEBUG "Budgeting request: new_ino %d, dirtied_ino %d\n",
555 req->new_ino, req->dirtied_ino);
556 printk(KERN_DEBUG "\tnew_ino_d %d, dirtied_ino_d %d\n",
557 req->new_ino_d, req->dirtied_ino_d);
558 printk(KERN_DEBUG "\tnew_page %d, dirtied_page %d\n",
559 req->new_page, req->dirtied_page);
560 printk(KERN_DEBUG "\tnew_dent %d, mod_dent %d\n",
561 req->new_dent, req->mod_dent);
562 printk(KERN_DEBUG "\tidx_growth %d\n", req->idx_growth);
563 printk(KERN_DEBUG "\tdata_growth %d dd_growth %d\n",
564 req->data_growth, req->dd_growth);
565 spin_unlock(&dbg_lock);
566}
567
568void dbg_dump_lstats(const struct ubifs_lp_stats *lst)
569{
570 spin_lock(&dbg_lock);
571 printk(KERN_DEBUG "Lprops statistics: empty_lebs %d, idx_lebs %d\n",
572 lst->empty_lebs, lst->idx_lebs);
573 printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, "
574 "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free,
575 lst->total_dirty);
576 printk(KERN_DEBUG "\ttotal_used %lld, total_dark %lld, "
577 "total_dead %lld\n", lst->total_used, lst->total_dark,
578 lst->total_dead);
579 spin_unlock(&dbg_lock);
580}
581
582void dbg_dump_budg(struct ubifs_info *c)
583{
584 int i;
585 struct rb_node *rb;
586 struct ubifs_bud *bud;
587 struct ubifs_gced_idx_leb *idx_gc;
588
589 spin_lock(&dbg_lock);
590 printk(KERN_DEBUG "Budgeting info: budg_data_growth %lld, "
591 "budg_dd_growth %lld, budg_idx_growth %lld\n",
592 c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth);
593 printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, "
594 "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth,
595 c->budg_data_growth + c->budg_dd_growth + c->budg_idx_growth,
596 c->freeable_cnt);
597 printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %lld, "
598 "calc_idx_sz %lld, idx_gc_cnt %d\n", c->min_idx_lebs,
599 c->old_idx_sz, c->calc_idx_sz, c->idx_gc_cnt);
600 printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, "
601 "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt),
602 atomic_long_read(&c->dirty_zn_cnt),
603 atomic_long_read(&c->clean_zn_cnt));
604 printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
605 c->dark_wm, c->dead_wm, c->max_idx_node_sz);
606 printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n",
607 c->gc_lnum, c->ihead_lnum);
608 for (i = 0; i < c->jhead_cnt; i++)
609 printk(KERN_DEBUG "\tjhead %d\t LEB %d\n",
610 c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum);
611 for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) {
612 bud = rb_entry(rb, struct ubifs_bud, rb);
613 printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum);
614 }
615 list_for_each_entry(bud, &c->old_buds, list)
616 printk(KERN_DEBUG "\told bud LEB %d\n", bud->lnum);
617 list_for_each_entry(idx_gc, &c->idx_gc, list)
618 printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n",
619 idx_gc->lnum, idx_gc->unmap);
620 printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state);
621 spin_unlock(&dbg_lock);
622}
623
624void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
625{
626 printk(KERN_DEBUG "LEB %d lprops: free %d, dirty %d (used %d), "
627 "flags %#x\n", lp->lnum, lp->free, lp->dirty,
628 c->leb_size - lp->free - lp->dirty, lp->flags);
629}
630
631void dbg_dump_lprops(struct ubifs_info *c)
632{
633 int lnum, err;
634 struct ubifs_lprops lp;
635 struct ubifs_lp_stats lst;
636
637 printk(KERN_DEBUG "Dumping LEB properties\n");
638 ubifs_get_lp_stats(c, &lst);
639 dbg_dump_lstats(&lst);
640
641 for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) {
642 err = ubifs_read_one_lp(c, lnum, &lp);
643 if (err)
644 ubifs_err("cannot read lprops for LEB %d", lnum);
645
646 dbg_dump_lprop(c, &lp);
647 }
648}
649
650void dbg_dump_leb(const struct ubifs_info *c, int lnum)
651{
652 struct ubifs_scan_leb *sleb;
653 struct ubifs_scan_node *snod;
654
655 if (dbg_failure_mode)
656 return;
657
658 printk(KERN_DEBUG "Dumping LEB %d\n", lnum);
659
660 sleb = ubifs_scan(c, lnum, 0, c->dbg_buf);
661 if (IS_ERR(sleb)) {
662 ubifs_err("scan error %d", (int)PTR_ERR(sleb));
663 return;
664 }
665
666 printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum,
667 sleb->nodes_cnt, sleb->endpt);
668
669 list_for_each_entry(snod, &sleb->nodes, list) {
670 cond_resched();
671 printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", lnum,
672 snod->offs, snod->len);
673 dbg_dump_node(c, snod->node);
674 }
675
676 ubifs_scan_destroy(sleb);
677 return;
678}
679
680void dbg_dump_znode(const struct ubifs_info *c,
681 const struct ubifs_znode *znode)
682{
683 int n;
684 const struct ubifs_zbranch *zbr;
685
686 spin_lock(&dbg_lock);
687 if (znode->parent)
688 zbr = &znode->parent->zbranch[znode->iip];
689 else
690 zbr = &c->zroot;
691
692 printk(KERN_DEBUG "znode %p, LEB %d:%d len %d parent %p iip %d level %d"
693 " child_cnt %d flags %lx\n", znode, zbr->lnum, zbr->offs,
694 zbr->len, znode->parent, znode->iip, znode->level,
695 znode->child_cnt, znode->flags);
696
697 if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) {
698 spin_unlock(&dbg_lock);
699 return;
700 }
701
702 printk(KERN_DEBUG "zbranches:\n");
703 for (n = 0; n < znode->child_cnt; n++) {
704 zbr = &znode->zbranch[n];
705 if (znode->level > 0)
706 printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key "
707 "%s\n", n, zbr->znode, zbr->lnum,
708 zbr->offs, zbr->len,
709 DBGKEY(&zbr->key));
710 else
711 printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key "
712 "%s\n", n, zbr->znode, zbr->lnum,
713 zbr->offs, zbr->len,
714 DBGKEY(&zbr->key));
715 }
716 spin_unlock(&dbg_lock);
717}
718
719void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
720{
721 int i;
722
723 printk(KERN_DEBUG "Dumping heap cat %d (%d elements)\n",
724 cat, heap->cnt);
725 for (i = 0; i < heap->cnt; i++) {
726 struct ubifs_lprops *lprops = heap->arr[i];
727
728 printk(KERN_DEBUG "\t%d. LEB %d hpos %d free %d dirty %d "
729 "flags %d\n", i, lprops->lnum, lprops->hpos,
730 lprops->free, lprops->dirty, lprops->flags);
731 }
732}
733
734void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
735 struct ubifs_nnode *parent, int iip)
736{
737 int i;
738
739 printk(KERN_DEBUG "Dumping pnode:\n");
740 printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n",
741 (size_t)pnode, (size_t)parent, (size_t)pnode->cnext);
742 printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n",
743 pnode->flags, iip, pnode->level, pnode->num);
744 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
745 struct ubifs_lprops *lp = &pnode->lprops[i];
746
747 printk(KERN_DEBUG "\t%d: free %d dirty %d flags %d lnum %d\n",
748 i, lp->free, lp->dirty, lp->flags, lp->lnum);
749 }
750}
751
752void dbg_dump_tnc(struct ubifs_info *c)
753{
754 struct ubifs_znode *znode;
755 int level;
756
757 printk(KERN_DEBUG "\n");
758 printk(KERN_DEBUG "Dumping the TNC tree\n");
759 znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
760 level = znode->level;
761 printk(KERN_DEBUG "== Level %d ==\n", level);
762 while (znode) {
763 if (level != znode->level) {
764 level = znode->level;
765 printk(KERN_DEBUG "== Level %d ==\n", level);
766 }
767 dbg_dump_znode(c, znode);
768 znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);
769 }
770
771 printk(KERN_DEBUG "\n");
772}
773
/*
 * Index walker callback which dumps @znode. @priv is not used. Always returns
 * zero.
 */
static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode,
		      void *priv)
{
	const struct ubifs_znode *zn = znode;

	dbg_dump_znode(c, zn);

	return 0;
}
780
781/**
782 * dbg_dump_index - dump the on-flash index.
783 * @c: UBIFS file-system description object
784 *
785 * This function dumps whole UBIFS indexing B-tree, unlike 'dbg_dump_tnc()'
786 * which dumps only in-memory znodes and does not read znodes which from flash.
787 */
788void dbg_dump_index(struct ubifs_info *c)
789{
790 dbg_walk_index(c, NULL, dump_znode, NULL);
791}
792
793/**
794 * dbg_check_synced_i_size - check synchronized inode size.
795 * @inode: inode to check
796 *
797 * If inode is clean, synchronized inode size has to be equivalent to current
798 * inode size. This function has to be called only for locked inodes (@i_mutex
799 * has to be locked). Returns %0 if synchronized inode size if correct, and
800 * %-EINVAL if not.
801 */
802int dbg_check_synced_i_size(struct inode *inode)
803{
804 int err = 0;
805 struct ubifs_inode *ui = ubifs_inode(inode);
806
807 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
808 return 0;
809 if (!S_ISREG(inode->i_mode))
810 return 0;
811
812 mutex_lock(&ui->ui_mutex);
813 spin_lock(&ui->ui_lock);
814 if (ui->ui_size != ui->synced_i_size && !ui->dirty) {
815 ubifs_err("ui_size is %lld, synced_i_size is %lld, but inode "
816 "is clean", ui->ui_size, ui->synced_i_size);
817 ubifs_err("i_ino %lu, i_mode %#x, i_size %lld", inode->i_ino,
818 inode->i_mode, i_size_read(inode));
819 dbg_dump_stack();
820 err = -EINVAL;
821 }
822 spin_unlock(&ui->ui_lock);
823 mutex_unlock(&ui->ui_mutex);
824 return err;
825}
826
827/*
828 * dbg_check_dir - check directory inode size and link count.
829 * @c: UBIFS file-system description object
830 * @dir: the directory to calculate size for
831 * @size: the result is returned here
832 *
833 * This function makes sure that directory size and link count are correct.
834 * Returns zero in case of success and a negative error code in case of
835 * failure.
836 *
837 * Note, it is good idea to make sure the @dir->i_mutex is locked before
838 * calling this function.
839 */
840int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir)
841{
842 unsigned int nlink = 2;
843 union ubifs_key key;
844 struct ubifs_dent_node *dent, *pdent = NULL;
845 struct qstr nm = { .name = NULL };
846 loff_t size = UBIFS_INO_NODE_SZ;
847
848 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
849 return 0;
850
851 if (!S_ISDIR(dir->i_mode))
852 return 0;
853
854 lowest_dent_key(c, &key, dir->i_ino);
855 while (1) {
856 int err;
857
858 dent = ubifs_tnc_next_ent(c, &key, &nm);
859 if (IS_ERR(dent)) {
860 err = PTR_ERR(dent);
861 if (err == -ENOENT)
862 break;
863 return err;
864 }
865
866 nm.name = dent->name;
867 nm.len = le16_to_cpu(dent->nlen);
868 size += CALC_DENT_SIZE(nm.len);
869 if (dent->type == UBIFS_ITYPE_DIR)
870 nlink += 1;
871 kfree(pdent);
872 pdent = dent;
873 key_read(c, &dent->key, &key);
874 }
875 kfree(pdent);
876
877 if (i_size_read(dir) != size) {
878 ubifs_err("directory inode %lu has size %llu, "
879 "but calculated size is %llu", dir->i_ino,
880 (unsigned long long)i_size_read(dir),
881 (unsigned long long)size);
882 dump_stack();
883 return -EINVAL;
884 }
885 if (dir->i_nlink != nlink) {
886 ubifs_err("directory inode %lu has nlink %u, but calculated "
887 "nlink is %u", dir->i_ino, dir->i_nlink, nlink);
888 dump_stack();
889 return -EINVAL;
890 }
891
892 return 0;
893}
894
895/**
896 * dbg_check_key_order - make sure that colliding keys are properly ordered.
897 * @c: UBIFS file-system description object
898 * @zbr1: first zbranch
899 * @zbr2: following zbranch
900 *
901 * In UBIFS indexing B-tree colliding keys has to be sorted in binary order of
902 * names of the direntries/xentries which are referred by the keys. This
903 * function reads direntries/xentries referred by @zbr1 and @zbr2 and makes
904 * sure the name of direntry/xentry referred by @zbr1 is less than
905 * direntry/xentry referred by @zbr2. Returns zero if this is true, %1 if not,
906 * and a negative error code in case of failure.
907 */
908static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
909 struct ubifs_zbranch *zbr2)
910{
911 int err, nlen1, nlen2, cmp;
912 struct ubifs_dent_node *dent1, *dent2;
913 union ubifs_key key;
914
915 ubifs_assert(!keys_cmp(c, &zbr1->key, &zbr2->key));
916 dent1 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
917 if (!dent1)
918 return -ENOMEM;
919 dent2 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
920 if (!dent2) {
921 err = -ENOMEM;
922 goto out_free;
923 }
924
925 err = ubifs_tnc_read_node(c, zbr1, dent1);
926 if (err)
927 goto out_free;
928 err = ubifs_validate_entry(c, dent1);
929 if (err)
930 goto out_free;
931
932 err = ubifs_tnc_read_node(c, zbr2, dent2);
933 if (err)
934 goto out_free;
935 err = ubifs_validate_entry(c, dent2);
936 if (err)
937 goto out_free;
938
939 /* Make sure node keys are the same as in zbranch */
940 err = 1;
941 key_read(c, &dent1->key, &key);
942 if (keys_cmp(c, &zbr1->key, &key)) {
943 dbg_err("1st entry at %d:%d has key %s", zbr1->lnum,
944 zbr1->offs, DBGKEY(&key));
945 dbg_err("but it should have key %s according to tnc",
946 DBGKEY(&zbr1->key));
947 dbg_dump_node(c, dent1);
948 goto out_free;
949 }
950
951 key_read(c, &dent2->key, &key);
952 if (keys_cmp(c, &zbr2->key, &key)) {
953 dbg_err("2nd entry at %d:%d has key %s", zbr1->lnum,
954 zbr1->offs, DBGKEY(&key));
955 dbg_err("but it should have key %s according to tnc",
956 DBGKEY(&zbr2->key));
957 dbg_dump_node(c, dent2);
958 goto out_free;
959 }
960
961 nlen1 = le16_to_cpu(dent1->nlen);
962 nlen2 = le16_to_cpu(dent2->nlen);
963
964 cmp = memcmp(dent1->name, dent2->name, min_t(int, nlen1, nlen2));
965 if (cmp < 0 || (cmp == 0 && nlen1 < nlen2)) {
966 err = 0;
967 goto out_free;
968 }
969 if (cmp == 0 && nlen1 == nlen2)
970 dbg_err("2 xent/dent nodes with the same name");
971 else
972 dbg_err("bad order of colliding key %s",
973 DBGKEY(&key));
974
975 dbg_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
976 dbg_dump_node(c, dent1);
977 dbg_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs);
978 dbg_dump_node(c, dent2);
979
980out_free:
981 kfree(dent2);
982 kfree(dent1);
983 return err;
984}
985
986/**
987 * dbg_check_znode - check if znode is all right.
988 * @c: UBIFS file-system description object
989 * @zbr: zbranch which points to this znode
990 *
991 * This function makes sure that znode referred to by @zbr is all right.
992 * Returns zero if it is, and %-EINVAL if it is not.
993 */
994static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr)
995{
996 struct ubifs_znode *znode = zbr->znode;
997 struct ubifs_znode *zp = znode->parent;
998 int n, err, cmp;
999
1000 if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) {
1001 err = 1;
1002 goto out;
1003 }
1004 if (znode->level < 0) {
1005 err = 2;
1006 goto out;
1007 }
1008 if (znode->iip < 0 || znode->iip >= c->fanout) {
1009 err = 3;
1010 goto out;
1011 }
1012
1013 if (zbr->len == 0)
1014 /* Only dirty zbranch may have no on-flash nodes */
1015 if (!ubifs_zn_dirty(znode)) {
1016 err = 4;
1017 goto out;
1018 }
1019
1020 if (ubifs_zn_dirty(znode)) {
1021 /*
1022 * If znode is dirty, its parent has to be dirty as well. The
1023 * order of the operation is important, so we have to have
1024 * memory barriers.
1025 */
1026 smp_mb();
1027 if (zp && !ubifs_zn_dirty(zp)) {
1028 /*
1029 * The dirty flag is atomic and is cleared outside the
1030 * TNC mutex, so znode's dirty flag may now have
1031 * been cleared. The child is always cleared before the
1032 * parent, so we just need to check again.
1033 */
1034 smp_mb();
1035 if (ubifs_zn_dirty(znode)) {
1036 err = 5;
1037 goto out;
1038 }
1039 }
1040 }
1041
1042 if (zp) {
1043 const union ubifs_key *min, *max;
1044
1045 if (znode->level != zp->level - 1) {
1046 err = 6;
1047 goto out;
1048 }
1049
1050 /* Make sure the 'parent' pointer in our znode is correct */
1051 err = ubifs_search_zbranch(c, zp, &zbr->key, &n);
1052 if (!err) {
1053 /* This zbranch does not exist in the parent */
1054 err = 7;
1055 goto out;
1056 }
1057
1058 if (znode->iip >= zp->child_cnt) {
1059 err = 8;
1060 goto out;
1061 }
1062
1063 if (znode->iip != n) {
1064 /* This may happen only in case of collisions */
1065 if (keys_cmp(c, &zp->zbranch[n].key,
1066 &zp->zbranch[znode->iip].key)) {
1067 err = 9;
1068 goto out;
1069 }
1070 n = znode->iip;
1071 }
1072
1073 /*
1074 * Make sure that the first key in our znode is greater than or
1075 * equal to the key in the pointing zbranch.
1076 */
1077 min = &zbr->key;
1078 cmp = keys_cmp(c, min, &znode->zbranch[0].key);
1079 if (cmp == 1) {
1080 err = 10;
1081 goto out;
1082 }
1083
1084 if (n + 1 < zp->child_cnt) {
1085 max = &zp->zbranch[n + 1].key;
1086
1087 /*
1088 * Make sure the last key in our znode is less or
1089 * equivalent than the the key in zbranch which goes
1090 * after our pointing zbranch.
1091 */
1092 cmp = keys_cmp(c, max,
1093 &znode->zbranch[znode->child_cnt - 1].key);
1094 if (cmp == -1) {
1095 err = 11;
1096 goto out;
1097 }
1098 }
1099 } else {
1100 /* This may only be root znode */
1101 if (zbr != &c->zroot) {
1102 err = 12;
1103 goto out;
1104 }
1105 }
1106
1107 /*
1108 * Make sure that next key is greater or equivalent then the previous
1109 * one.
1110 */
1111 for (n = 1; n < znode->child_cnt; n++) {
1112 cmp = keys_cmp(c, &znode->zbranch[n - 1].key,
1113 &znode->zbranch[n].key);
1114 if (cmp > 0) {
1115 err = 13;
1116 goto out;
1117 }
1118 if (cmp == 0) {
1119 /* This can only be keys with colliding hash */
1120 if (!is_hash_key(c, &znode->zbranch[n].key)) {
1121 err = 14;
1122 goto out;
1123 }
1124
1125 if (znode->level != 0 || c->replaying)
1126 continue;
1127
1128 /*
1129 * Colliding keys should follow binary order of
1130 * corresponding xentry/dentry names.
1131 */
1132 err = dbg_check_key_order(c, &znode->zbranch[n - 1],
1133 &znode->zbranch[n]);
1134 if (err < 0)
1135 return err;
1136 if (err) {
1137 err = 15;
1138 goto out;
1139 }
1140 }
1141 }
1142
1143 for (n = 0; n < znode->child_cnt; n++) {
1144 if (!znode->zbranch[n].znode &&
1145 (znode->zbranch[n].lnum == 0 ||
1146 znode->zbranch[n].len == 0)) {
1147 err = 16;
1148 goto out;
1149 }
1150
1151 if (znode->zbranch[n].lnum != 0 &&
1152 znode->zbranch[n].len == 0) {
1153 err = 17;
1154 goto out;
1155 }
1156
1157 if (znode->zbranch[n].lnum == 0 &&
1158 znode->zbranch[n].len != 0) {
1159 err = 18;
1160 goto out;
1161 }
1162
1163 if (znode->zbranch[n].lnum == 0 &&
1164 znode->zbranch[n].offs != 0) {
1165 err = 19;
1166 goto out;
1167 }
1168
1169 if (znode->level != 0 && znode->zbranch[n].znode)
1170 if (znode->zbranch[n].znode->parent != znode) {
1171 err = 20;
1172 goto out;
1173 }
1174 }
1175
1176 return 0;
1177
1178out:
1179 ubifs_err("failed, error %d", err);
1180 ubifs_msg("dump of the znode");
1181 dbg_dump_znode(c, znode);
1182 if (zp) {
1183 ubifs_msg("dump of the parent znode");
1184 dbg_dump_znode(c, zp);
1185 }
1186 dump_stack();
1187 return -EINVAL;
1188}
1189
1190/**
1191 * dbg_check_tnc - check TNC tree.
1192 * @c: UBIFS file-system description object
1193 * @extra: do extra checks that are possible at start commit
1194 *
1195 * This function traverses whole TNC tree and checks every znode. Returns zero
1196 * if everything is all right and %-EINVAL if something is wrong with TNC.
1197 */
1198int dbg_check_tnc(struct ubifs_info *c, int extra)
1199{
1200 struct ubifs_znode *znode;
1201 long clean_cnt = 0, dirty_cnt = 0;
1202 int err, last;
1203
1204 if (!(ubifs_chk_flags & UBIFS_CHK_TNC))
1205 return 0;
1206
1207 ubifs_assert(mutex_is_locked(&c->tnc_mutex));
1208 if (!c->zroot.znode)
1209 return 0;
1210
1211 znode = ubifs_tnc_postorder_first(c->zroot.znode);
1212 while (1) {
1213 struct ubifs_znode *prev;
1214 struct ubifs_zbranch *zbr;
1215
1216 if (!znode->parent)
1217 zbr = &c->zroot;
1218 else
1219 zbr = &znode->parent->zbranch[znode->iip];
1220
1221 err = dbg_check_znode(c, zbr);
1222 if (err)
1223 return err;
1224
1225 if (extra) {
1226 if (ubifs_zn_dirty(znode))
1227 dirty_cnt += 1;
1228 else
1229 clean_cnt += 1;
1230 }
1231
1232 prev = znode;
1233 znode = ubifs_tnc_postorder_next(znode);
1234 if (!znode)
1235 break;
1236
1237 /*
1238 * If the last key of this znode is equivalent to the first key
1239 * of the next znode (collision), then check order of the keys.
1240 */
1241 last = prev->child_cnt - 1;
1242 if (prev->level == 0 && znode->level == 0 && !c->replaying &&
1243 !keys_cmp(c, &prev->zbranch[last].key,
1244 &znode->zbranch[0].key)) {
1245 err = dbg_check_key_order(c, &prev->zbranch[last],
1246 &znode->zbranch[0]);
1247 if (err < 0)
1248 return err;
1249 if (err) {
1250 ubifs_msg("first znode");
1251 dbg_dump_znode(c, prev);
1252 ubifs_msg("second znode");
1253 dbg_dump_znode(c, znode);
1254 return -EINVAL;
1255 }
1256 }
1257 }
1258
1259 if (extra) {
1260 if (clean_cnt != atomic_long_read(&c->clean_zn_cnt)) {
1261 ubifs_err("incorrect clean_zn_cnt %ld, calculated %ld",
1262 atomic_long_read(&c->clean_zn_cnt),
1263 clean_cnt);
1264 return -EINVAL;
1265 }
1266 if (dirty_cnt != atomic_long_read(&c->dirty_zn_cnt)) {
1267 ubifs_err("incorrect dirty_zn_cnt %ld, calculated %ld",
1268 atomic_long_read(&c->dirty_zn_cnt),
1269 dirty_cnt);
1270 return -EINVAL;
1271 }
1272 }
1273
1274 return 0;
1275}
1276
1277/**
1278 * dbg_walk_index - walk the on-flash index.
1279 * @c: UBIFS file-system description object
1280 * @leaf_cb: called for each leaf node
1281 * @znode_cb: called for each indexing node
1282 * @priv: private date which is passed to callbacks
1283 *
1284 * This function walks the UBIFS index and calls the @leaf_cb for each leaf
1285 * node and @znode_cb for each indexing node. Returns zero in case of success
1286 * and a negative error code in case of failure.
1287 *
1288 * It would be better if this function removed every znode it pulled to into
1289 * the TNC, so that the behavior more closely matched the non-debugging
1290 * behavior.
1291 */
1292int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
1293 dbg_znode_callback znode_cb, void *priv)
1294{
1295 int err;
1296 struct ubifs_zbranch *zbr;
1297 struct ubifs_znode *znode, *child;
1298
1299 mutex_lock(&c->tnc_mutex);
1300 /* If the root indexing node is not in TNC - pull it */
1301 if (!c->zroot.znode) {
1302 c->zroot.znode = ubifs_load_znode(c, &c->zroot, NULL, 0);
1303 if (IS_ERR(c->zroot.znode)) {
1304 err = PTR_ERR(c->zroot.znode);
1305 c->zroot.znode = NULL;
1306 goto out_unlock;
1307 }
1308 }
1309
1310 /*
1311 * We are going to traverse the indexing tree in the postorder manner.
1312 * Go down and find the leftmost indexing node where we are going to
1313 * start from.
1314 */
1315 znode = c->zroot.znode;
1316 while (znode->level > 0) {
1317 zbr = &znode->zbranch[0];
1318 child = zbr->znode;
1319 if (!child) {
1320 child = ubifs_load_znode(c, zbr, znode, 0);
1321 if (IS_ERR(child)) {
1322 err = PTR_ERR(child);
1323 goto out_unlock;
1324 }
1325 zbr->znode = child;
1326 }
1327
1328 znode = child;
1329 }
1330
1331 /* Iterate over all indexing nodes */
1332 while (1) {
1333 int idx;
1334
1335 cond_resched();
1336
1337 if (znode_cb) {
1338 err = znode_cb(c, znode, priv);
1339 if (err) {
1340 ubifs_err("znode checking function returned "
1341 "error %d", err);
1342 dbg_dump_znode(c, znode);
1343 goto out_dump;
1344 }
1345 }
1346 if (leaf_cb && znode->level == 0) {
1347 for (idx = 0; idx < znode->child_cnt; idx++) {
1348 zbr = &znode->zbranch[idx];
1349 err = leaf_cb(c, zbr, priv);
1350 if (err) {
1351 ubifs_err("leaf checking function "
1352 "returned error %d, for leaf "
1353 "at LEB %d:%d",
1354 err, zbr->lnum, zbr->offs);
1355 goto out_dump;
1356 }
1357 }
1358 }
1359
1360 if (!znode->parent)
1361 break;
1362
1363 idx = znode->iip + 1;
1364 znode = znode->parent;
1365 if (idx < znode->child_cnt) {
1366 /* Switch to the next index in the parent */
1367 zbr = &znode->zbranch[idx];
1368 child = zbr->znode;
1369 if (!child) {
1370 child = ubifs_load_znode(c, zbr, znode, idx);
1371 if (IS_ERR(child)) {
1372 err = PTR_ERR(child);
1373 goto out_unlock;
1374 }
1375 zbr->znode = child;
1376 }
1377 znode = child;
1378 } else
1379 /*
1380 * This is the last child, switch to the parent and
1381 * continue.
1382 */
1383 continue;
1384
1385 /* Go to the lowest leftmost znode in the new sub-tree */
1386 while (znode->level > 0) {
1387 zbr = &znode->zbranch[0];
1388 child = zbr->znode;
1389 if (!child) {
1390 child = ubifs_load_znode(c, zbr, znode, 0);
1391 if (IS_ERR(child)) {
1392 err = PTR_ERR(child);
1393 goto out_unlock;
1394 }
1395 zbr->znode = child;
1396 }
1397 znode = child;
1398 }
1399 }
1400
1401 mutex_unlock(&c->tnc_mutex);
1402 return 0;
1403
1404out_dump:
1405 if (znode->parent)
1406 zbr = &znode->parent->zbranch[znode->iip];
1407 else
1408 zbr = &c->zroot;
1409 ubifs_msg("dump of znode at LEB %d:%d", zbr->lnum, zbr->offs);
1410 dbg_dump_znode(c, znode);
1411out_unlock:
1412 mutex_unlock(&c->tnc_mutex);
1413 return err;
1414}
1415
1416/**
1417 * add_size - add znode size to partially calculated index size.
1418 * @c: UBIFS file-system description object
1419 * @znode: znode to add size for
1420 * @priv: partially calculated index size
1421 *
1422 * This is a helper function for 'dbg_check_idx_size()' which is called for
1423 * every indexing node and adds its size to the 'long long' variable pointed to
1424 * by @priv.
1425 */
1426static int add_size(struct ubifs_info *c, struct ubifs_znode *znode, void *priv)
1427{
1428 long long *idx_size = priv;
1429 int add;
1430
1431 add = ubifs_idx_node_sz(c, znode->child_cnt);
1432 add = ALIGN(add, 8);
1433 *idx_size += add;
1434 return 0;
1435}
1436
1437/**
1438 * dbg_check_idx_size - check index size.
1439 * @c: UBIFS file-system description object
1440 * @idx_size: size to check
1441 *
1442 * This function walks the UBIFS index, calculates its size and checks that the
1443 * size is equivalent to @idx_size. Returns zero in case of success and a
1444 * negative error code in case of failure.
1445 */
1446int dbg_check_idx_size(struct ubifs_info *c, long long idx_size)
1447{
1448 int err;
1449 long long calc = 0;
1450
1451 if (!(ubifs_chk_flags & UBIFS_CHK_IDX_SZ))
1452 return 0;
1453
1454 err = dbg_walk_index(c, NULL, add_size, &calc);
1455 if (err) {
1456 ubifs_err("error %d while walking the index", err);
1457 return err;
1458 }
1459
1460 if (calc != idx_size) {
1461 ubifs_err("index size check failed: calculated size is %lld, "
1462 "should be %lld", calc, idx_size);
1463 dump_stack();
1464 return -EINVAL;
1465 }
1466
1467 return 0;
1468}
1469
1470/**
1471 * struct fsck_inode - information about an inode used when checking the file-system.
1472 * @rb: link in the RB-tree of inodes
1473 * @inum: inode number
1474 * @mode: inode type, permissions, etc
1475 * @nlink: inode link count
1476 * @xattr_cnt: count of extended attributes
1477 * @references: how many directory/xattr entries refer this inode (calculated
1478 * while walking the index)
1479 * @calc_cnt: for directory inode count of child directories
1480 * @size: inode size (read from on-flash inode)
1481 * @xattr_sz: summary size of all extended attributes (read from on-flash
1482 * inode)
1483 * @calc_sz: for directories calculated directory size
1484 * @calc_xcnt: count of extended attributes
1485 * @calc_xsz: calculated summary size of all extended attributes
1486 * @xattr_nms: sum of lengths of all extended attribute names belonging to this
1487 * inode (read from on-flash inode)
1488 * @calc_xnms: calculated sum of lengths of all extended attribute names
1489 */
1490struct fsck_inode {
1491 struct rb_node rb;
1492 ino_t inum;
1493 umode_t mode;
1494 unsigned int nlink;
1495 unsigned int xattr_cnt;
1496 int references;
1497 int calc_cnt;
1498 long long size;
1499 unsigned int xattr_sz;
1500 long long calc_sz;
1501 long long calc_xcnt;
1502 long long calc_xsz;
1503 unsigned int xattr_nms;
1504 long long calc_xnms;
1505};
1506
1507/**
1508 * struct fsck_data - private FS checking information.
1509 * @inodes: RB-tree of all inodes (contains @struct fsck_inode objects)
1510 */
1511struct fsck_data {
1512 struct rb_root inodes;
1513};
1514
1515/**
1516 * add_inode - add inode information to RB-tree of inodes.
1517 * @c: UBIFS file-system description object
1518 * @fsckd: FS checking information
1519 * @ino: raw UBIFS inode to add
1520 *
1521 * This is a helper function for 'check_leaf()' which adds information about
1522 * inode @ino to the RB-tree of inodes. Returns inode information pointer in
1523 * case of success and a negative error code in case of failure.
1524 */
1525static struct fsck_inode *add_inode(struct ubifs_info *c,
1526 struct fsck_data *fsckd,
1527 struct ubifs_ino_node *ino)
1528{
1529 struct rb_node **p, *parent = NULL;
1530 struct fsck_inode *fscki;
1531 ino_t inum = key_inum_flash(c, &ino->key);
1532
1533 p = &fsckd->inodes.rb_node;
1534 while (*p) {
1535 parent = *p;
1536 fscki = rb_entry(parent, struct fsck_inode, rb);
1537 if (inum < fscki->inum)
1538 p = &(*p)->rb_left;
1539 else if (inum > fscki->inum)
1540 p = &(*p)->rb_right;
1541 else
1542 return fscki;
1543 }
1544
1545 if (inum > c->highest_inum) {
1546 ubifs_err("too high inode number, max. is %lu",
1547 c->highest_inum);
1548 return ERR_PTR(-EINVAL);
1549 }
1550
1551 fscki = kzalloc(sizeof(struct fsck_inode), GFP_NOFS);
1552 if (!fscki)
1553 return ERR_PTR(-ENOMEM);
1554
1555 fscki->inum = inum;
1556 fscki->nlink = le32_to_cpu(ino->nlink);
1557 fscki->size = le64_to_cpu(ino->size);
1558 fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt);
1559 fscki->xattr_sz = le32_to_cpu(ino->xattr_size);
1560 fscki->xattr_nms = le32_to_cpu(ino->xattr_names);
1561 fscki->mode = le32_to_cpu(ino->mode);
1562 if (S_ISDIR(fscki->mode)) {
1563 fscki->calc_sz = UBIFS_INO_NODE_SZ;
1564 fscki->calc_cnt = 2;
1565 }
1566 rb_link_node(&fscki->rb, parent, p);
1567 rb_insert_color(&fscki->rb, &fsckd->inodes);
1568 return fscki;
1569}
1570
1571/**
1572 * search_inode - search inode in the RB-tree of inodes.
1573 * @fsckd: FS checking information
1574 * @inum: inode number to search
1575 *
1576 * This is a helper function for 'check_leaf()' which searches inode @inum in
1577 * the RB-tree of inodes and returns an inode information pointer or %NULL if
1578 * the inode was not found.
1579 */
1580static struct fsck_inode *search_inode(struct fsck_data *fsckd, ino_t inum)
1581{
1582 struct rb_node *p;
1583 struct fsck_inode *fscki;
1584
1585 p = fsckd->inodes.rb_node;
1586 while (p) {
1587 fscki = rb_entry(p, struct fsck_inode, rb);
1588 if (inum < fscki->inum)
1589 p = p->rb_left;
1590 else if (inum > fscki->inum)
1591 p = p->rb_right;
1592 else
1593 return fscki;
1594 }
1595 return NULL;
1596}
1597
1598/**
1599 * read_add_inode - read inode node and add it to RB-tree of inodes.
1600 * @c: UBIFS file-system description object
1601 * @fsckd: FS checking information
1602 * @inum: inode number to read
1603 *
1604 * This is a helper function for 'check_leaf()' which finds inode node @inum in
1605 * the index, reads it, and adds it to the RB-tree of inodes. Returns inode
1606 * information pointer in case of success and a negative error code in case of
1607 * failure.
1608 */
1609static struct fsck_inode *read_add_inode(struct ubifs_info *c,
1610 struct fsck_data *fsckd, ino_t inum)
1611{
1612 int n, err;
1613 union ubifs_key key;
1614 struct ubifs_znode *znode;
1615 struct ubifs_zbranch *zbr;
1616 struct ubifs_ino_node *ino;
1617 struct fsck_inode *fscki;
1618
1619 fscki = search_inode(fsckd, inum);
1620 if (fscki)
1621 return fscki;
1622
1623 ino_key_init(c, &key, inum);
1624 err = ubifs_lookup_level0(c, &key, &znode, &n);
1625 if (!err) {
1626 ubifs_err("inode %lu not found in index", inum);
1627 return ERR_PTR(-ENOENT);
1628 } else if (err < 0) {
1629 ubifs_err("error %d while looking up inode %lu", err, inum);
1630 return ERR_PTR(err);
1631 }
1632
1633 zbr = &znode->zbranch[n];
1634 if (zbr->len < UBIFS_INO_NODE_SZ) {
1635 ubifs_err("bad node %lu node length %d", inum, zbr->len);
1636 return ERR_PTR(-EINVAL);
1637 }
1638
1639 ino = kmalloc(zbr->len, GFP_NOFS);
1640 if (!ino)
1641 return ERR_PTR(-ENOMEM);
1642
1643 err = ubifs_tnc_read_node(c, zbr, ino);
1644 if (err) {
1645 ubifs_err("cannot read inode node at LEB %d:%d, error %d",
1646 zbr->lnum, zbr->offs, err);
1647 kfree(ino);
1648 return ERR_PTR(err);
1649 }
1650
1651 fscki = add_inode(c, fsckd, ino);
1652 kfree(ino);
1653 if (IS_ERR(fscki)) {
1654 ubifs_err("error %ld while adding inode %lu node",
1655 PTR_ERR(fscki), inum);
1656 return fscki;
1657 }
1658
1659 return fscki;
1660}
1661
1662/**
1663 * check_leaf - check leaf node.
1664 * @c: UBIFS file-system description object
1665 * @zbr: zbranch of the leaf node to check
1666 * @priv: FS checking information
1667 *
1668 * This is a helper function for 'dbg_check_filesystem()' which is called for
1669 * every single leaf node while walking the indexing tree. It checks that the
1670 * leaf node referred from the indexing tree exists, has correct CRC, and does
1671 * some other basic validation. This function is also responsible for building
1672 * an RB-tree of inodes - it adds all inodes into the RB-tree. It also
1673 * calculates reference count, size, etc for each inode in order to later
1674 * compare them to the information stored inside the inodes and detect possible
1675 * inconsistencies. Returns zero in case of success and a negative error code
1676 * in case of failure.
1677 */
1678static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
1679 void *priv)
1680{
1681 ino_t inum;
1682 void *node;
1683 struct ubifs_ch *ch;
1684 int err, type = key_type(c, &zbr->key);
1685 struct fsck_inode *fscki;
1686
1687 if (zbr->len < UBIFS_CH_SZ) {
1688 ubifs_err("bad leaf length %d (LEB %d:%d)",
1689 zbr->len, zbr->lnum, zbr->offs);
1690 return -EINVAL;
1691 }
1692
1693 node = kmalloc(zbr->len, GFP_NOFS);
1694 if (!node)
1695 return -ENOMEM;
1696
1697 err = ubifs_tnc_read_node(c, zbr, node);
1698 if (err) {
1699 ubifs_err("cannot read leaf node at LEB %d:%d, error %d",
1700 zbr->lnum, zbr->offs, err);
1701 goto out_free;
1702 }
1703
1704 /* If this is an inode node, add it to RB-tree of inodes */
1705 if (type == UBIFS_INO_KEY) {
1706 fscki = add_inode(c, priv, node);
1707 if (IS_ERR(fscki)) {
1708 err = PTR_ERR(fscki);
1709 ubifs_err("error %d while adding inode node", err);
1710 goto out_dump;
1711 }
1712 goto out;
1713 }
1714
1715 if (type != UBIFS_DENT_KEY && type != UBIFS_XENT_KEY &&
1716 type != UBIFS_DATA_KEY) {
1717 ubifs_err("unexpected node type %d at LEB %d:%d",
1718 type, zbr->lnum, zbr->offs);
1719 err = -EINVAL;
1720 goto out_free;
1721 }
1722
1723 ch = node;
1724 if (le64_to_cpu(ch->sqnum) > c->max_sqnum) {
1725 ubifs_err("too high sequence number, max. is %llu",
1726 c->max_sqnum);
1727 err = -EINVAL;
1728 goto out_dump;
1729 }
1730
1731 if (type == UBIFS_DATA_KEY) {
1732 long long blk_offs;
1733 struct ubifs_data_node *dn = node;
1734
1735 /*
1736 * Search the inode node this data node belongs to and insert
1737 * it to the RB-tree of inodes.
1738 */
1739 inum = key_inum_flash(c, &dn->key);
1740 fscki = read_add_inode(c, priv, inum);
1741 if (IS_ERR(fscki)) {
1742 err = PTR_ERR(fscki);
1743 ubifs_err("error %d while processing data node and "
1744 "trying to find inode node %lu", err, inum);
1745 goto out_dump;
1746 }
1747
1748 /* Make sure the data node is within inode size */
1749 blk_offs = key_block_flash(c, &dn->key);
1750 blk_offs <<= UBIFS_BLOCK_SHIFT;
1751 blk_offs += le32_to_cpu(dn->size);
1752 if (blk_offs > fscki->size) {
1753 ubifs_err("data node at LEB %d:%d is not within inode "
1754 "size %lld", zbr->lnum, zbr->offs,
1755 fscki->size);
1756 err = -EINVAL;
1757 goto out_dump;
1758 }
1759 } else {
1760 int nlen;
1761 struct ubifs_dent_node *dent = node;
1762 struct fsck_inode *fscki1;
1763
1764 err = ubifs_validate_entry(c, dent);
1765 if (err)
1766 goto out_dump;
1767
1768 /*
1769 * Search the inode node this entry refers to and the parent
1770 * inode node and insert them to the RB-tree of inodes.
1771 */
1772 inum = le64_to_cpu(dent->inum);
1773 fscki = read_add_inode(c, priv, inum);
1774 if (IS_ERR(fscki)) {
1775 err = PTR_ERR(fscki);
1776 ubifs_err("error %d while processing entry node and "
1777 "trying to find inode node %lu", err, inum);
1778 goto out_dump;
1779 }
1780
1781 /* Count how many direntries or xentries refers this inode */
1782 fscki->references += 1;
1783
1784 inum = key_inum_flash(c, &dent->key);
1785 fscki1 = read_add_inode(c, priv, inum);
1786 if (IS_ERR(fscki1)) {
1787 err = PTR_ERR(fscki);
1788 ubifs_err("error %d while processing entry node and "
1789 "trying to find parent inode node %lu",
1790 err, inum);
1791 goto out_dump;
1792 }
1793
1794 nlen = le16_to_cpu(dent->nlen);
1795 if (type == UBIFS_XENT_KEY) {
1796 fscki1->calc_xcnt += 1;
1797 fscki1->calc_xsz += CALC_DENT_SIZE(nlen);
1798 fscki1->calc_xsz += CALC_XATTR_BYTES(fscki->size);
1799 fscki1->calc_xnms += nlen;
1800 } else {
1801 fscki1->calc_sz += CALC_DENT_SIZE(nlen);
1802 if (dent->type == UBIFS_ITYPE_DIR)
1803 fscki1->calc_cnt += 1;
1804 }
1805 }
1806
1807out:
1808 kfree(node);
1809 return 0;
1810
1811out_dump:
1812 ubifs_msg("dump of node at LEB %d:%d", zbr->lnum, zbr->offs);
1813 dbg_dump_node(c, node);
1814out_free:
1815 kfree(node);
1816 return err;
1817}
1818
1819/**
1820 * free_inodes - free RB-tree of inodes.
1821 * @fsckd: FS checking information
1822 */
1823static void free_inodes(struct fsck_data *fsckd)
1824{
1825 struct rb_node *this = fsckd->inodes.rb_node;
1826 struct fsck_inode *fscki;
1827
1828 while (this) {
1829 if (this->rb_left)
1830 this = this->rb_left;
1831 else if (this->rb_right)
1832 this = this->rb_right;
1833 else {
1834 fscki = rb_entry(this, struct fsck_inode, rb);
1835 this = rb_parent(this);
1836 if (this) {
1837 if (this->rb_left == &fscki->rb)
1838 this->rb_left = NULL;
1839 else
1840 this->rb_right = NULL;
1841 }
1842 kfree(fscki);
1843 }
1844 }
1845}
1846
1847/**
1848 * check_inodes - checks all inodes.
1849 * @c: UBIFS file-system description object
1850 * @fsckd: FS checking information
1851 *
1852 * This is a helper function for 'dbg_check_filesystem()' which walks the
1853 * RB-tree of inodes after the index scan has been finished, and checks that
1854 * inode nlink, size, etc are correct. Returns zero if inodes are fine,
1855 * %-EINVAL if not, and a negative error code in case of failure.
1856 */
1857static int check_inodes(struct ubifs_info *c, struct fsck_data *fsckd)
1858{
1859 int n, err;
1860 union ubifs_key key;
1861 struct ubifs_znode *znode;
1862 struct ubifs_zbranch *zbr;
1863 struct ubifs_ino_node *ino;
1864 struct fsck_inode *fscki;
1865 struct rb_node *this = rb_first(&fsckd->inodes);
1866
1867 while (this) {
1868 fscki = rb_entry(this, struct fsck_inode, rb);
1869 this = rb_next(this);
1870
1871 if (S_ISDIR(fscki->mode)) {
1872 /*
1873 * Directories have to have exactly one reference (they
1874 * cannot have hardlinks), although root inode is an
1875 * exception.
1876 */
1877 if (fscki->inum != UBIFS_ROOT_INO &&
1878 fscki->references != 1) {
1879 ubifs_err("directory inode %lu has %d "
1880 "direntries which refer it, but "
1881 "should be 1", fscki->inum,
1882 fscki->references);
1883 goto out_dump;
1884 }
1885 if (fscki->inum == UBIFS_ROOT_INO &&
1886 fscki->references != 0) {
1887 ubifs_err("root inode %lu has non-zero (%d) "
1888 "direntries which refer it",
1889 fscki->inum, fscki->references);
1890 goto out_dump;
1891 }
1892 if (fscki->calc_sz != fscki->size) {
1893 ubifs_err("directory inode %lu size is %lld, "
1894 "but calculated size is %lld",
1895 fscki->inum, fscki->size,
1896 fscki->calc_sz);
1897 goto out_dump;
1898 }
1899 if (fscki->calc_cnt != fscki->nlink) {
1900 ubifs_err("directory inode %lu nlink is %d, "
1901 "but calculated nlink is %d",
1902 fscki->inum, fscki->nlink,
1903 fscki->calc_cnt);
1904 goto out_dump;
1905 }
1906 } else {
1907 if (fscki->references != fscki->nlink) {
1908 ubifs_err("inode %lu nlink is %d, but "
1909 "calculated nlink is %d", fscki->inum,
1910 fscki->nlink, fscki->references);
1911 goto out_dump;
1912 }
1913 }
1914 if (fscki->xattr_sz != fscki->calc_xsz) {
1915 ubifs_err("inode %lu has xattr size %u, but "
1916 "calculated size is %lld",
1917 fscki->inum, fscki->xattr_sz,
1918 fscki->calc_xsz);
1919 goto out_dump;
1920 }
1921 if (fscki->xattr_cnt != fscki->calc_xcnt) {
1922 ubifs_err("inode %lu has %u xattrs, but "
1923 "calculated count is %lld", fscki->inum,
1924 fscki->xattr_cnt, fscki->calc_xcnt);
1925 goto out_dump;
1926 }
1927 if (fscki->xattr_nms != fscki->calc_xnms) {
1928 ubifs_err("inode %lu has xattr names' size %u, but "
1929 "calculated names' size is %lld",
1930 fscki->inum, fscki->xattr_nms,
1931 fscki->calc_xnms);
1932 goto out_dump;
1933 }
1934 }
1935
1936 return 0;
1937
1938out_dump:
1939 /* Read the bad inode and dump it */
1940 ino_key_init(c, &key, fscki->inum);
1941 err = ubifs_lookup_level0(c, &key, &znode, &n);
1942 if (!err) {
1943 ubifs_err("inode %lu not found in index", fscki->inum);
1944 return -ENOENT;
1945 } else if (err < 0) {
1946 ubifs_err("error %d while looking up inode %lu",
1947 err, fscki->inum);
1948 return err;
1949 }
1950
1951 zbr = &znode->zbranch[n];
1952 ino = kmalloc(zbr->len, GFP_NOFS);
1953 if (!ino)
1954 return -ENOMEM;
1955
1956 err = ubifs_tnc_read_node(c, zbr, ino);
1957 if (err) {
1958 ubifs_err("cannot read inode node at LEB %d:%d, error %d",
1959 zbr->lnum, zbr->offs, err);
1960 kfree(ino);
1961 return err;
1962 }
1963
1964 ubifs_msg("dump of the inode %lu sitting in LEB %d:%d",
1965 fscki->inum, zbr->lnum, zbr->offs);
1966 dbg_dump_node(c, ino);
1967 kfree(ino);
1968 return -EINVAL;
1969}
1970
1971/**
1972 * dbg_check_filesystem - check the file-system.
1973 * @c: UBIFS file-system description object
1974 *
1975 * This function checks the file system, namely:
1976 * o makes sure that all leaf nodes exist and their CRCs are correct;
1977 * o makes sure inode nlink, size, xattr size/count are correct (for all
1978 * inodes).
1979 *
1980 * The function reads whole indexing tree and all nodes, so it is pretty
1981 * heavy-weight. Returns zero if the file-system is consistent, %-EINVAL if
1982 * not, and a negative error code in case of failure.
1983 */
1984int dbg_check_filesystem(struct ubifs_info *c)
1985{
1986 int err;
1987 struct fsck_data fsckd;
1988
1989 if (!(ubifs_chk_flags & UBIFS_CHK_FS))
1990 return 0;
1991
1992 fsckd.inodes = RB_ROOT;
1993 err = dbg_walk_index(c, check_leaf, NULL, &fsckd);
1994 if (err)
1995 goto out_free;
1996
1997 err = check_inodes(c, &fsckd);
1998 if (err)
1999 goto out_free;
2000
2001 free_inodes(&fsckd);
2002 return 0;
2003
2004out_free:
2005 ubifs_err("file-system check failed with error %d", err);
2006 dump_stack();
2007 free_inodes(&fsckd);
2008 return err;
2009}
2010
2011static int invocation_cnt;
2012
2013int dbg_force_in_the_gaps(void)
2014{
2015 if (!dbg_force_in_the_gaps_enabled)
2016 return 0;
2017 /* Force in-the-gaps every 8th commit */
2018 return !((invocation_cnt++) & 0x7);
2019}
2020
2021/* Failure mode for recovery testing */
2022
2023#define chance(n, d) (simple_rand() <= (n) * 32768LL / (d))
2024
2025struct failure_mode_info {
2026 struct list_head list;
2027 struct ubifs_info *c;
2028};
2029
2030static LIST_HEAD(fmi_list);
2031static DEFINE_SPINLOCK(fmi_lock);
2032
2033static unsigned int next;
2034
2035static int simple_rand(void)
2036{
2037 if (next == 0)
2038 next = current->pid;
2039 next = next * 1103515245 + 12345;
2040 return (next >> 16) & 32767;
2041}
2042
2043void dbg_failure_mode_registration(struct ubifs_info *c)
2044{
2045 struct failure_mode_info *fmi;
2046
2047 fmi = kmalloc(sizeof(struct failure_mode_info), GFP_NOFS);
2048 if (!fmi) {
2049 dbg_err("Failed to register failure mode - no memory");
2050 return;
2051 }
2052 fmi->c = c;
2053 spin_lock(&fmi_lock);
2054 list_add_tail(&fmi->list, &fmi_list);
2055 spin_unlock(&fmi_lock);
2056}
2057
2058void dbg_failure_mode_deregistration(struct ubifs_info *c)
2059{
2060 struct failure_mode_info *fmi, *tmp;
2061
2062 spin_lock(&fmi_lock);
2063 list_for_each_entry_safe(fmi, tmp, &fmi_list, list)
2064 if (fmi->c == c) {
2065 list_del(&fmi->list);
2066 kfree(fmi);
2067 }
2068 spin_unlock(&fmi_lock);
2069}
2070
2071static struct ubifs_info *dbg_find_info(struct ubi_volume_desc *desc)
2072{
2073 struct failure_mode_info *fmi;
2074
2075 spin_lock(&fmi_lock);
2076 list_for_each_entry(fmi, &fmi_list, list)
2077 if (fmi->c->ubi == desc) {
2078 struct ubifs_info *c = fmi->c;
2079
2080 spin_unlock(&fmi_lock);
2081 return c;
2082 }
2083 spin_unlock(&fmi_lock);
2084 return NULL;
2085}
2086
2087static int in_failure_mode(struct ubi_volume_desc *desc)
2088{
2089 struct ubifs_info *c = dbg_find_info(desc);
2090
2091 if (c && dbg_failure_mode)
2092 return c->failure_mode;
2093 return 0;
2094}
2095
2096static int do_fail(struct ubi_volume_desc *desc, int lnum, int write)
2097{
2098 struct ubifs_info *c = dbg_find_info(desc);
2099
2100 if (!c || !dbg_failure_mode)
2101 return 0;
2102 if (c->failure_mode)
2103 return 1;
2104 if (!c->fail_cnt) {
2105 /* First call - decide delay to failure */
2106 if (chance(1, 2)) {
2107 unsigned int delay = 1 << (simple_rand() >> 11);
2108
2109 if (chance(1, 2)) {
2110 c->fail_delay = 1;
2111 c->fail_timeout = jiffies +
2112 msecs_to_jiffies(delay);
2113 dbg_rcvry("failing after %ums", delay);
2114 } else {
2115 c->fail_delay = 2;
2116 c->fail_cnt_max = delay;
2117 dbg_rcvry("failing after %u calls", delay);
2118 }
2119 }
2120 c->fail_cnt += 1;
2121 }
2122 /* Determine if failure delay has expired */
2123 if (c->fail_delay == 1) {
2124 if (time_before(jiffies, c->fail_timeout))
2125 return 0;
2126 } else if (c->fail_delay == 2)
2127 if (c->fail_cnt++ < c->fail_cnt_max)
2128 return 0;
2129 if (lnum == UBIFS_SB_LNUM) {
2130 if (write) {
2131 if (chance(1, 2))
2132 return 0;
2133 } else if (chance(19, 20))
2134 return 0;
2135 dbg_rcvry("failing in super block LEB %d", lnum);
2136 } else if (lnum == UBIFS_MST_LNUM || lnum == UBIFS_MST_LNUM + 1) {
2137 if (chance(19, 20))
2138 return 0;
2139 dbg_rcvry("failing in master LEB %d", lnum);
2140 } else if (lnum >= UBIFS_LOG_LNUM && lnum <= c->log_last) {
2141 if (write) {
2142 if (chance(99, 100))
2143 return 0;
2144 } else if (chance(399, 400))
2145 return 0;
2146 dbg_rcvry("failing in log LEB %d", lnum);
2147 } else if (lnum >= c->lpt_first && lnum <= c->lpt_last) {
2148 if (write) {
2149 if (chance(7, 8))
2150 return 0;
2151 } else if (chance(19, 20))
2152 return 0;
2153 dbg_rcvry("failing in LPT LEB %d", lnum);
2154 } else if (lnum >= c->orph_first && lnum <= c->orph_last) {
2155 if (write) {
2156 if (chance(1, 2))
2157 return 0;
2158 } else if (chance(9, 10))
2159 return 0;
2160 dbg_rcvry("failing in orphan LEB %d", lnum);
2161 } else if (lnum == c->ihead_lnum) {
2162 if (chance(99, 100))
2163 return 0;
2164 dbg_rcvry("failing in index head LEB %d", lnum);
2165 } else if (c->jheads && lnum == c->jheads[GCHD].wbuf.lnum) {
2166 if (chance(9, 10))
2167 return 0;
2168 dbg_rcvry("failing in GC head LEB %d", lnum);
2169 } else if (write && !RB_EMPTY_ROOT(&c->buds) &&
2170 !ubifs_search_bud(c, lnum)) {
2171 if (chance(19, 20))
2172 return 0;
2173 dbg_rcvry("failing in non-bud LEB %d", lnum);
2174 } else if (c->cmt_state == COMMIT_RUNNING_BACKGROUND ||
2175 c->cmt_state == COMMIT_RUNNING_REQUIRED) {
2176 if (chance(999, 1000))
2177 return 0;
2178 dbg_rcvry("failing in bud LEB %d commit running", lnum);
2179 } else {
2180 if (chance(9999, 10000))
2181 return 0;
2182 dbg_rcvry("failing in bud LEB %d commit not running", lnum);
2183 }
2184 ubifs_err("*** SETTING FAILURE MODE ON (LEB %d) ***", lnum);
2185 c->failure_mode = 1;
2186 dump_stack();
2187 return 1;
2188}
2189
/**
 * cut_data - overwrite the tail of a buffer with 0xff bytes.
 * @buf: the buffer (it is modified although it is passed as 'const')
 * @len: length of the buffer
 *
 * This function picks a random offset within @buf and sets all the bytes from
 * that offset to the end of the buffer to 0xff.
 */
static void cut_data(const void *buf, int len)
{
	unsigned char *bytes = (void *)buf;
	int pos;

	/* 'simple_rand()' is below 32768, so the offset is below @len */
	pos = (len * (long long)simple_rand()) >> 15;
	while (pos < len)
		bytes[pos++] = 0xff;
}
2199
2200int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
2201 int len, int check)
2202{
2203 if (in_failure_mode(desc))
2204 return -EIO;
2205 return ubi_leb_read(desc, lnum, buf, offset, len, check);
2206}
2207
2208int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
2209 int offset, int len, int dtype)
2210{
2211 int err;
2212
2213 if (in_failure_mode(desc))
2214 return -EIO;
2215 if (do_fail(desc, lnum, 1))
2216 cut_data(buf, len);
2217 err = ubi_leb_write(desc, lnum, buf, offset, len, dtype);
2218 if (err)
2219 return err;
2220 if (in_failure_mode(desc))
2221 return -EIO;
2222 return 0;
2223}
2224
2225int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf,
2226 int len, int dtype)
2227{
2228 int err;
2229
2230 if (do_fail(desc, lnum, 1))
2231 return -EIO;
2232 err = ubi_leb_change(desc, lnum, buf, len, dtype);
2233 if (err)
2234 return err;
2235 if (do_fail(desc, lnum, 1))
2236 return -EIO;
2237 return 0;
2238}
2239
2240int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum)
2241{
2242 int err;
2243
2244 if (do_fail(desc, lnum, 0))
2245 return -EIO;
2246 err = ubi_leb_erase(desc, lnum);
2247 if (err)
2248 return err;
2249 if (do_fail(desc, lnum, 0))
2250 return -EIO;
2251 return 0;
2252}
2253
2254int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum)
2255{
2256 int err;
2257
2258 if (do_fail(desc, lnum, 0))
2259 return -EIO;
2260 err = ubi_leb_unmap(desc, lnum);
2261 if (err)
2262 return err;
2263 if (do_fail(desc, lnum, 0))
2264 return -EIO;
2265 return 0;
2266}
2267
2268int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum)
2269{
2270 if (in_failure_mode(desc))
2271 return -EIO;
2272 return ubi_is_mapped(desc, lnum);
2273}
2274
2275int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype)
2276{
2277 int err;
2278
2279 if (do_fail(desc, lnum, 0))
2280 return -EIO;
2281 err = ubi_leb_map(desc, lnum, dtype);
2282 if (err)
2283 return err;
2284 if (do_fail(desc, lnum, 0))
2285 return -EIO;
2286 return 0;
2287}
2288
2289#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
new file mode 100644
index 000000000000..3c4f1e93c9e0
--- /dev/null
+++ b/fs/ubifs/debug.h
@@ -0,0 +1,403 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23#ifndef __UBIFS_DEBUG_H__
24#define __UBIFS_DEBUG_H__
25
26#ifdef CONFIG_UBIFS_FS_DEBUG
27
28#define UBIFS_DBG(op) op
29
/*
 * ubifs_assert - check an assumption in debugging builds.
 *
 * If @expr is false, a KERN_CRIT message naming the function, line and pid is
 * printed and 'dbg_dump_stack()' is invoked. Execution continues afterwards.
 */
#define ubifs_assert(expr) do { \
	if (unlikely(!(expr))) { \
		printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \
		       __func__, __LINE__, current->pid); \
		dbg_dump_stack(); \
	} \
} while (0)

/*
 * ubifs_assert_cmt_locked - complain if the commit semaphore is not held.
 *
 * If 'down_write_trylock()' succeeds, nobody was holding @c->commit_sem; the
 * semaphore is released again and the failure is reported.
 */
#define ubifs_assert_cmt_locked(c) do { \
	if (unlikely(down_write_trylock(&(c)->commit_sem))) { \
		up_write(&(c)->commit_sem); \
		printk(KERN_CRIT "commit lock is not locked!\n"); \
		ubifs_assert(0); \
	} \
} while (0)

/* Dump the stack, except when the recovery-testing failure mode is enabled */
#define dbg_dump_stack() do { \
	if (!dbg_failure_mode) \
		dump_stack(); \
} while (0)

/* Generic debugging messages */
#define dbg_msg(fmt, ...) do { \
	spin_lock(&dbg_lock); \
	printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \
	       __func__, ##__VA_ARGS__); \
	spin_unlock(&dbg_lock); \
} while (0)

/*
 * Print a debugging message only if message type @typ is enabled in
 * @ubifs_msg_flags. @typ is parenthesized so that a compound expression (e.g.
 * an OR of several flags) is tested as a whole.
 */
#define dbg_do_msg(typ, fmt, ...) do { \
	if (ubifs_msg_flags & (typ)) \
		dbg_msg(fmt, ##__VA_ARGS__); \
} while (0)

/* Print an error message via 'ubifs_err()' with @dbg_lock held */
#define dbg_err(fmt, ...) do { \
	spin_lock(&dbg_lock); \
	ubifs_err(fmt, ##__VA_ARGS__); \
	spin_unlock(&dbg_lock); \
} while (0)
69
70const char *dbg_key_str0(const struct ubifs_info *c,
71 const union ubifs_key *key);
72const char *dbg_key_str1(const struct ubifs_info *c,
73 const union ubifs_key *key);
74
75/*
76 * DBGKEY macros require dbg_lock to be held, which it is in the dbg message
77 * macros.
78 */
79#define DBGKEY(key) dbg_key_str0(c, (key))
80#define DBGKEY1(key) dbg_key_str1(c, (key))
81
82/* General messages */
83#define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__)
84
85/* Additional journal messages */
86#define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__)
87
88/* Additional TNC messages */
89#define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__)
90
91/* Additional lprops messages */
92#define dbg_lp(fmt, ...) dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__)
93
94/* Additional LEB find messages */
95#define dbg_find(fmt, ...) dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__)
96
97/* Additional mount messages */
98#define dbg_mnt(fmt, ...) dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__)
99
100/* Additional I/O messages */
101#define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__)
102
103/* Additional commit messages */
104#define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__)
105
106/* Additional budgeting messages */
107#define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__)
108
109/* Additional log messages */
110#define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__)
111
112/* Additional gc messages */
113#define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__)
114
115/* Additional scan messages */
116#define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__)
117
118/* Additional recovery messages */
119#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__)
120
121/*
122 * Debugging message type flags (must match msg_type_names in debug.c).
123 *
124 * UBIFS_MSG_GEN: general messages
125 * UBIFS_MSG_JNL: journal messages
126 * UBIFS_MSG_MNT: mount messages
127 * UBIFS_MSG_CMT: commit messages
128 * UBIFS_MSG_FIND: LEB find messages
129 * UBIFS_MSG_BUDG: budgeting messages
130 * UBIFS_MSG_GC: garbage collection messages
131 * UBIFS_MSG_TNC: TNC messages
132 * UBIFS_MSG_LP: lprops messages
133 * UBIFS_MSG_IO: I/O messages
134 * UBIFS_MSG_LOG: log messages
135 * UBIFS_MSG_SCAN: scan messages
136 * UBIFS_MSG_RCVRY: recovery messages
137 */
/* One bit per debugging message type */
enum {
	UBIFS_MSG_GEN   = 1 << 0,
	UBIFS_MSG_JNL   = 1 << 1,
	UBIFS_MSG_MNT   = 1 << 2,
	UBIFS_MSG_CMT   = 1 << 3,
	UBIFS_MSG_FIND  = 1 << 4,
	UBIFS_MSG_BUDG  = 1 << 5,
	UBIFS_MSG_GC    = 1 << 6,
	UBIFS_MSG_TNC   = 1 << 7,
	UBIFS_MSG_LP    = 1 << 8,
	UBIFS_MSG_IO    = 1 << 9,
	UBIFS_MSG_LOG   = 1 << 10,
	UBIFS_MSG_SCAN  = 1 << 11,
	UBIFS_MSG_RCVRY = 1 << 12,
};
153
154/* Debugging message type flags for each default debug message level */
155#define UBIFS_MSG_LVL_0 0
156#define UBIFS_MSG_LVL_1 0x1
157#define UBIFS_MSG_LVL_2 0x7f
158#define UBIFS_MSG_LVL_3 0xffff
159
160/*
161 * Debugging check flags (must match chk_names in debug.c).
162 *
163 * UBIFS_CHK_GEN: general checks
164 * UBIFS_CHK_TNC: check TNC
165 * UBIFS_CHK_IDX_SZ: check index size
166 * UBIFS_CHK_ORPH: check orphans
167 * UBIFS_CHK_OLD_IDX: check the old index
168 * UBIFS_CHK_LPROPS: check lprops
169 * UBIFS_CHK_FS: check the file-system
170 */
/* One bit per debugging check */
enum {
	UBIFS_CHK_GEN     = 1 << 0,
	UBIFS_CHK_TNC     = 1 << 1,
	UBIFS_CHK_IDX_SZ  = 1 << 2,
	UBIFS_CHK_ORPH    = 1 << 3,
	UBIFS_CHK_OLD_IDX = 1 << 4,
	UBIFS_CHK_LPROPS  = 1 << 5,
	UBIFS_CHK_FS      = 1 << 6,
};
180
181/*
182 * Special testing flags (must match tst_names in debug.c).
183 *
184 * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method
185 * UBIFS_TST_RCVRY: failure mode for recovery testing
186 */
/* Special testing flags; note that the lowest bit (0x1) is not used here */
enum {
	UBIFS_TST_FORCE_IN_THE_GAPS = 1 << 1,
	UBIFS_TST_RCVRY             = 1 << 2,
};
191
192#if CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 1
193#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_1
194#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 2
195#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_2
196#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 3
197#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_3
198#else
199#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_0
200#endif
201
202#ifdef CONFIG_UBIFS_FS_DEBUG_CHKS
203#define UBIFS_CHK_FLAGS_DEFAULT 0xffffffff
204#else
205#define UBIFS_CHK_FLAGS_DEFAULT 0
206#endif
207
208extern spinlock_t dbg_lock;
209
210extern unsigned int ubifs_msg_flags;
211extern unsigned int ubifs_chk_flags;
212extern unsigned int ubifs_tst_flags;
213
214/* Dump functions */
215
216const char *dbg_ntype(int type);
217const char *dbg_cstate(int cmt_state);
218const char *dbg_get_key_dump(const struct ubifs_info *c,
219 const union ubifs_key *key);
220void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode);
221void dbg_dump_node(const struct ubifs_info *c, const void *node);
222void dbg_dump_budget_req(const struct ubifs_budget_req *req);
223void dbg_dump_lstats(const struct ubifs_lp_stats *lst);
224void dbg_dump_budg(struct ubifs_info *c);
225void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp);
226void dbg_dump_lprops(struct ubifs_info *c);
227void dbg_dump_leb(const struct ubifs_info *c, int lnum);
228void dbg_dump_znode(const struct ubifs_info *c,
229 const struct ubifs_znode *znode);
230void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat);
231void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
232 struct ubifs_nnode *parent, int iip);
233void dbg_dump_tnc(struct ubifs_info *c);
234void dbg_dump_index(struct ubifs_info *c);
235
236/* Checking helper functions */
237
238typedef int (*dbg_leaf_callback)(struct ubifs_info *c,
239 struct ubifs_zbranch *zbr, void *priv);
240typedef int (*dbg_znode_callback)(struct ubifs_info *c,
241 struct ubifs_znode *znode, void *priv);
242
243int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
244 dbg_znode_callback znode_cb, void *priv);
245
246/* Checking functions */
247
248int dbg_check_lprops(struct ubifs_info *c);
249
250int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot);
251int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot);
252
253int dbg_check_cats(struct ubifs_info *c);
254
255int dbg_check_ltab(struct ubifs_info *c);
256
257int dbg_check_synced_i_size(struct inode *inode);
258
259int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir);
260
261int dbg_check_tnc(struct ubifs_info *c, int extra);
262
263int dbg_check_idx_size(struct ubifs_info *c, long long idx_size);
264
265int dbg_check_filesystem(struct ubifs_info *c);
266
267void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
268 int add_pos);
269
270int dbg_check_lprops(struct ubifs_info *c);
271int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
272 int row, int col);
273
274/* Force the use of in-the-gaps method for testing */
275
276#define dbg_force_in_the_gaps_enabled \
277 (ubifs_tst_flags & UBIFS_TST_FORCE_IN_THE_GAPS)
278
279int dbg_force_in_the_gaps(void);
280
281/* Failure mode for recovery testing */
282
283#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY)
284
285void dbg_failure_mode_registration(struct ubifs_info *c);
286void dbg_failure_mode_deregistration(struct ubifs_info *c);
287
288#ifndef UBIFS_DBG_PRESERVE_UBI
289
290#define ubi_leb_read dbg_leb_read
291#define ubi_leb_write dbg_leb_write
292#define ubi_leb_change dbg_leb_change
293#define ubi_leb_erase dbg_leb_erase
294#define ubi_leb_unmap dbg_leb_unmap
295#define ubi_is_mapped dbg_is_mapped
296#define ubi_leb_map dbg_leb_map
297
298#endif
299
300int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
301 int len, int check);
302int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
303 int offset, int len, int dtype);
304int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf,
305 int len, int dtype);
306int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum);
307int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum);
308int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum);
309int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype);
310
/* Shorthand for 'dbg_leb_read()' with the 'check' argument set to zero */
static inline int dbg_read(struct ubi_volume_desc *desc, int lnum, char *buf,
			   int offset, int len)
{
	const int check = 0;

	return dbg_leb_read(desc, lnum, buf, offset, len, check);
}
316
317static inline int dbg_write(struct ubi_volume_desc *desc, int lnum,
318 const void *buf, int offset, int len)
319{
320 return dbg_leb_write(desc, lnum, buf, offset, len, UBI_UNKNOWN);
321}
322
323static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
324 const void *buf, int len)
325{
326 return dbg_leb_change(desc, lnum, buf, len, UBI_UNKNOWN);
327}
328
329#else /* !CONFIG_UBIFS_FS_DEBUG */
330
331#define UBIFS_DBG(op)
332#define ubifs_assert(expr) ({})
333#define ubifs_assert_cmt_locked(c)
334#define dbg_dump_stack()
335#define dbg_err(fmt, ...) ({})
336#define dbg_msg(fmt, ...) ({})
337#define dbg_key(c, key, fmt, ...) ({})
338
339#define dbg_gen(fmt, ...) ({})
340#define dbg_jnl(fmt, ...) ({})
341#define dbg_tnc(fmt, ...) ({})
342#define dbg_lp(fmt, ...) ({})
343#define dbg_find(fmt, ...) ({})
344#define dbg_mnt(fmt, ...) ({})
345#define dbg_io(fmt, ...) ({})
346#define dbg_cmt(fmt, ...) ({})
347#define dbg_budg(fmt, ...) ({})
348#define dbg_log(fmt, ...) ({})
349#define dbg_gc(fmt, ...) ({})
350#define dbg_scan(fmt, ...) ({})
351#define dbg_rcvry(fmt, ...) ({})
352
353#define dbg_ntype(type) ""
354#define dbg_cstate(cmt_state) ""
355#define dbg_get_key_dump(c, key) ({})
356#define dbg_dump_inode(c, inode) ({})
357#define dbg_dump_node(c, node) ({})
358#define dbg_dump_budget_req(req) ({})
359#define dbg_dump_lstats(lst) ({})
360#define dbg_dump_budg(c) ({})
361#define dbg_dump_lprop(c, lp) ({})
362#define dbg_dump_lprops(c) ({})
363#define dbg_dump_leb(c, lnum) ({})
364#define dbg_dump_znode(c, znode) ({})
365#define dbg_dump_heap(c, heap, cat) ({})
366#define dbg_dump_pnode(c, pnode, parent, iip) ({})
367#define dbg_dump_tnc(c) ({})
368#define dbg_dump_index(c) ({})
369
370#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0
371
372#define dbg_old_index_check_init(c, zroot) 0
373#define dbg_check_old_index(c, zroot) 0
374
375#define dbg_check_cats(c) 0
376
377#define dbg_check_ltab(c) 0
378
379#define dbg_check_synced_i_size(inode) 0
380
381#define dbg_check_dir_size(c, dir) 0
382
383#define dbg_check_tnc(c, x) 0
384
385#define dbg_check_idx_size(c, idx_size) 0
386
387#define dbg_check_filesystem(c) 0
388
389#define dbg_check_heap(c, heap, cat, add_pos) ({})
390
391#define dbg_check_lprops(c) 0
392#define dbg_check_lpt_nodes(c, cnode, row, col) 0
393
394#define dbg_force_in_the_gaps_enabled 0
395#define dbg_force_in_the_gaps() 0
396
397#define dbg_failure_mode 0
398#define dbg_failure_mode_registration(c) ({})
399#define dbg_failure_mode_deregistration(c) ({})
400
401#endif /* !CONFIG_UBIFS_FS_DEBUG */
402
403#endif /* !__UBIFS_DEBUG_H__ */
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
new file mode 100644
index 000000000000..e90374be7d3b
--- /dev/null
+++ b/fs/ubifs/dir.c
@@ -0,0 +1,1240 @@
1/* * This file is part of UBIFS.
2 *
3 * Copyright (C) 2006-2008 Nokia Corporation.
4 * Copyright (C) 2006, 2007 University of Szeged, Hungary
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 * Zoltan Sogor
22 */
23
24/*
25 * This file implements directory operations.
26 *
27 * All FS operations in this file allocate budget before writing anything to the
28 * media. If they fail to allocate it, the error is returned. The only
29 * exceptions are 'ubifs_unlink()' and 'ubifs_rmdir()' which keep working even
30 * if they are unable to allocate the budget, because deletion %-ENOSPC failure is
31 * not what users are usually ready to get. UBIFS budgeting subsystem has some
32 * space reserved for these purposes.
33 *
34 * All operations in this file write all inodes which they change straight
35 * away, instead of marking them dirty. For example, 'ubifs_link()' changes
36 * @i_size of the parent inode and writes the parent inode together with the
37 * target inode. This was done to simplify file-system recovery which would
38 * otherwise be very difficult to do. The only exception is rename, which only
39 * marks the re-named inode dirty (because its @i_ctime is updated) instead of
40 * writing it straight away.
41 */
42
43#include "ubifs.h"
44
45/**
46 * inherit_flags - inherit flags of the parent inode.
47 * @dir: parent inode
48 * @mode: new inode mode flags
49 *
50 * This is a helper function for 'ubifs_new_inode()' which inherits flag of the
51 * parent directory inode @dir. UBIFS inodes inherit the following flags:
52 * o %UBIFS_COMPR_FL, which is useful to switch compression on/of on
53 * sub-directory basis;
54 * o %UBIFS_SYNC_FL - useful for the same reasons;
55 * o %UBIFS_DIRSYNC_FL - similar, but relevant only to directories.
56 *
57 * This function returns the inherited flags.
58 */
59static int inherit_flags(const struct inode *dir, int mode)
60{
61 int flags;
62 const struct ubifs_inode *ui = ubifs_inode(dir);
63
64 if (!S_ISDIR(dir->i_mode))
65 /*
66 * The parent is not a directory, which means that an extended
67 * attribute inode is being created. No flags.
68 */
69 return 0;
70
71 flags = ui->flags & (UBIFS_COMPR_FL | UBIFS_SYNC_FL | UBIFS_DIRSYNC_FL);
72 if (!S_ISDIR(mode))
73 /* The "DIRSYNC" flag only applies to directories */
74 flags &= ~UBIFS_DIRSYNC_FL;
75 return flags;
76}
77
78/**
79 * ubifs_new_inode - allocate new UBIFS inode object.
80 * @c: UBIFS file-system description object
81 * @dir: parent directory inode
82 * @mode: inode mode flags
83 *
84 * This function finds an unused inode number, allocates new inode and
85 * initializes it. Returns new inode in case of success and an error code in
86 * case of failure.
87 */
88struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
89 int mode)
90{
91 struct inode *inode;
92 struct ubifs_inode *ui;
93
94 inode = new_inode(c->vfs_sb);
95 ui = ubifs_inode(inode);
96 if (!inode)
97 return ERR_PTR(-ENOMEM);
98
99 /*
100 * Set 'S_NOCMTIME' to prevent VFS form updating [mc]time of inodes and
101 * marking them dirty in file write path (see 'file_update_time()').
102 * UBIFS has to fully control "clean <-> dirty" transitions of inodes
103 * to make budgeting work.
104 */
105 inode->i_flags |= (S_NOCMTIME);
106
107 inode->i_uid = current->fsuid;
108 if (dir->i_mode & S_ISGID) {
109 inode->i_gid = dir->i_gid;
110 if (S_ISDIR(mode))
111 mode |= S_ISGID;
112 } else
113 inode->i_gid = current->fsgid;
114 inode->i_mode = mode;
115 inode->i_mtime = inode->i_atime = inode->i_ctime =
116 ubifs_current_time(inode);
117 inode->i_mapping->nrpages = 0;
118 /* Disable readahead */
119 inode->i_mapping->backing_dev_info = &c->bdi;
120
121 switch (mode & S_IFMT) {
122 case S_IFREG:
123 inode->i_mapping->a_ops = &ubifs_file_address_operations;
124 inode->i_op = &ubifs_file_inode_operations;
125 inode->i_fop = &ubifs_file_operations;
126 break;
127 case S_IFDIR:
128 inode->i_op = &ubifs_dir_inode_operations;
129 inode->i_fop = &ubifs_dir_operations;
130 inode->i_size = ui->ui_size = UBIFS_INO_NODE_SZ;
131 break;
132 case S_IFLNK:
133 inode->i_op = &ubifs_symlink_inode_operations;
134 break;
135 case S_IFSOCK:
136 case S_IFIFO:
137 case S_IFBLK:
138 case S_IFCHR:
139 inode->i_op = &ubifs_file_inode_operations;
140 break;
141 default:
142 BUG();
143 }
144
145 ui->flags = inherit_flags(dir, mode);
146 ubifs_set_inode_flags(inode);
147 if (S_ISREG(mode))
148 ui->compr_type = c->default_compr;
149 else
150 ui->compr_type = UBIFS_COMPR_NONE;
151 ui->synced_i_size = 0;
152
153 spin_lock(&c->cnt_lock);
154 /* Inode number overflow is currently not supported */
155 if (c->highest_inum >= INUM_WARN_WATERMARK) {
156 if (c->highest_inum >= INUM_WATERMARK) {
157 spin_unlock(&c->cnt_lock);
158 ubifs_err("out of inode numbers");
159 make_bad_inode(inode);
160 iput(inode);
161 return ERR_PTR(-EINVAL);
162 }
163 ubifs_warn("running out of inode numbers (current %lu, max %d)",
164 c->highest_inum, INUM_WATERMARK);
165 }
166
167 inode->i_ino = ++c->highest_inum;
168 inode->i_generation = ++c->vfs_gen;
169 /*
170 * The creation sequence number remains with this inode for its
171 * lifetime. All nodes for this inode have a greater sequence number,
172 * and so it is possible to distinguish obsolete nodes belonging to a
173 * previous incarnation of the same inode number - for example, for the
174 * purpose of rebuilding the index.
175 */
176 ui->creat_sqnum = ++c->max_sqnum;
177 spin_unlock(&c->cnt_lock);
178 return inode;
179}
180
181#ifdef CONFIG_UBIFS_FS_DEBUG
182
183static int dbg_check_name(struct ubifs_dent_node *dent, struct qstr *nm)
184{
185 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
186 return 0;
187 if (le16_to_cpu(dent->nlen) != nm->len)
188 return -EINVAL;
189 if (memcmp(dent->name, nm->name, nm->len))
190 return -EINVAL;
191 return 0;
192}
193
194#else
195
196#define dbg_check_name(dent, nm) 0
197
198#endif
199
200static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
201 struct nameidata *nd)
202{
203 int err;
204 union ubifs_key key;
205 struct inode *inode = NULL;
206 struct ubifs_dent_node *dent;
207 struct ubifs_info *c = dir->i_sb->s_fs_info;
208
209 dbg_gen("'%.*s' in dir ino %lu",
210 dentry->d_name.len, dentry->d_name.name, dir->i_ino);
211
212 if (dentry->d_name.len > UBIFS_MAX_NLEN)
213 return ERR_PTR(-ENAMETOOLONG);
214
215 dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
216 if (!dent)
217 return ERR_PTR(-ENOMEM);
218
219 dent_key_init(c, &key, dir->i_ino, &dentry->d_name);
220
221 err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name);
222 if (err) {
223 /*
224 * Do not hash the direntry if parent 'i_nlink' is zero, because
225 * this has side-effects - '->delete_inode()' call will not be
226 * called for the parent orphan inode, because 'd_count' of its
227 * direntry will stay 1 (it'll be negative direntry I guess)
228 * and prevent 'iput_final()' until the dentry is destroyed due
229 * to unmount or memory pressure.
230 */
231 if (err == -ENOENT && dir->i_nlink != 0) {
232 dbg_gen("not found");
233 goto done;
234 }
235 goto out;
236 }
237
238 if (dbg_check_name(dent, &dentry->d_name)) {
239 err = -EINVAL;
240 goto out;
241 }
242
243 inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum));
244 if (IS_ERR(inode)) {
245 /*
246 * This should not happen. Probably the file-system needs
247 * checking.
248 */
249 err = PTR_ERR(inode);
250 ubifs_err("dead directory entry '%.*s', error %d",
251 dentry->d_name.len, dentry->d_name.name, err);
252 ubifs_ro_mode(c, err);
253 goto out;
254 }
255
256done:
257 kfree(dent);
258 /*
259 * Note, d_splice_alias() would be required instead if we supported
260 * NFS.
261 */
262 d_add(dentry, inode);
263 return NULL;
264
265out:
266 kfree(dent);
267 return ERR_PTR(err);
268}
269
270static int ubifs_create(struct inode *dir, struct dentry *dentry, int mode,
271 struct nameidata *nd)
272{
273 struct inode *inode;
274 struct ubifs_info *c = dir->i_sb->s_fs_info;
275 int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
276 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
277 .dirtied_ino = 1 };
278 struct ubifs_inode *dir_ui = ubifs_inode(dir);
279
280 /*
281 * Budget request settings: new inode, new direntry, changing the
282 * parent directory inode.
283 */
284
285 dbg_gen("dent '%.*s', mode %#x in dir ino %lu",
286 dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino);
287
288 err = ubifs_budget_space(c, &req);
289 if (err)
290 return err;
291
292 inode = ubifs_new_inode(c, dir, mode);
293 if (IS_ERR(inode)) {
294 err = PTR_ERR(inode);
295 goto out_budg;
296 }
297
298 mutex_lock(&dir_ui->ui_mutex);
299 dir->i_size += sz_change;
300 dir_ui->ui_size = dir->i_size;
301 dir->i_mtime = dir->i_ctime = inode->i_ctime;
302 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
303 if (err)
304 goto out_cancel;
305 mutex_unlock(&dir_ui->ui_mutex);
306
307 ubifs_release_budget(c, &req);
308 insert_inode_hash(inode);
309 d_instantiate(dentry, inode);
310 return 0;
311
312out_cancel:
313 dir->i_size -= sz_change;
314 dir_ui->ui_size = dir->i_size;
315 mutex_unlock(&dir_ui->ui_mutex);
316 make_bad_inode(inode);
317 iput(inode);
318out_budg:
319 ubifs_release_budget(c, &req);
320 ubifs_err("cannot create regular file, error %d", err);
321 return err;
322}
323
324/**
325 * vfs_dent_type - get VFS directory entry type.
326 * @type: UBIFS directory entry type
327 *
328 * This function converts UBIFS directory entry type into VFS directory entry
329 * type.
330 */
331static unsigned int vfs_dent_type(uint8_t type)
332{
333 switch (type) {
334 case UBIFS_ITYPE_REG:
335 return DT_REG;
336 case UBIFS_ITYPE_DIR:
337 return DT_DIR;
338 case UBIFS_ITYPE_LNK:
339 return DT_LNK;
340 case UBIFS_ITYPE_BLK:
341 return DT_BLK;
342 case UBIFS_ITYPE_CHR:
343 return DT_CHR;
344 case UBIFS_ITYPE_FIFO:
345 return DT_FIFO;
346 case UBIFS_ITYPE_SOCK:
347 return DT_SOCK;
348 default:
349 BUG();
350 }
351 return 0;
352}
353
354/*
355 * The classical Unix view for directory is that it is a linear array of
356 * (name, inode number) entries. Linux/VFS assumes this model as well.
357 * Particularly, 'readdir()' call wants us to return a directory entry offset
358 * which later may be used to continue 'readdir()'ing the directory or to
359 * 'seek()' to that specific direntry. Obviously UBIFS does not really fit this
360 * model because directory entries are identified by keys, which may collide.
361 *
362 * UBIFS uses directory entry hash value for directory offsets, so
363 * 'seekdir()'/'telldir()' may not always work because of possible key
364 * collisions. But UBIFS guarantees that consecutive 'readdir()' calls work
365 * properly by means of saving full directory entry name in the private field
366 * of the file description object.
367 *
368 * This means that UBIFS cannot support NFS which requires full
369 * 'seekdir()'/'telldir()' support.
370 */
371static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
372{
373 int err, over = 0;
374 struct qstr nm;
375 union ubifs_key key;
376 struct ubifs_dent_node *dent;
377 struct inode *dir = file->f_path.dentry->d_inode;
378 struct ubifs_info *c = dir->i_sb->s_fs_info;
379
380 dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos);
381
382 if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2)
383 /*
384 * The directory was seek'ed to a senseless position or there
385 * are no more entries.
386 */
387 return 0;
388
389 /* File positions 0 and 1 correspond to "." and ".." */
390 if (file->f_pos == 0) {
391 ubifs_assert(!file->private_data);
392 over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR);
393 if (over)
394 return 0;
395 file->f_pos = 1;
396 }
397
398 if (file->f_pos == 1) {
399 ubifs_assert(!file->private_data);
400 over = filldir(dirent, "..", 2, 1,
401 parent_ino(file->f_path.dentry), DT_DIR);
402 if (over)
403 return 0;
404
405 /* Find the first entry in TNC and save it */
406 lowest_dent_key(c, &key, dir->i_ino);
407 nm.name = NULL;
408 dent = ubifs_tnc_next_ent(c, &key, &nm);
409 if (IS_ERR(dent)) {
410 err = PTR_ERR(dent);
411 goto out;
412 }
413
414 file->f_pos = key_hash_flash(c, &dent->key);
415 file->private_data = dent;
416 }
417
418 dent = file->private_data;
419 if (!dent) {
420 /*
421 * The directory was seek'ed to and is now readdir'ed.
422 * Find the entry corresponding to @file->f_pos or the
423 * closest one.
424 */
425 dent_key_init_hash(c, &key, dir->i_ino, file->f_pos);
426 nm.name = NULL;
427 dent = ubifs_tnc_next_ent(c, &key, &nm);
428 if (IS_ERR(dent)) {
429 err = PTR_ERR(dent);
430 goto out;
431 }
432 file->f_pos = key_hash_flash(c, &dent->key);
433 file->private_data = dent;
434 }
435
436 while (1) {
437 dbg_gen("feed '%s', ino %llu, new f_pos %#x",
438 dent->name, le64_to_cpu(dent->inum),
439 key_hash_flash(c, &dent->key));
440 ubifs_assert(dent->ch.sqnum > ubifs_inode(dir)->creat_sqnum);
441
442 nm.len = le16_to_cpu(dent->nlen);
443 over = filldir(dirent, dent->name, nm.len, file->f_pos,
444 le64_to_cpu(dent->inum),
445 vfs_dent_type(dent->type));
446 if (over)
447 return 0;
448
449 /* Switch to the next entry */
450 key_read(c, &dent->key, &key);
451 nm.name = dent->name;
452 dent = ubifs_tnc_next_ent(c, &key, &nm);
453 if (IS_ERR(dent)) {
454 err = PTR_ERR(dent);
455 goto out;
456 }
457
458 kfree(file->private_data);
459 file->f_pos = key_hash_flash(c, &dent->key);
460 file->private_data = dent;
461 cond_resched();
462 }
463
464out:
465 if (err != -ENOENT) {
466 ubifs_err("cannot find next direntry, error %d", err);
467 return err;
468 }
469
470 kfree(file->private_data);
471 file->private_data = NULL;
472 file->f_pos = 2;
473 return 0;
474}
475
476/* If a directory is seeked, we have to free saved readdir() state */
477static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int origin)
478{
479 kfree(file->private_data);
480 file->private_data = NULL;
481 return generic_file_llseek(file, offset, origin);
482}
483
484/* Free saved readdir() state when the directory is closed */
485static int ubifs_dir_release(struct inode *dir, struct file *file)
486{
487 kfree(file->private_data);
488 file->private_data = NULL;
489 return 0;
490}
491
492/**
493 * lock_2_inodes - lock two UBIFS inodes.
494 * @inode1: first inode
495 * @inode2: second inode
496 */
497static void lock_2_inodes(struct inode *inode1, struct inode *inode2)
498{
499 if (inode1->i_ino < inode2->i_ino) {
500 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_2);
501 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_3);
502 } else {
503 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
504 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_3);
505 }
506}
507
508/**
509 * unlock_2_inodes - unlock two UBIFS inodes inodes.
510 * @inode1: first inode
511 * @inode2: second inode
512 */
513static void unlock_2_inodes(struct inode *inode1, struct inode *inode2)
514{
515 mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
516 mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
517}
518
519static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
520 struct dentry *dentry)
521{
522 struct ubifs_info *c = dir->i_sb->s_fs_info;
523 struct inode *inode = old_dentry->d_inode;
524 struct ubifs_inode *ui = ubifs_inode(inode);
525 struct ubifs_inode *dir_ui = ubifs_inode(dir);
526 int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
527 struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2,
528 .dirtied_ino_d = ui->data_len };
529
530 /*
531 * Budget request settings: new direntry, changing the target inode,
532 * changing the parent inode.
533 */
534
535 dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu",
536 dentry->d_name.len, dentry->d_name.name, inode->i_ino,
537 inode->i_nlink, dir->i_ino);
538 err = dbg_check_synced_i_size(inode);
539 if (err)
540 return err;
541
542 err = ubifs_budget_space(c, &req);
543 if (err)
544 return err;
545
546 lock_2_inodes(dir, inode);
547 inc_nlink(inode);
548 atomic_inc(&inode->i_count);
549 inode->i_ctime = ubifs_current_time(inode);
550 dir->i_size += sz_change;
551 dir_ui->ui_size = dir->i_size;
552 dir->i_mtime = dir->i_ctime = inode->i_ctime;
553 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
554 if (err)
555 goto out_cancel;
556 unlock_2_inodes(dir, inode);
557
558 ubifs_release_budget(c, &req);
559 d_instantiate(dentry, inode);
560 return 0;
561
562out_cancel:
563 dir->i_size -= sz_change;
564 dir_ui->ui_size = dir->i_size;
565 drop_nlink(inode);
566 unlock_2_inodes(dir, inode);
567 ubifs_release_budget(c, &req);
568 iput(inode);
569 return err;
570}
571
572static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
573{
574 struct ubifs_info *c = dir->i_sb->s_fs_info;
575 struct inode *inode = dentry->d_inode;
576 struct ubifs_inode *dir_ui = ubifs_inode(dir);
577 int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
578 int err, budgeted = 1;
579 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
580
581 /*
582 * Budget request settings: deletion direntry, deletion inode (+1 for
583 * @dirtied_ino), changing the parent directory inode. If budgeting
584 * fails, go ahead anyway because we have extra space reserved for
585 * deletions.
586 */
587
588 dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu",
589 dentry->d_name.len, dentry->d_name.name, inode->i_ino,
590 inode->i_nlink, dir->i_ino);
591 err = dbg_check_synced_i_size(inode);
592 if (err)
593 return err;
594
595 err = ubifs_budget_space(c, &req);
596 if (err) {
597 if (err != -ENOSPC)
598 return err;
599 err = 0;
600 budgeted = 0;
601 }
602
603 lock_2_inodes(dir, inode);
604 inode->i_ctime = ubifs_current_time(dir);
605 drop_nlink(inode);
606 dir->i_size -= sz_change;
607 dir_ui->ui_size = dir->i_size;
608 dir->i_mtime = dir->i_ctime = inode->i_ctime;
609 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0);
610 if (err)
611 goto out_cancel;
612 unlock_2_inodes(dir, inode);
613
614 if (budgeted)
615 ubifs_release_budget(c, &req);
616 else {
617 /* We've deleted something - clean the "no space" flags */
618 c->nospace = c->nospace_rp = 0;
619 smp_wmb();
620 }
621 return 0;
622
623out_cancel:
624 dir->i_size += sz_change;
625 dir_ui->ui_size = dir->i_size;
626 inc_nlink(inode);
627 unlock_2_inodes(dir, inode);
628 if (budgeted)
629 ubifs_release_budget(c, &req);
630 return err;
631}
632
633/**
634 * check_dir_empty - check if a directory is empty or not.
635 * @c: UBIFS file-system description object
636 * @dir: VFS inode object of the directory to check
637 *
638 * This function checks if directory @dir is empty. Returns zero if the
639 * directory is empty, %-ENOTEMPTY if it is not, and other negative error codes
640 * in case of of errors.
641 */
642static int check_dir_empty(struct ubifs_info *c, struct inode *dir)
643{
644 struct qstr nm = { .name = NULL };
645 struct ubifs_dent_node *dent;
646 union ubifs_key key;
647 int err;
648
649 lowest_dent_key(c, &key, dir->i_ino);
650 dent = ubifs_tnc_next_ent(c, &key, &nm);
651 if (IS_ERR(dent)) {
652 err = PTR_ERR(dent);
653 if (err == -ENOENT)
654 err = 0;
655 } else {
656 kfree(dent);
657 err = -ENOTEMPTY;
658 }
659 return err;
660}
661
662static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
663{
664 struct ubifs_info *c = dir->i_sb->s_fs_info;
665 struct inode *inode = dentry->d_inode;
666 int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
667 int err, budgeted = 1;
668 struct ubifs_inode *dir_ui = ubifs_inode(dir);
669 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
670
671 /*
672 * Budget request settings: deletion direntry, deletion inode and
673 * changing the parent inode. If budgeting fails, go ahead anyway
674 * because we have extra space reserved for deletions.
675 */
676
677 dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len,
678 dentry->d_name.name, inode->i_ino, dir->i_ino);
679
680 err = check_dir_empty(c, dentry->d_inode);
681 if (err)
682 return err;
683
684 err = ubifs_budget_space(c, &req);
685 if (err) {
686 if (err != -ENOSPC)
687 return err;
688 budgeted = 0;
689 }
690
691 lock_2_inodes(dir, inode);
692 inode->i_ctime = ubifs_current_time(dir);
693 clear_nlink(inode);
694 drop_nlink(dir);
695 dir->i_size -= sz_change;
696 dir_ui->ui_size = dir->i_size;
697 dir->i_mtime = dir->i_ctime = inode->i_ctime;
698 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0);
699 if (err)
700 goto out_cancel;
701 unlock_2_inodes(dir, inode);
702
703 if (budgeted)
704 ubifs_release_budget(c, &req);
705 else {
706 /* We've deleted something - clean the "no space" flags */
707 c->nospace = c->nospace_rp = 0;
708 smp_wmb();
709 }
710 return 0;
711
712out_cancel:
713 dir->i_size += sz_change;
714 dir_ui->ui_size = dir->i_size;
715 inc_nlink(dir);
716 inc_nlink(inode);
717 inc_nlink(inode);
718 unlock_2_inodes(dir, inode);
719 if (budgeted)
720 ubifs_release_budget(c, &req);
721 return err;
722}
723
724static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
725{
726 struct inode *inode;
727 struct ubifs_inode *dir_ui = ubifs_inode(dir);
728 struct ubifs_info *c = dir->i_sb->s_fs_info;
729 int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
730 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
731 .dirtied_ino_d = 1 };
732
733 /*
734 * Budget request settings: new inode, new direntry and changing parent
735 * directory inode.
736 */
737
738 dbg_gen("dent '%.*s', mode %#x in dir ino %lu",
739 dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino);
740
741 err = ubifs_budget_space(c, &req);
742 if (err)
743 return err;
744
745 inode = ubifs_new_inode(c, dir, S_IFDIR | mode);
746 if (IS_ERR(inode)) {
747 err = PTR_ERR(inode);
748 goto out_budg;
749 }
750
751 mutex_lock(&dir_ui->ui_mutex);
752 insert_inode_hash(inode);
753 inc_nlink(inode);
754 inc_nlink(dir);
755 dir->i_size += sz_change;
756 dir_ui->ui_size = dir->i_size;
757 dir->i_mtime = dir->i_ctime = inode->i_ctime;
758 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
759 if (err) {
760 ubifs_err("cannot create directory, error %d", err);
761 goto out_cancel;
762 }
763 mutex_unlock(&dir_ui->ui_mutex);
764
765 ubifs_release_budget(c, &req);
766 d_instantiate(dentry, inode);
767 return 0;
768
769out_cancel:
770 dir->i_size -= sz_change;
771 dir_ui->ui_size = dir->i_size;
772 drop_nlink(dir);
773 mutex_unlock(&dir_ui->ui_mutex);
774 make_bad_inode(inode);
775 iput(inode);
776out_budg:
777 ubifs_release_budget(c, &req);
778 return err;
779}
780
781static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
782 int mode, dev_t rdev)
783{
784 struct inode *inode;
785 struct ubifs_inode *ui;
786 struct ubifs_inode *dir_ui = ubifs_inode(dir);
787 struct ubifs_info *c = dir->i_sb->s_fs_info;
788 union ubifs_dev_desc *dev = NULL;
789 int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
790 int err, devlen = 0;
791 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
792 .new_ino_d = devlen, .dirtied_ino = 1 };
793
794 /*
795 * Budget request settings: new inode, new direntry and changing parent
796 * directory inode.
797 */
798
799 dbg_gen("dent '%.*s' in dir ino %lu",
800 dentry->d_name.len, dentry->d_name.name, dir->i_ino);
801
802 if (!new_valid_dev(rdev))
803 return -EINVAL;
804
805 if (S_ISBLK(mode) || S_ISCHR(mode)) {
806 dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS);
807 if (!dev)
808 return -ENOMEM;
809 devlen = ubifs_encode_dev(dev, rdev);
810 }
811
812 err = ubifs_budget_space(c, &req);
813 if (err) {
814 kfree(dev);
815 return err;
816 }
817
818 inode = ubifs_new_inode(c, dir, mode);
819 if (IS_ERR(inode)) {
820 kfree(dev);
821 err = PTR_ERR(inode);
822 goto out_budg;
823 }
824
825 init_special_inode(inode, inode->i_mode, rdev);
826 inode->i_size = ubifs_inode(inode)->ui_size = devlen;
827 ui = ubifs_inode(inode);
828 ui->data = dev;
829 ui->data_len = devlen;
830
831 mutex_lock(&dir_ui->ui_mutex);
832 dir->i_size += sz_change;
833 dir_ui->ui_size = dir->i_size;
834 dir->i_mtime = dir->i_ctime = inode->i_ctime;
835 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
836 if (err)
837 goto out_cancel;
838 mutex_unlock(&dir_ui->ui_mutex);
839
840 ubifs_release_budget(c, &req);
841 insert_inode_hash(inode);
842 d_instantiate(dentry, inode);
843 return 0;
844
845out_cancel:
846 dir->i_size -= sz_change;
847 dir_ui->ui_size = dir->i_size;
848 mutex_unlock(&dir_ui->ui_mutex);
849 make_bad_inode(inode);
850 iput(inode);
851out_budg:
852 ubifs_release_budget(c, &req);
853 return err;
854}
855
856static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
857 const char *symname)
858{
859 struct inode *inode;
860 struct ubifs_inode *ui;
861 struct ubifs_inode *dir_ui = ubifs_inode(dir);
862 struct ubifs_info *c = dir->i_sb->s_fs_info;
863 int err, len = strlen(symname);
864 int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
865 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
866 .new_ino_d = len, .dirtied_ino = 1 };
867
868 /*
869 * Budget request settings: new inode, new direntry and changing parent
870 * directory inode.
871 */
872
873 dbg_gen("dent '%.*s', target '%s' in dir ino %lu", dentry->d_name.len,
874 dentry->d_name.name, symname, dir->i_ino);
875
876 if (len > UBIFS_MAX_INO_DATA)
877 return -ENAMETOOLONG;
878
879 err = ubifs_budget_space(c, &req);
880 if (err)
881 return err;
882
883 inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO);
884 if (IS_ERR(inode)) {
885 err = PTR_ERR(inode);
886 goto out_budg;
887 }
888
889 ui = ubifs_inode(inode);
890 ui->data = kmalloc(len + 1, GFP_NOFS);
891 if (!ui->data) {
892 err = -ENOMEM;
893 goto out_inode;
894 }
895
896 memcpy(ui->data, symname, len);
897 ((char *)ui->data)[len] = '\0';
898 /*
899 * The terminating zero byte is not written to the flash media and it
900 * is put just to make later in-memory string processing simpler. Thus,
901 * data length is @len, not @len + %1.
902 */
903 ui->data_len = len;
904 inode->i_size = ubifs_inode(inode)->ui_size = len;
905
906 mutex_lock(&dir_ui->ui_mutex);
907 dir->i_size += sz_change;
908 dir_ui->ui_size = dir->i_size;
909 dir->i_mtime = dir->i_ctime = inode->i_ctime;
910 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
911 if (err)
912 goto out_cancel;
913 mutex_unlock(&dir_ui->ui_mutex);
914
915 ubifs_release_budget(c, &req);
916 insert_inode_hash(inode);
917 d_instantiate(dentry, inode);
918 return 0;
919
920out_cancel:
921 dir->i_size -= sz_change;
922 dir_ui->ui_size = dir->i_size;
923 mutex_unlock(&dir_ui->ui_mutex);
924out_inode:
925 make_bad_inode(inode);
926 iput(inode);
927out_budg:
928 ubifs_release_budget(c, &req);
929 return err;
930}
931
932/**
933 * lock_3_inodes - lock three UBIFS inodes for rename.
934 * @inode1: first inode
935 * @inode2: second inode
936 * @inode3: third inode
937 *
938 * For 'ubifs_rename()', @inode1 may be the same as @inode2 whereas @inode3 may
939 * be null.
940 */
941static void lock_3_inodes(struct inode *inode1, struct inode *inode2,
942 struct inode *inode3)
943{
944 struct inode *i1, *i2, *i3;
945
946 if (!inode3) {
947 if (inode1 != inode2) {
948 lock_2_inodes(inode1, inode2);
949 return;
950 }
951 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
952 return;
953 }
954
955 if (inode1 == inode2) {
956 lock_2_inodes(inode1, inode3);
957 return;
958 }
959
960 /* 3 different inodes */
961 if (inode1 < inode2) {
962 i3 = inode2;
963 if (inode1 < inode3) {
964 i1 = inode1;
965 i2 = inode3;
966 } else {
967 i1 = inode3;
968 i2 = inode1;
969 }
970 } else {
971 i3 = inode1;
972 if (inode2 < inode3) {
973 i1 = inode2;
974 i2 = inode3;
975 } else {
976 i1 = inode3;
977 i2 = inode2;
978 }
979 }
980 mutex_lock_nested(&ubifs_inode(i1)->ui_mutex, WB_MUTEX_1);
981 lock_2_inodes(i2, i3);
982}
983
984/**
985 * unlock_3_inodes - unlock three UBIFS inodes for rename.
986 * @inode1: first inode
987 * @inode2: second inode
988 * @inode3: third inode
989 */
990static void unlock_3_inodes(struct inode *inode1, struct inode *inode2,
991 struct inode *inode3)
992{
993 mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
994 if (inode1 != inode2)
995 mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
996 if (inode3)
997 mutex_unlock(&ubifs_inode(inode3)->ui_mutex);
998}
999
1000static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
1001 struct inode *new_dir, struct dentry *new_dentry)
1002{
1003 struct ubifs_info *c = old_dir->i_sb->s_fs_info;
1004 struct inode *old_inode = old_dentry->d_inode;
1005 struct inode *new_inode = new_dentry->d_inode;
1006 struct ubifs_inode *old_inode_ui = ubifs_inode(old_inode);
1007 int err, release, sync = 0, move = (new_dir != old_dir);
1008 int is_dir = S_ISDIR(old_inode->i_mode);
1009 int unlink = !!new_inode;
1010 int new_sz = CALC_DENT_SIZE(new_dentry->d_name.len);
1011 int old_sz = CALC_DENT_SIZE(old_dentry->d_name.len);
1012 struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1,
1013 .dirtied_ino = 3 };
1014 struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
1015 .dirtied_ino_d = old_inode_ui->data_len };
1016 struct timespec time;
1017
1018 /*
1019 * Budget request settings: deletion direntry, new direntry, removing
1020 * the old inode, and changing old and new parent directory inodes.
1021 *
1022 * However, this operation also marks the target inode as dirty and
1023 * does not write it, so we allocate budget for the target inode
1024 * separately.
1025 */
1026
1027 dbg_gen("dent '%.*s' ino %lu in dir ino %lu to dent '%.*s' in "
1028 "dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name,
1029 old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len,
1030 new_dentry->d_name.name, new_dir->i_ino);
1031
1032 if (unlink && is_dir) {
1033 err = check_dir_empty(c, new_inode);
1034 if (err)
1035 return err;
1036 }
1037
1038 err = ubifs_budget_space(c, &req);
1039 if (err)
1040 return err;
1041 err = ubifs_budget_space(c, &ino_req);
1042 if (err) {
1043 ubifs_release_budget(c, &req);
1044 return err;
1045 }
1046
1047 lock_3_inodes(old_dir, new_dir, new_inode);
1048
1049 /*
1050 * Like most other Unix systems, set the @i_ctime for inodes on a
1051 * rename.
1052 */
1053 time = ubifs_current_time(old_dir);
1054 old_inode->i_ctime = time;
1055
1056 /* We must adjust parent link count when renaming directories */
1057 if (is_dir) {
1058 if (move) {
1059 /*
1060 * @old_dir loses a link because we are moving
1061 * @old_inode to a different directory.
1062 */
1063 drop_nlink(old_dir);
1064 /*
1065 * @new_dir only gains a link if we are not also
1066 * overwriting an existing directory.
1067 */
1068 if (!unlink)
1069 inc_nlink(new_dir);
1070 } else {
1071 /*
1072 * @old_inode is not moving to a different directory,
1073 * but @old_dir still loses a link if we are
1074 * overwriting an existing directory.
1075 */
1076 if (unlink)
1077 drop_nlink(old_dir);
1078 }
1079 }
1080
1081 old_dir->i_size -= old_sz;
1082 ubifs_inode(old_dir)->ui_size = old_dir->i_size;
1083 old_dir->i_mtime = old_dir->i_ctime = time;
1084 new_dir->i_mtime = new_dir->i_ctime = time;
1085
1086 /*
1087 * And finally, if we unlinked a direntry which happened to have the
1088 * same name as the moved direntry, we have to decrement @i_nlink of
1089 * the unlinked inode and change its ctime.
1090 */
1091 if (unlink) {
1092 /*
1093 * Directories cannot have hard-links, so if this is a
1094 * directory, decrement its @i_nlink twice because an empty
1095 * directory has @i_nlink 2.
1096 */
1097 if (is_dir)
1098 drop_nlink(new_inode);
1099 new_inode->i_ctime = time;
1100 drop_nlink(new_inode);
1101 } else {
1102 new_dir->i_size += new_sz;
1103 ubifs_inode(new_dir)->ui_size = new_dir->i_size;
1104 }
1105
1106 /*
1107 * Do not ask 'ubifs_jnl_rename()' to flush write-buffer if @old_inode
1108 * is dirty, because this will be done later on at the end of
1109 * 'ubifs_rename()'.
1110 */
1111 if (IS_SYNC(old_inode)) {
1112 sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir);
1113 if (unlink && IS_SYNC(new_inode))
1114 sync = 1;
1115 }
1116 err = ubifs_jnl_rename(c, old_dir, old_dentry, new_dir, new_dentry,
1117 sync);
1118 if (err)
1119 goto out_cancel;
1120
1121 unlock_3_inodes(old_dir, new_dir, new_inode);
1122 ubifs_release_budget(c, &req);
1123
1124 mutex_lock(&old_inode_ui->ui_mutex);
1125 release = old_inode_ui->dirty;
1126 mark_inode_dirty_sync(old_inode);
1127 mutex_unlock(&old_inode_ui->ui_mutex);
1128
1129 if (release)
1130 ubifs_release_budget(c, &ino_req);
1131 if (IS_SYNC(old_inode))
1132 err = old_inode->i_sb->s_op->write_inode(old_inode, 1);
1133 return err;
1134
1135out_cancel:
1136 if (unlink) {
1137 if (is_dir)
1138 inc_nlink(new_inode);
1139 inc_nlink(new_inode);
1140 } else {
1141 new_dir->i_size -= new_sz;
1142 ubifs_inode(new_dir)->ui_size = new_dir->i_size;
1143 }
1144 old_dir->i_size += old_sz;
1145 ubifs_inode(old_dir)->ui_size = old_dir->i_size;
1146 if (is_dir) {
1147 if (move) {
1148 inc_nlink(old_dir);
1149 if (!unlink)
1150 drop_nlink(new_dir);
1151 } else {
1152 if (unlink)
1153 inc_nlink(old_dir);
1154 }
1155 }
1156 unlock_3_inodes(old_dir, new_dir, new_inode);
1157 ubifs_release_budget(c, &ino_req);
1158 ubifs_release_budget(c, &req);
1159 return err;
1160}
1161
1162int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1163 struct kstat *stat)
1164{
1165 loff_t size;
1166 struct inode *inode = dentry->d_inode;
1167 struct ubifs_inode *ui = ubifs_inode(inode);
1168
1169 mutex_lock(&ui->ui_mutex);
1170 stat->dev = inode->i_sb->s_dev;
1171 stat->ino = inode->i_ino;
1172 stat->mode = inode->i_mode;
1173 stat->nlink = inode->i_nlink;
1174 stat->uid = inode->i_uid;
1175 stat->gid = inode->i_gid;
1176 stat->rdev = inode->i_rdev;
1177 stat->atime = inode->i_atime;
1178 stat->mtime = inode->i_mtime;
1179 stat->ctime = inode->i_ctime;
1180 stat->blksize = UBIFS_BLOCK_SIZE;
1181 stat->size = ui->ui_size;
1182
1183 /*
1184 * Unfortunately, the 'stat()' system call was designed for block
1185 * device based file systems, and it is not appropriate for UBIFS,
1186 * because UBIFS does not have notion of "block". For example, it is
1187 * difficult to tell how many block a directory takes - it actually
1188 * takes less than 300 bytes, but we have to round it to block size,
1189 * which introduces large mistake. This makes utilities like 'du' to
1190 * report completely senseless numbers. This is the reason why UBIFS
1191 * goes the same way as JFFS2 - it reports zero blocks for everything
1192 * but regular files, which makes more sense than reporting completely
1193 * wrong sizes.
1194 */
1195 if (S_ISREG(inode->i_mode)) {
1196 size = ui->xattr_size;
1197 size += stat->size;
1198 size = ALIGN(size, UBIFS_BLOCK_SIZE);
1199 /*
1200 * Note, user-space expects 512-byte blocks count irrespectively
1201 * of what was reported in @stat->size.
1202 */
1203 stat->blocks = size >> 9;
1204 } else
1205 stat->blocks = 0;
1206 mutex_unlock(&ui->ui_mutex);
1207 return 0;
1208}
1209
1210struct inode_operations ubifs_dir_inode_operations = {
1211 .lookup = ubifs_lookup,
1212 .create = ubifs_create,
1213 .link = ubifs_link,
1214 .symlink = ubifs_symlink,
1215 .unlink = ubifs_unlink,
1216 .mkdir = ubifs_mkdir,
1217 .rmdir = ubifs_rmdir,
1218 .mknod = ubifs_mknod,
1219 .rename = ubifs_rename,
1220 .setattr = ubifs_setattr,
1221 .getattr = ubifs_getattr,
1222#ifdef CONFIG_UBIFS_FS_XATTR
1223 .setxattr = ubifs_setxattr,
1224 .getxattr = ubifs_getxattr,
1225 .listxattr = ubifs_listxattr,
1226 .removexattr = ubifs_removexattr,
1227#endif
1228};
1229
1230struct file_operations ubifs_dir_operations = {
1231 .llseek = ubifs_dir_llseek,
1232 .release = ubifs_dir_release,
1233 .read = generic_read_dir,
1234 .readdir = ubifs_readdir,
1235 .fsync = ubifs_fsync,
1236 .unlocked_ioctl = ubifs_ioctl,
1237#ifdef CONFIG_COMPAT
1238 .compat_ioctl = ubifs_compat_ioctl,
1239#endif
1240};
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
new file mode 100644
index 000000000000..005a3b854d96
--- /dev/null
+++ b/fs/ubifs/file.c
@@ -0,0 +1,1275 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/*
24 * This file implements VFS file and inode operations of regular files, device
25 * nodes and symlinks as well as address space operations.
26 *
27 * UBIFS uses 2 page flags: PG_private and PG_checked. PG_private is set if the
28 * page is dirty and is used for budgeting purposes - dirty pages should not be
29 * budgeted. The PG_checked flag is set if full budgeting is required for the
30 * page e.g., when it corresponds to a file hole or it is just beyond the file
31 * size. The budgeting is done in 'ubifs_write_begin()', because it is OK to
32 * fail in this function, and the budget is released in 'ubifs_write_end()'. So
33 * the PG_private and PG_checked flags carry the information about how the page
34 * was budgeted, to make it possible to release the budget properly.
35 *
36 * A thing to keep in mind: inode's 'i_mutex' is locked in most VFS operations
37 * we implement. However, this is not true for '->writepage()', which might be
38 * called with 'i_mutex' unlocked. For example, when pdflush is performing
39 * write-back, it calls 'writepage()' with unlocked 'i_mutex', although the
40 * inode has 'I_LOCK' flag in this case. At "normal" work-paths 'i_mutex' is
41 * locked in '->writepage', e.g. in "sys_write -> alloc_pages -> direct reclaim
42 * path". So, in '->writepage()' we are only guaranteed that the page is
43 * locked.
44 *
45 * Similarly, 'i_mutex' does not have to be locked in readpage(), e.g.,
46 * readahead path does not have it locked ("sys_read -> generic_file_aio_read
47 * -> ondemand_readahead -> readpage"). In case of readahead, 'I_LOCK' flag is
48 * not set as well. However, UBIFS disables readahead.
49 *
50 * This, for example means that there might be 2 concurrent '->writepage()'
51 * calls for the same inode, but different inode dirty pages.
52 */
53
54#include "ubifs.h"
55#include <linux/mount.h>
56
57static int read_block(struct inode *inode, void *addr, unsigned int block,
58 struct ubifs_data_node *dn)
59{
60 struct ubifs_info *c = inode->i_sb->s_fs_info;
61 int err, len, out_len;
62 union ubifs_key key;
63 unsigned int dlen;
64
65 data_key_init(c, &key, inode->i_ino, block);
66 err = ubifs_tnc_lookup(c, &key, dn);
67 if (err) {
68 if (err == -ENOENT)
69 /* Not found, so it must be a hole */
70 memset(addr, 0, UBIFS_BLOCK_SIZE);
71 return err;
72 }
73
74 ubifs_assert(dn->ch.sqnum > ubifs_inode(inode)->creat_sqnum);
75
76 len = le32_to_cpu(dn->size);
77 if (len <= 0 || len > UBIFS_BLOCK_SIZE)
78 goto dump;
79
80 dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
81 out_len = UBIFS_BLOCK_SIZE;
82 err = ubifs_decompress(&dn->data, dlen, addr, &out_len,
83 le16_to_cpu(dn->compr_type));
84 if (err || len != out_len)
85 goto dump;
86
87 /*
88 * Data length can be less than a full block, even for blocks that are
89 * not the last in the file (e.g., as a result of making a hole and
90 * appending data). Ensure that the remainder is zeroed out.
91 */
92 if (len < UBIFS_BLOCK_SIZE)
93 memset(addr + len, 0, UBIFS_BLOCK_SIZE - len);
94
95 return 0;
96
97dump:
98 ubifs_err("bad data node (block %u, inode %lu)",
99 block, inode->i_ino);
100 dbg_dump_node(c, dn);
101 return -EINVAL;
102}
103
104static int do_readpage(struct page *page)
105{
106 void *addr;
107 int err = 0, i;
108 unsigned int block, beyond;
109 struct ubifs_data_node *dn;
110 struct inode *inode = page->mapping->host;
111 loff_t i_size = i_size_read(inode);
112
113 dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx",
114 inode->i_ino, page->index, i_size, page->flags);
115 ubifs_assert(!PageChecked(page));
116 ubifs_assert(!PagePrivate(page));
117
118 addr = kmap(page);
119
120 block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
121 beyond = (i_size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT;
122 if (block >= beyond) {
123 /* Reading beyond inode */
124 SetPageChecked(page);
125 memset(addr, 0, PAGE_CACHE_SIZE);
126 goto out;
127 }
128
129 dn = kmalloc(UBIFS_MAX_DATA_NODE_SZ, GFP_NOFS);
130 if (!dn) {
131 err = -ENOMEM;
132 goto error;
133 }
134
135 i = 0;
136 while (1) {
137 int ret;
138
139 if (block >= beyond) {
140 /* Reading beyond inode */
141 err = -ENOENT;
142 memset(addr, 0, UBIFS_BLOCK_SIZE);
143 } else {
144 ret = read_block(inode, addr, block, dn);
145 if (ret) {
146 err = ret;
147 if (err != -ENOENT)
148 break;
149 }
150 }
151 if (++i >= UBIFS_BLOCKS_PER_PAGE)
152 break;
153 block += 1;
154 addr += UBIFS_BLOCK_SIZE;
155 }
156 if (err) {
157 if (err == -ENOENT) {
158 /* Not found, so it must be a hole */
159 SetPageChecked(page);
160 dbg_gen("hole");
161 goto out_free;
162 }
163 ubifs_err("cannot read page %lu of inode %lu, error %d",
164 page->index, inode->i_ino, err);
165 goto error;
166 }
167
168out_free:
169 kfree(dn);
170out:
171 SetPageUptodate(page);
172 ClearPageError(page);
173 flush_dcache_page(page);
174 kunmap(page);
175 return 0;
176
177error:
178 kfree(dn);
179 ClearPageUptodate(page);
180 SetPageError(page);
181 flush_dcache_page(page);
182 kunmap(page);
183 return err;
184}
185
186/**
187 * release_new_page_budget - release budget of a new page.
188 * @c: UBIFS file-system description object
189 *
190 * This is a helper function which releases budget corresponding to the budget
191 * of one new page of data.
192 */
193static void release_new_page_budget(struct ubifs_info *c)
194{
195 struct ubifs_budget_req req = { .recalculate = 1, .new_page = 1 };
196
197 ubifs_release_budget(c, &req);
198}
199
200/**
201 * release_existing_page_budget - release budget of an existing page.
202 * @c: UBIFS file-system description object
203 *
204 * This is a helper function which releases budget corresponding to the budget
205 * of changing one one page of data which already exists on the flash media.
206 */
207static void release_existing_page_budget(struct ubifs_info *c)
208{
209 struct ubifs_budget_req req = { .dd_growth = c->page_budget};
210
211 ubifs_release_budget(c, &req);
212}
213
214static int write_begin_slow(struct address_space *mapping,
215 loff_t pos, unsigned len, struct page **pagep)
216{
217 struct inode *inode = mapping->host;
218 struct ubifs_info *c = inode->i_sb->s_fs_info;
219 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
220 struct ubifs_budget_req req = { .new_page = 1 };
221 int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
222 struct page *page;
223
224 dbg_gen("ino %lu, pos %llu, len %u, i_size %lld",
225 inode->i_ino, pos, len, inode->i_size);
226
227 /*
228 * At the slow path we have to budget before locking the page, because
229 * budgeting may force write-back, which would wait on locked pages and
230 * deadlock if we had the page locked. At this point we do not know
231 * anything about the page, so assume that this is a new page which is
232 * written to a hole. This corresponds to largest budget. Later the
233 * budget will be amended if this is not true.
234 */
235 if (appending)
236 /* We are appending data, budget for inode change */
237 req.dirtied_ino = 1;
238
239 err = ubifs_budget_space(c, &req);
240 if (unlikely(err))
241 return err;
242
243 page = __grab_cache_page(mapping, index);
244 if (unlikely(!page)) {
245 ubifs_release_budget(c, &req);
246 return -ENOMEM;
247 }
248
249 if (!PageUptodate(page)) {
250 if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
251 SetPageChecked(page);
252 else {
253 err = do_readpage(page);
254 if (err) {
255 unlock_page(page);
256 page_cache_release(page);
257 return err;
258 }
259 }
260
261 SetPageUptodate(page);
262 ClearPageError(page);
263 }
264
265 if (PagePrivate(page))
266 /*
267 * The page is dirty, which means it was budgeted twice:
268 * o first time the budget was allocated by the task which
269 * made the page dirty and set the PG_private flag;
270 * o and then we budgeted for it for the second time at the
271 * very beginning of this function.
272 *
273 * So what we have to do is to release the page budget we
274 * allocated.
275 */
276 release_new_page_budget(c);
277 else if (!PageChecked(page))
278 /*
279 * We are changing a page which already exists on the media.
280 * This means that changing the page does not make the amount
281 * of indexing information larger, and this part of the budget
282 * which we have already acquired may be released.
283 */
284 ubifs_convert_page_budget(c);
285
286 if (appending) {
287 struct ubifs_inode *ui = ubifs_inode(inode);
288
289 /*
290 * 'ubifs_write_end()' is optimized from the fast-path part of
291 * 'ubifs_write_begin()' and expects the @ui_mutex to be locked
292 * if data is appended.
293 */
294 mutex_lock(&ui->ui_mutex);
295 if (ui->dirty)
296 /*
297 * The inode is dirty already, so we may free the
298 * budget we allocated.
299 */
300 ubifs_release_dirty_inode_budget(c, ui);
301 }
302
303 *pagep = page;
304 return 0;
305}
306
307/**
308 * allocate_budget - allocate budget for 'ubifs_write_begin()'.
309 * @c: UBIFS file-system description object
310 * @page: page to allocate budget for
311 * @ui: UBIFS inode object the page belongs to
312 * @appending: non-zero if the page is appended
313 *
314 * This is a helper function for 'ubifs_write_begin()' which allocates budget
315 * for the operation. The budget is allocated differently depending on whether
316 * this is appending, whether the page is dirty or not, and so on. This
317 * function leaves the @ui->ui_mutex locked in case of appending. Returns zero
318 * in case of success and %-ENOSPC in case of failure.
319 */
320static int allocate_budget(struct ubifs_info *c, struct page *page,
321 struct ubifs_inode *ui, int appending)
322{
323 struct ubifs_budget_req req = { .fast = 1 };
324
325 if (PagePrivate(page)) {
326 if (!appending)
327 /*
328 * The page is dirty and we are not appending, which
329 * means no budget is needed at all.
330 */
331 return 0;
332
333 mutex_lock(&ui->ui_mutex);
334 if (ui->dirty)
335 /*
336 * The page is dirty and we are appending, so the inode
337 * has to be marked as dirty. However, it is already
338 * dirty, so we do not need any budget. We may return,
339 * but @ui->ui_mutex hast to be left locked because we
340 * should prevent write-back from flushing the inode
341 * and freeing the budget. The lock will be released in
342 * 'ubifs_write_end()'.
343 */
344 return 0;
345
346 /*
347 * The page is dirty, we are appending, the inode is clean, so
348 * we need to budget the inode change.
349 */
350 req.dirtied_ino = 1;
351 } else {
352 if (PageChecked(page))
353 /*
354 * The page corresponds to a hole and does not
355 * exist on the media. So changing it makes
356 * make the amount of indexing information
357 * larger, and we have to budget for a new
358 * page.
359 */
360 req.new_page = 1;
361 else
362 /*
363 * Not a hole, the change will not add any new
364 * indexing information, budget for page
365 * change.
366 */
367 req.dirtied_page = 1;
368
369 if (appending) {
370 mutex_lock(&ui->ui_mutex);
371 if (!ui->dirty)
372 /*
373 * The inode is clean but we will have to mark
374 * it as dirty because we are appending. This
375 * needs a budget.
376 */
377 req.dirtied_ino = 1;
378 }
379 }
380
381 return ubifs_budget_space(c, &req);
382}
383
384/*
385 * This function is called when a page of data is going to be written. Since
386 * the page of data will not necessarily go to the flash straight away, UBIFS
387 * has to reserve space on the media for it, which is done by means of
388 * budgeting.
389 *
390 * This is the hot-path of the file-system and we are trying to optimize it as
391 * much as possible. For this reasons it is split on 2 parts - slow and fast.
392 *
393 * There many budgeting cases:
394 * o a new page is appended - we have to budget for a new page and for
395 * changing the inode; however, if the inode is already dirty, there is
396 * no need to budget for it;
397 * o an existing clean page is changed - we have budget for it; if the page
398 * does not exist on the media (a hole), we have to budget for a new
399 * page; otherwise, we may budget for changing an existing page; the
400 * difference between these cases is that changing an existing page does
401 * not introduce anything new to the FS indexing information, so it does
402 * not grow, and smaller budget is acquired in this case;
403 * o an existing dirty page is changed - no need to budget at all, because
404 * the page budget has been acquired by earlier, when the page has been
405 * marked dirty.
406 *
407 * UBIFS budgeting sub-system may force write-back if it thinks there is no
408 * space to reserve. This imposes some locking restrictions and makes it
409 * impossible to take into account the above cases, and makes it impossible to
410 * optimize budgeting.
411 *
412 * The solution for this is that the fast path of 'ubifs_write_begin()' assumes
413 * there is a plenty of flash space and the budget will be acquired quickly,
414 * without forcing write-back. The slow path does not make this assumption.
415 */
416static int ubifs_write_begin(struct file *file, struct address_space *mapping,
417 loff_t pos, unsigned len, unsigned flags,
418 struct page **pagep, void **fsdata)
419{
420 struct inode *inode = mapping->host;
421 struct ubifs_info *c = inode->i_sb->s_fs_info;
422 struct ubifs_inode *ui = ubifs_inode(inode);
423 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
424 int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
425 struct page *page;
426
427
428 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
429
430 if (unlikely(c->ro_media))
431 return -EROFS;
432
433 /* Try out the fast-path part first */
434 page = __grab_cache_page(mapping, index);
435 if (unlikely(!page))
436 return -ENOMEM;
437
438 if (!PageUptodate(page)) {
439 /* The page is not loaded from the flash */
440 if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
441 /*
442 * We change whole page so no need to load it. But we
443 * have to set the @PG_checked flag to make the further
444 * code the page is new. This might be not true, but it
445 * is better to budget more that to read the page from
446 * the media.
447 */
448 SetPageChecked(page);
449 else {
450 err = do_readpage(page);
451 if (err) {
452 unlock_page(page);
453 page_cache_release(page);
454 return err;
455 }
456 }
457
458 SetPageUptodate(page);
459 ClearPageError(page);
460 }
461
462 err = allocate_budget(c, page, ui, appending);
463 if (unlikely(err)) {
464 ubifs_assert(err == -ENOSPC);
465 /*
466 * Budgeting failed which means it would have to force
467 * write-back but didn't, because we set the @fast flag in the
468 * request. Write-back cannot be done now, while we have the
469 * page locked, because it would deadlock. Unlock and free
470 * everything and fall-back to slow-path.
471 */
472 if (appending) {
473 ubifs_assert(mutex_is_locked(&ui->ui_mutex));
474 mutex_unlock(&ui->ui_mutex);
475 }
476 unlock_page(page);
477 page_cache_release(page);
478
479 return write_begin_slow(mapping, pos, len, pagep);
480 }
481
482 /*
483 * Whee, we aquired budgeting quickly - without involving
484 * garbage-collection, committing or forceing write-back. We return
485 * with @ui->ui_mutex locked if we are appending pages, and unlocked
486 * otherwise. This is an optimization (slightly hacky though).
487 */
488 *pagep = page;
489 return 0;
490
491}
492
493/**
494 * cancel_budget - cancel budget.
495 * @c: UBIFS file-system description object
496 * @page: page to cancel budget for
497 * @ui: UBIFS inode object the page belongs to
498 * @appending: non-zero if the page is appended
499 *
500 * This is a helper function for a page write operation. It unlocks the
501 * @ui->ui_mutex in case of appending.
502 */
503static void cancel_budget(struct ubifs_info *c, struct page *page,
504 struct ubifs_inode *ui, int appending)
505{
506 if (appending) {
507 if (!ui->dirty)
508 ubifs_release_dirty_inode_budget(c, ui);
509 mutex_unlock(&ui->ui_mutex);
510 }
511 if (!PagePrivate(page)) {
512 if (PageChecked(page))
513 release_new_page_budget(c);
514 else
515 release_existing_page_budget(c);
516 }
517}
518
519static int ubifs_write_end(struct file *file, struct address_space *mapping,
520 loff_t pos, unsigned len, unsigned copied,
521 struct page *page, void *fsdata)
522{
523 struct inode *inode = mapping->host;
524 struct ubifs_inode *ui = ubifs_inode(inode);
525 struct ubifs_info *c = inode->i_sb->s_fs_info;
526 loff_t end_pos = pos + len;
527 int appending = !!(end_pos > inode->i_size);
528
529 dbg_gen("ino %lu, pos %llu, pg %lu, len %u, copied %d, i_size %lld",
530 inode->i_ino, pos, page->index, len, copied, inode->i_size);
531
532 if (unlikely(copied < len && len == PAGE_CACHE_SIZE)) {
533 /*
534 * VFS copied less data to the page that it intended and
535 * declared in its '->write_begin()' call via the @len
536 * argument. If the page was not up-to-date, and @len was
537 * @PAGE_CACHE_SIZE, the 'ubifs_write_begin()' function did
538 * not load it from the media (for optimization reasons). This
539 * means that part of the page contains garbage. So read the
540 * page now.
541 */
542 dbg_gen("copied %d instead of %d, read page and repeat",
543 copied, len);
544 cancel_budget(c, page, ui, appending);
545
546 /*
547 * Return 0 to force VFS to repeat the whole operation, or the
548 * error code if 'do_readpage()' failes.
549 */
550 copied = do_readpage(page);
551 goto out;
552 }
553
554 if (!PagePrivate(page)) {
555 SetPagePrivate(page);
556 atomic_long_inc(&c->dirty_pg_cnt);
557 __set_page_dirty_nobuffers(page);
558 }
559
560 if (appending) {
561 i_size_write(inode, end_pos);
562 ui->ui_size = end_pos;
563 /*
564 * Note, we do not set @I_DIRTY_PAGES (which means that the
565 * inode has dirty pages), this has been done in
566 * '__set_page_dirty_nobuffers()'.
567 */
568 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
569 ubifs_assert(mutex_is_locked(&ui->ui_mutex));
570 mutex_unlock(&ui->ui_mutex);
571 }
572
573out:
574 unlock_page(page);
575 page_cache_release(page);
576 return copied;
577}
578
/*
 * VFS '->readpage()' implementation: read the page and unlock it. The return
 * value of 'do_readpage()' is deliberately not propagated; this function
 * always reports success to the caller.
 */
static int ubifs_readpage(struct file *file, struct page *page)
{
	(void)do_readpage(page);
	unlock_page(page);

	return 0;
}
585
586static int do_writepage(struct page *page, int len)
587{
588 int err = 0, i, blen;
589 unsigned int block;
590 void *addr;
591 union ubifs_key key;
592 struct inode *inode = page->mapping->host;
593 struct ubifs_info *c = inode->i_sb->s_fs_info;
594
595#ifdef UBIFS_DEBUG
596 spin_lock(&ui->ui_lock);
597 ubifs_assert(page->index <= ui->synced_i_size << PAGE_CACHE_SIZE);
598 spin_unlock(&ui->ui_lock);
599#endif
600
601 /* Update radix tree tags */
602 set_page_writeback(page);
603
604 addr = kmap(page);
605 block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
606 i = 0;
607 while (len) {
608 blen = min_t(int, len, UBIFS_BLOCK_SIZE);
609 data_key_init(c, &key, inode->i_ino, block);
610 err = ubifs_jnl_write_data(c, inode, &key, addr, blen);
611 if (err)
612 break;
613 if (++i >= UBIFS_BLOCKS_PER_PAGE)
614 break;
615 block += 1;
616 addr += blen;
617 len -= blen;
618 }
619 if (err) {
620 SetPageError(page);
621 ubifs_err("cannot write page %lu of inode %lu, error %d",
622 page->index, inode->i_ino, err);
623 ubifs_ro_mode(c, err);
624 }
625
626 ubifs_assert(PagePrivate(page));
627 if (PageChecked(page))
628 release_new_page_budget(c);
629 else
630 release_existing_page_budget(c);
631
632 atomic_long_dec(&c->dirty_pg_cnt);
633 ClearPagePrivate(page);
634 ClearPageChecked(page);
635
636 kunmap(page);
637 unlock_page(page);
638 end_page_writeback(page);
639 return err;
640}
641
642/*
643 * When writing-back dirty inodes, VFS first writes-back pages belonging to the
644 * inode, then the inode itself. For UBIFS this may cause a problem. Consider a
645 * situation when a we have an inode with size 0, then a megabyte of data is
646 * appended to the inode, then write-back starts and flushes some amount of the
647 * dirty pages, the journal becomes full, commit happens and finishes, and then
648 * an unclean reboot happens. When the file system is mounted next time, the
649 * inode size would still be 0, but there would be many pages which are beyond
650 * the inode size, they would be indexed and consume flash space. Because the
651 * journal has been committed, the replay would not be able to detect this
652 * situation and correct the inode size. This means UBIFS would have to scan
653 * whole index and correct all inode sizes, which is long an unacceptable.
654 *
655 * To prevent situations like this, UBIFS writes pages back only if they are
656 * within last synchronized inode size, i.e. the the size which has been
657 * written to the flash media last time. Otherwise, UBIFS forces inode
658 * write-back, thus making sure the on-flash inode contains current inode size,
659 * and then keeps writing pages back.
660 *
661 * Some locking issues explanation. 'ubifs_writepage()' first is called with
662 * the page locked, and it locks @ui_mutex. However, write-back does take inode
663 * @i_mutex, which means other VFS operations may be run on this inode at the
664 * same time. And the problematic one is truncation to smaller size, from where
665 * we have to call 'vmtruncate()', which first changes @inode->i_size, then
666 * drops the truncated pages. And while dropping the pages, it takes the page
667 * lock. This means that 'do_truncation()' cannot call 'vmtruncate()' with
668 * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This
669 * means that @inode->i_size is changed while @ui_mutex is unlocked.
670 *
671 * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond
672 * inode size. How do we do this if @inode->i_size may became smaller while we
673 * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the
674 * @ui->ui_isize "shadow" field which UBIFS uses instead of @inode->i_size
675 * internally and updates it under @ui_mutex.
676 *
677 * Q: why we do not worry that if we race with truncation, we may end up with a
678 * situation when the inode is truncated while we are in the middle of
679 * 'do_writepage()', so we do write beyond inode size?
680 * A: If we are in the middle of 'do_writepage()', truncation would be locked
681 * on the page lock and it would not write the truncated inode node to the
682 * journal before we have finished.
683 */
684static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
685{
686 struct inode *inode = page->mapping->host;
687 struct ubifs_inode *ui = ubifs_inode(inode);
688 loff_t i_size = i_size_read(inode), synced_i_size;
689 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
690 int err, len = i_size & (PAGE_CACHE_SIZE - 1);
691 void *kaddr;
692
693 dbg_gen("ino %lu, pg %lu, pg flags %#lx",
694 inode->i_ino, page->index, page->flags);
695 ubifs_assert(PagePrivate(page));
696
697 /* Is the page fully outside @i_size? (truncate in progress) */
698 if (page->index > end_index || (page->index == end_index && !len)) {
699 err = 0;
700 goto out_unlock;
701 }
702
703 spin_lock(&ui->ui_lock);
704 synced_i_size = ui->synced_i_size;
705 spin_unlock(&ui->ui_lock);
706
707 /* Is the page fully inside @i_size? */
708 if (page->index < end_index) {
709 if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) {
710 err = inode->i_sb->s_op->write_inode(inode, 1);
711 if (err)
712 goto out_unlock;
713 /*
714 * The inode has been written, but the write-buffer has
715 * not been synchronized, so in case of an unclean
716 * reboot we may end up with some pages beyond inode
717 * size, but they would be in the journal (because
718 * commit flushes write buffers) and recovery would deal
719 * with this.
720 */
721 }
722 return do_writepage(page, PAGE_CACHE_SIZE);
723 }
724
725 /*
726 * The page straddles @i_size. It must be zeroed out on each and every
727 * writepage invocation because it may be mmapped. "A file is mapped
728 * in multiples of the page size. For a file that is not a multiple of
729 * the page size, the remaining memory is zeroed when mapped, and
730 * writes to that region are not written out to the file."
731 */
732 kaddr = kmap_atomic(page, KM_USER0);
733 memset(kaddr + len, 0, PAGE_CACHE_SIZE - len);
734 flush_dcache_page(page);
735 kunmap_atomic(kaddr, KM_USER0);
736
737 if (i_size > synced_i_size) {
738 err = inode->i_sb->s_op->write_inode(inode, 1);
739 if (err)
740 goto out_unlock;
741 }
742
743 return do_writepage(page, len);
744
745out_unlock:
746 unlock_page(page);
747 return err;
748}
749
750/**
751 * do_attr_changes - change inode attributes.
752 * @inode: inode to change attributes for
753 * @attr: describes attributes to change
754 */
755static void do_attr_changes(struct inode *inode, const struct iattr *attr)
756{
757 if (attr->ia_valid & ATTR_UID)
758 inode->i_uid = attr->ia_uid;
759 if (attr->ia_valid & ATTR_GID)
760 inode->i_gid = attr->ia_gid;
761 if (attr->ia_valid & ATTR_ATIME)
762 inode->i_atime = timespec_trunc(attr->ia_atime,
763 inode->i_sb->s_time_gran);
764 if (attr->ia_valid & ATTR_MTIME)
765 inode->i_mtime = timespec_trunc(attr->ia_mtime,
766 inode->i_sb->s_time_gran);
767 if (attr->ia_valid & ATTR_CTIME)
768 inode->i_ctime = timespec_trunc(attr->ia_ctime,
769 inode->i_sb->s_time_gran);
770 if (attr->ia_valid & ATTR_MODE) {
771 umode_t mode = attr->ia_mode;
772
773 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
774 mode &= ~S_ISGID;
775 inode->i_mode = mode;
776 }
777}
778
779/**
780 * do_truncation - truncate an inode.
781 * @c: UBIFS file-system description object
782 * @inode: inode to truncate
783 * @attr: inode attribute changes description
784 *
785 * This function implements VFS '->setattr()' call when the inode is truncated
786 * to a smaller size. Returns zero in case of success and a negative error code
787 * in case of failure.
788 */
789static int do_truncation(struct ubifs_info *c, struct inode *inode,
790 const struct iattr *attr)
791{
792 int err;
793 struct ubifs_budget_req req;
794 loff_t old_size = inode->i_size, new_size = attr->ia_size;
795 int offset = new_size & (UBIFS_BLOCK_SIZE - 1);
796 struct ubifs_inode *ui = ubifs_inode(inode);
797
798 dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size);
799 memset(&req, 0, sizeof(struct ubifs_budget_req));
800
801 /*
802 * If this is truncation to a smaller size, and we do not truncate on a
803 * block boundary, budget for changing one data block, because the last
804 * block will be re-written.
805 */
806 if (new_size & (UBIFS_BLOCK_SIZE - 1))
807 req.dirtied_page = 1;
808
809 req.dirtied_ino = 1;
810 /* A funny way to budget for truncation node */
811 req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ;
812 err = ubifs_budget_space(c, &req);
813 if (err)
814 return err;
815
816 err = vmtruncate(inode, new_size);
817 if (err)
818 goto out_budg;
819
820 if (offset) {
821 pgoff_t index = new_size >> PAGE_CACHE_SHIFT;
822 struct page *page;
823
824 page = find_lock_page(inode->i_mapping, index);
825 if (page) {
826 if (PageDirty(page)) {
827 /*
828 * 'ubifs_jnl_truncate()' will try to truncate
829 * the last data node, but it contains
830 * out-of-date data because the page is dirty.
831 * Write the page now, so that
832 * 'ubifs_jnl_truncate()' will see an already
833 * truncated (and up to date) data node.
834 */
835 ubifs_assert(PagePrivate(page));
836
837 clear_page_dirty_for_io(page);
838 if (UBIFS_BLOCKS_PER_PAGE_SHIFT)
839 offset = new_size &
840 (PAGE_CACHE_SIZE - 1);
841 err = do_writepage(page, offset);
842 page_cache_release(page);
843 if (err)
844 goto out_budg;
845 /*
846 * We could now tell 'ubifs_jnl_truncate()' not
847 * to read the last block.
848 */
849 } else {
850 /*
851 * We could 'kmap()' the page and pass the data
852 * to 'ubifs_jnl_truncate()' to save it from
853 * having to read it.
854 */
855 unlock_page(page);
856 page_cache_release(page);
857 }
858 }
859 }
860
861 mutex_lock(&ui->ui_mutex);
862 ui->ui_size = inode->i_size;
863 /* Truncation changes inode [mc]time */
864 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
865 /* The other attributes may be changed at the same time as well */
866 do_attr_changes(inode, attr);
867
868 err = ubifs_jnl_truncate(c, inode, old_size, new_size);
869 mutex_unlock(&ui->ui_mutex);
870out_budg:
871 ubifs_release_budget(c, &req);
872 return err;
873}
874
875/**
876 * do_setattr - change inode attributes.
877 * @c: UBIFS file-system description object
878 * @inode: inode to change attributes for
879 * @attr: inode attribute changes description
880 *
881 * This function implements VFS '->setattr()' call for all cases except
882 * truncations to smaller size. Returns zero in case of success and a negative
883 * error code in case of failure.
884 */
885static int do_setattr(struct ubifs_info *c, struct inode *inode,
886 const struct iattr *attr)
887{
888 int err, release;
889 loff_t new_size = attr->ia_size;
890 struct ubifs_inode *ui = ubifs_inode(inode);
891 struct ubifs_budget_req req = { .dirtied_ino = 1,
892 .dirtied_ino_d = ui->data_len };
893
894 err = ubifs_budget_space(c, &req);
895 if (err)
896 return err;
897
898 if (attr->ia_valid & ATTR_SIZE) {
899 dbg_gen("size %lld -> %lld", inode->i_size, new_size);
900 err = vmtruncate(inode, new_size);
901 if (err)
902 goto out;
903 }
904
905 mutex_lock(&ui->ui_mutex);
906 if (attr->ia_valid & ATTR_SIZE) {
907 /* Truncation changes inode [mc]time */
908 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
909 /* 'vmtruncate()' changed @i_size, update @ui_size */
910 ui->ui_size = inode->i_size;
911 }
912
913 do_attr_changes(inode, attr);
914
915 release = ui->dirty;
916 if (attr->ia_valid & ATTR_SIZE)
917 /*
918 * Inode length changed, so we have to make sure
919 * @I_DIRTY_DATASYNC is set.
920 */
921 __mark_inode_dirty(inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC);
922 else
923 mark_inode_dirty_sync(inode);
924 mutex_unlock(&ui->ui_mutex);
925
926 if (release)
927 ubifs_release_budget(c, &req);
928 if (IS_SYNC(inode))
929 err = inode->i_sb->s_op->write_inode(inode, 1);
930 return err;
931
932out:
933 ubifs_release_budget(c, &req);
934 return err;
935}
936
937int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
938{
939 int err;
940 struct inode *inode = dentry->d_inode;
941 struct ubifs_info *c = inode->i_sb->s_fs_info;
942
943 dbg_gen("ino %lu, ia_valid %#x", inode->i_ino, attr->ia_valid);
944 err = inode_change_ok(inode, attr);
945 if (err)
946 return err;
947
948 err = dbg_check_synced_i_size(inode);
949 if (err)
950 return err;
951
952 if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size < inode->i_size)
953 /* Truncation to a smaller size */
954 err = do_truncation(c, inode, attr);
955 else
956 err = do_setattr(c, inode, attr);
957
958 return err;
959}
960
961static void ubifs_invalidatepage(struct page *page, unsigned long offset)
962{
963 struct inode *inode = page->mapping->host;
964 struct ubifs_info *c = inode->i_sb->s_fs_info;
965
966 ubifs_assert(PagePrivate(page));
967 if (offset)
968 /* Partial page remains dirty */
969 return;
970
971 if (PageChecked(page))
972 release_new_page_budget(c);
973 else
974 release_existing_page_budget(c);
975
976 atomic_long_dec(&c->dirty_pg_cnt);
977 ClearPagePrivate(page);
978 ClearPageChecked(page);
979}
980
981static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd)
982{
983 struct ubifs_inode *ui = ubifs_inode(dentry->d_inode);
984
985 nd_set_link(nd, ui->data);
986 return NULL;
987}
988
989int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync)
990{
991 struct inode *inode = dentry->d_inode;
992 struct ubifs_info *c = inode->i_sb->s_fs_info;
993 int err;
994
995 dbg_gen("syncing inode %lu", inode->i_ino);
996
997 /*
998 * VFS has already synchronized dirty pages for this inode. Synchronize
999 * the inode unless this is a 'datasync()' call.
1000 */
1001 if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) {
1002 err = inode->i_sb->s_op->write_inode(inode, 1);
1003 if (err)
1004 return err;
1005 }
1006
1007 /*
1008 * Nodes related to this inode may still sit in a write-buffer. Flush
1009 * them.
1010 */
1011 err = ubifs_sync_wbufs_by_inode(c, inode);
1012 if (err)
1013 return err;
1014
1015 return 0;
1016}
1017
1018/**
1019 * mctime_update_needed - check if mtime or ctime update is needed.
1020 * @inode: the inode to do the check for
1021 * @now: current time
1022 *
1023 * This helper function checks if the inode mtime/ctime should be updated or
1024 * not. If current values of the time-stamps are within the UBIFS inode time
1025 * granularity, they are not updated. This is an optimization.
1026 */
1027static inline int mctime_update_needed(const struct inode *inode,
1028 const struct timespec *now)
1029{
1030 if (!timespec_equal(&inode->i_mtime, now) ||
1031 !timespec_equal(&inode->i_ctime, now))
1032 return 1;
1033 return 0;
1034}
1035
1036/**
1037 * update_ctime - update mtime and ctime of an inode.
1038 * @c: UBIFS file-system description object
1039 * @inode: inode to update
1040 *
1041 * This function updates mtime and ctime of the inode if it is not equivalent to
1042 * current time. Returns zero in case of success and a negative error code in
1043 * case of failure.
1044 */
1045static int update_mctime(struct ubifs_info *c, struct inode *inode)
1046{
1047 struct timespec now = ubifs_current_time(inode);
1048 struct ubifs_inode *ui = ubifs_inode(inode);
1049
1050 if (mctime_update_needed(inode, &now)) {
1051 int err, release;
1052 struct ubifs_budget_req req = { .dirtied_ino = 1,
1053 .dirtied_ino_d = ui->data_len };
1054
1055 err = ubifs_budget_space(c, &req);
1056 if (err)
1057 return err;
1058
1059 mutex_lock(&ui->ui_mutex);
1060 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
1061 release = ui->dirty;
1062 mark_inode_dirty_sync(inode);
1063 mutex_unlock(&ui->ui_mutex);
1064 if (release)
1065 ubifs_release_budget(c, &req);
1066 }
1067
1068 return 0;
1069}
1070
1071static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov,
1072 unsigned long nr_segs, loff_t pos)
1073{
1074 int err;
1075 ssize_t ret;
1076 struct inode *inode = iocb->ki_filp->f_mapping->host;
1077 struct ubifs_info *c = inode->i_sb->s_fs_info;
1078
1079 err = update_mctime(c, inode);
1080 if (err)
1081 return err;
1082
1083 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
1084 if (ret < 0)
1085 return ret;
1086
1087 if (ret > 0 && (IS_SYNC(inode) || iocb->ki_filp->f_flags & O_SYNC)) {
1088 err = ubifs_sync_wbufs_by_inode(c, inode);
1089 if (err)
1090 return err;
1091 }
1092
1093 return ret;
1094}
1095
/*
 * VFS '->set_page_dirty()' implementation. Forwards to
 * '__set_page_dirty_nobuffers()' and asserts that it returned zero.
 */
static int ubifs_set_page_dirty(struct page *page)
{
	int ret = __set_page_dirty_nobuffers(page);

	/*
	 * An attempt to dirty a page without budgeting for it - should not
	 * happen.
	 */
	ubifs_assert(ret == 0);

	return ret;
}
1108
1109static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
1110{
1111 /*
1112 * An attempt to release a dirty page without budgeting for it - should
1113 * not happen.
1114 */
1115 if (PageWriteback(page))
1116 return 0;
1117 ubifs_assert(PagePrivate(page));
1118 ubifs_assert(0);
1119 ClearPagePrivate(page);
1120 ClearPageChecked(page);
1121 return 1;
1122}
1123
1124/*
1125 * mmap()d file has taken write protection fault and is being made
1126 * writable. UBIFS must ensure page is budgeted for.
1127 */
1128static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
1129{
1130 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1131 struct ubifs_info *c = inode->i_sb->s_fs_info;
1132 struct timespec now = ubifs_current_time(inode);
1133 struct ubifs_budget_req req = { .new_page = 1 };
1134 int err, update_time;
1135
1136 dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index,
1137 i_size_read(inode));
1138 ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY));
1139
1140 if (unlikely(c->ro_media))
1141 return -EROFS;
1142
1143 /*
1144 * We have not locked @page so far so we may budget for changing the
1145 * page. Note, we cannot do this after we locked the page, because
1146 * budgeting may cause write-back which would cause deadlock.
1147 *
1148 * At the moment we do not know whether the page is dirty or not, so we
1149 * assume that it is not and budget for a new page. We could look at
1150 * the @PG_private flag and figure this out, but we may race with write
1151 * back and the page state may change by the time we lock it, so this
1152 * would need additional care. We do not bother with this at the
1153 * moment, although it might be good idea to do. Instead, we allocate
1154 * budget for a new page and amend it later on if the page was in fact
1155 * dirty.
1156 *
1157 * The budgeting-related logic of this function is similar to what we
1158 * do in 'ubifs_write_begin()' and 'ubifs_write_end()'. Glance there
1159 * for more comments.
1160 */
1161 update_time = mctime_update_needed(inode, &now);
1162 if (update_time)
1163 /*
1164 * We have to change inode time stamp which requires extra
1165 * budgeting.
1166 */
1167 req.dirtied_ino = 1;
1168
1169 err = ubifs_budget_space(c, &req);
1170 if (unlikely(err)) {
1171 if (err == -ENOSPC)
1172 ubifs_warn("out of space for mmapped file "
1173 "(inode number %lu)", inode->i_ino);
1174 return err;
1175 }
1176
1177 lock_page(page);
1178 if (unlikely(page->mapping != inode->i_mapping ||
1179 page_offset(page) > i_size_read(inode))) {
1180 /* Page got truncated out from underneath us */
1181 err = -EINVAL;
1182 goto out_unlock;
1183 }
1184
1185 if (PagePrivate(page))
1186 release_new_page_budget(c);
1187 else {
1188 if (!PageChecked(page))
1189 ubifs_convert_page_budget(c);
1190 SetPagePrivate(page);
1191 atomic_long_inc(&c->dirty_pg_cnt);
1192 __set_page_dirty_nobuffers(page);
1193 }
1194
1195 if (update_time) {
1196 int release;
1197 struct ubifs_inode *ui = ubifs_inode(inode);
1198
1199 mutex_lock(&ui->ui_mutex);
1200 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
1201 release = ui->dirty;
1202 mark_inode_dirty_sync(inode);
1203 mutex_unlock(&ui->ui_mutex);
1204 if (release)
1205 ubifs_release_dirty_inode_budget(c, ui);
1206 }
1207
1208 unlock_page(page);
1209 return 0;
1210
1211out_unlock:
1212 unlock_page(page);
1213 ubifs_release_budget(c, &req);
1214 return err;
1215}
1216
1217static struct vm_operations_struct ubifs_file_vm_ops = {
1218 .fault = filemap_fault,
1219 .page_mkwrite = ubifs_vm_page_mkwrite,
1220};
1221
1222static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1223{
1224 int err;
1225
1226 /* 'generic_file_mmap()' takes care of NOMMU case */
1227 err = generic_file_mmap(file, vma);
1228 if (err)
1229 return err;
1230 vma->vm_ops = &ubifs_file_vm_ops;
1231 return 0;
1232}
1233
1234struct address_space_operations ubifs_file_address_operations = {
1235 .readpage = ubifs_readpage,
1236 .writepage = ubifs_writepage,
1237 .write_begin = ubifs_write_begin,
1238 .write_end = ubifs_write_end,
1239 .invalidatepage = ubifs_invalidatepage,
1240 .set_page_dirty = ubifs_set_page_dirty,
1241 .releasepage = ubifs_releasepage,
1242};
1243
1244struct inode_operations ubifs_file_inode_operations = {
1245 .setattr = ubifs_setattr,
1246 .getattr = ubifs_getattr,
1247#ifdef CONFIG_UBIFS_FS_XATTR
1248 .setxattr = ubifs_setxattr,
1249 .getxattr = ubifs_getxattr,
1250 .listxattr = ubifs_listxattr,
1251 .removexattr = ubifs_removexattr,
1252#endif
1253};
1254
1255struct inode_operations ubifs_symlink_inode_operations = {
1256 .readlink = generic_readlink,
1257 .follow_link = ubifs_follow_link,
1258 .setattr = ubifs_setattr,
1259 .getattr = ubifs_getattr,
1260};
1261
1262struct file_operations ubifs_file_operations = {
1263 .llseek = generic_file_llseek,
1264 .read = do_sync_read,
1265 .write = do_sync_write,
1266 .aio_read = generic_file_aio_read,
1267 .aio_write = ubifs_aio_write,
1268 .mmap = ubifs_file_mmap,
1269 .fsync = ubifs_fsync,
1270 .unlocked_ioctl = ubifs_ioctl,
1271 .splice_read = generic_file_splice_read,
1272#ifdef CONFIG_COMPAT
1273 .compat_ioctl = ubifs_compat_ioctl,
1274#endif
1275};
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
new file mode 100644
index 000000000000..10394c548367
--- /dev/null
+++ b/fs/ubifs/find.c
@@ -0,0 +1,975 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/*
24 * This file contains functions for finding LEBs for various purposes e.g.
25 * garbage collection. In general, lprops category heaps and lists are used
26 * for fast access, falling back on scanning the LPT as a last resort.
27 */
28
29#include <linux/sort.h>
30#include "ubifs.h"
31
/**
 * struct scan_data - state shared between an LPT scan and its callback.
 * @min_space: smallest amount of space (in bytes) a LEB must offer to match
 * @pick_free: non-zero if empty LEBs are acceptable results
 * @lnum: set by the callback to the number of the LEB that was found
 * @exclude_index: non-zero if index LEBs must be skipped
 */
struct scan_data {
	int min_space;		/* input */
	int pick_free;		/* input */
	int lnum;		/* output */
	int exclude_index;	/* input */
};
45
46/**
47 * valuable - determine whether LEB properties are valuable.
48 * @c: the UBIFS file-system description object
49 * @lprops: LEB properties
50 *
51 * This function return %1 if the LEB properties should be added to the LEB
52 * properties tree in memory. Otherwise %0 is returned.
53 */
54static int valuable(struct ubifs_info *c, const struct ubifs_lprops *lprops)
55{
56 int n, cat = lprops->flags & LPROPS_CAT_MASK;
57 struct ubifs_lpt_heap *heap;
58
59 switch (cat) {
60 case LPROPS_DIRTY:
61 case LPROPS_DIRTY_IDX:
62 case LPROPS_FREE:
63 heap = &c->lpt_heap[cat - 1];
64 if (heap->cnt < heap->max_cnt)
65 return 1;
66 if (lprops->free + lprops->dirty >= c->dark_wm)
67 return 1;
68 return 0;
69 case LPROPS_EMPTY:
70 n = c->lst.empty_lebs + c->freeable_cnt -
71 c->lst.taken_empty_lebs;
72 if (n < c->lsave_cnt)
73 return 1;
74 return 0;
75 case LPROPS_FREEABLE:
76 return 1;
77 case LPROPS_FRDI_IDX:
78 return 1;
79 }
80 return 0;
81}
82
83/**
84 * scan_for_dirty_cb - dirty space scan callback.
85 * @c: the UBIFS file-system description object
86 * @lprops: LEB properties to scan
87 * @in_tree: whether the LEB properties are in main memory
88 * @data: information passed to and from the caller of the scan
89 *
90 * This function returns a code that indicates whether the scan should continue
91 * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
92 * in main memory (%LPT_SCAN_ADD), or whether the scan should stop
93 * (%LPT_SCAN_STOP).
94 */
95static int scan_for_dirty_cb(struct ubifs_info *c,
96 const struct ubifs_lprops *lprops, int in_tree,
97 struct scan_data *data)
98{
99 int ret = LPT_SCAN_CONTINUE;
100
101 /* Exclude LEBs that are currently in use */
102 if (lprops->flags & LPROPS_TAKEN)
103 return LPT_SCAN_CONTINUE;
104 /* Determine whether to add these LEB properties to the tree */
105 if (!in_tree && valuable(c, lprops))
106 ret |= LPT_SCAN_ADD;
107 /* Exclude LEBs with too little space */
108 if (lprops->free + lprops->dirty < data->min_space)
109 return ret;
110 /* If specified, exclude index LEBs */
111 if (data->exclude_index && lprops->flags & LPROPS_INDEX)
112 return ret;
113 /* If specified, exclude empty or freeable LEBs */
114 if (lprops->free + lprops->dirty == c->leb_size) {
115 if (!data->pick_free)
116 return ret;
117 /* Exclude LEBs with too little dirty space (unless it is empty) */
118 } else if (lprops->dirty < c->dead_wm)
119 return ret;
120 /* Finally we found space */
121 data->lnum = lprops->lnum;
122 return LPT_SCAN_ADD | LPT_SCAN_STOP;
123}
124
125/**
126 * scan_for_dirty - find a data LEB with free space.
127 * @c: the UBIFS file-system description object
128 * @min_space: minimum amount free plus dirty space the returned LEB has to
129 * have
130 * @pick_free: if it is OK to return a free or freeable LEB
131 * @exclude_index: whether to exclude index LEBs
132 *
133 * This function returns a pointer to the LEB properties found or a negative
134 * error code.
135 */
136static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c,
137 int min_space, int pick_free,
138 int exclude_index)
139{
140 const struct ubifs_lprops *lprops;
141 struct ubifs_lpt_heap *heap;
142 struct scan_data data;
143 int err, i;
144
145 /* There may be an LEB with enough dirty space on the free heap */
146 heap = &c->lpt_heap[LPROPS_FREE - 1];
147 for (i = 0; i < heap->cnt; i++) {
148 lprops = heap->arr[i];
149 if (lprops->free + lprops->dirty < min_space)
150 continue;
151 if (lprops->dirty < c->dead_wm)
152 continue;
153 return lprops;
154 }
155 /*
156 * A LEB may have fallen off of the bottom of the dirty heap, and ended
157 * up as uncategorized even though it has enough dirty space for us now,
158 * so check the uncategorized list. N.B. neither empty nor freeable LEBs
159 * can end up as uncategorized because they are kept on lists not
160 * finite-sized heaps.
161 */
162 list_for_each_entry(lprops, &c->uncat_list, list) {
163 if (lprops->flags & LPROPS_TAKEN)
164 continue;
165 if (lprops->free + lprops->dirty < min_space)
166 continue;
167 if (exclude_index && (lprops->flags & LPROPS_INDEX))
168 continue;
169 if (lprops->dirty < c->dead_wm)
170 continue;
171 return lprops;
172 }
173 /* We have looked everywhere in main memory, now scan the flash */
174 if (c->pnodes_have >= c->pnode_cnt)
175 /* All pnodes are in memory, so skip scan */
176 return ERR_PTR(-ENOSPC);
177 data.min_space = min_space;
178 data.pick_free = pick_free;
179 data.lnum = -1;
180 data.exclude_index = exclude_index;
181 err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
182 (ubifs_lpt_scan_callback)scan_for_dirty_cb,
183 &data);
184 if (err)
185 return ERR_PTR(err);
186 ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt);
187 c->lscan_lnum = data.lnum;
188 lprops = ubifs_lpt_lookup_dirty(c, data.lnum);
189 if (IS_ERR(lprops))
190 return lprops;
191 ubifs_assert(lprops->lnum == data.lnum);
192 ubifs_assert(lprops->free + lprops->dirty >= min_space);
193 ubifs_assert(lprops->dirty >= c->dead_wm ||
194 (pick_free &&
195 lprops->free + lprops->dirty == c->leb_size));
196 ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
197 ubifs_assert(!exclude_index || !(lprops->flags & LPROPS_INDEX));
198 return lprops;
199}
200
201/**
202 * ubifs_find_dirty_leb - find a dirty LEB for the Garbage Collector.
203 * @c: the UBIFS file-system description object
204 * @ret_lp: LEB properties are returned here on exit
205 * @min_space: minimum amount free plus dirty space the returned LEB has to
206 * have
207 * @pick_free: controls whether it is OK to pick empty or index LEBs
208 *
209 * This function tries to find a dirty logical eraseblock which has at least
210 * @min_space free and dirty space. It prefers to take an LEB from the dirty or
211 * dirty index heap, and it falls-back to LPT scanning if the heaps are empty
212 * or do not have an LEB which satisfies the @min_space criteria.
213 *
214 * Note:
215 * o LEBs which have less than dead watermark of dirty space are never picked
216 * by this function;
217 *
218 * Returns zero and the LEB properties of
219 * found dirty LEB in case of success, %-ENOSPC if no dirty LEB was found and a
220 * negative error code in case of other failures. The returned LEB is marked as
221 * "taken".
222 *
223 * The additional @pick_free argument controls if this function has to return a
224 * free or freeable LEB if one is present. For example, GC must to set it to %1,
225 * when called from the journal space reservation function, because the
226 * appearance of free space may coincide with the loss of enough dirty space
227 * for GC to succeed anyway.
228 *
229 * In contrast, if the Garbage Collector is called from budgeting, it should
230 * just make free space, not return LEBs which are already free or freeable.
231 *
232 * In addition @pick_free is set to %2 by the recovery process in order to
233 * recover gc_lnum in which case an index LEB must not be returned.
234 */
235int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
236 int min_space, int pick_free)
237{
238 int err = 0, sum, exclude_index = pick_free == 2 ? 1 : 0;
239 const struct ubifs_lprops *lp = NULL, *idx_lp = NULL;
240 struct ubifs_lpt_heap *heap, *idx_heap;
241
242 ubifs_get_lprops(c);
243
244 if (pick_free) {
245 int lebs, rsvd_idx_lebs = 0;
246
247 spin_lock(&c->space_lock);
248 lebs = c->lst.empty_lebs;
249 lebs += c->freeable_cnt - c->lst.taken_empty_lebs;
250
251 /*
252 * Note, the index may consume more LEBs than have been reserved
253 * for it. It is OK because it might be consolidated by GC.
254 * But if the index takes fewer LEBs than it is reserved for it,
255 * this function must avoid picking those reserved LEBs.
256 */
257 if (c->min_idx_lebs >= c->lst.idx_lebs) {
258 rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs;
259 exclude_index = 1;
260 }
261 spin_unlock(&c->space_lock);
262
263 /* Check if there are enough free LEBs for the index */
264 if (rsvd_idx_lebs < lebs) {
265 /* OK, try to find an empty LEB */
266 lp = ubifs_fast_find_empty(c);
267 if (lp)
268 goto found;
269
270 /* Or a freeable LEB */
271 lp = ubifs_fast_find_freeable(c);
272 if (lp)
273 goto found;
274 } else
275 /*
276 * We cannot pick free/freeable LEBs in the below code.
277 */
278 pick_free = 0;
279 } else {
280 spin_lock(&c->space_lock);
281 exclude_index = (c->min_idx_lebs >= c->lst.idx_lebs);
282 spin_unlock(&c->space_lock);
283 }
284
285 /* Look on the dirty and dirty index heaps */
286 heap = &c->lpt_heap[LPROPS_DIRTY - 1];
287 idx_heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
288
289 if (idx_heap->cnt && !exclude_index) {
290 idx_lp = idx_heap->arr[0];
291 sum = idx_lp->free + idx_lp->dirty;
292 /*
293 * Since we reserve twice as more space for the index than it
294 * actually takes, it does not make sense to pick indexing LEBs
295 * with less than half LEB of dirty space.
296 */
297 if (sum < min_space || sum < c->half_leb_size)
298 idx_lp = NULL;
299 }
300
301 if (heap->cnt) {
302 lp = heap->arr[0];
303 if (lp->dirty + lp->free < min_space)
304 lp = NULL;
305 }
306
307 /* Pick the LEB with most space */
308 if (idx_lp && lp) {
309 if (idx_lp->free + idx_lp->dirty >= lp->free + lp->dirty)
310 lp = idx_lp;
311 } else if (idx_lp && !lp)
312 lp = idx_lp;
313
314 if (lp) {
315 ubifs_assert(lp->dirty >= c->dead_wm);
316 goto found;
317 }
318
319 /* Did not find a dirty LEB on the dirty heaps, have to scan */
320 dbg_find("scanning LPT for a dirty LEB");
321 lp = scan_for_dirty(c, min_space, pick_free, exclude_index);
322 if (IS_ERR(lp)) {
323 err = PTR_ERR(lp);
324 goto out;
325 }
326 ubifs_assert(lp->dirty >= c->dead_wm ||
327 (pick_free && lp->free + lp->dirty == c->leb_size));
328
329found:
330 dbg_find("found LEB %d, free %d, dirty %d, flags %#x",
331 lp->lnum, lp->free, lp->dirty, lp->flags);
332
333 lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC,
334 lp->flags | LPROPS_TAKEN, 0);
335 if (IS_ERR(lp)) {
336 err = PTR_ERR(lp);
337 goto out;
338 }
339
340 memcpy(ret_lp, lp, sizeof(struct ubifs_lprops));
341
342out:
343 ubifs_release_lprops(c);
344 return err;
345}
346
347/**
348 * scan_for_free_cb - free space scan callback.
349 * @c: the UBIFS file-system description object
350 * @lprops: LEB properties to scan
351 * @in_tree: whether the LEB properties are in main memory
352 * @data: information passed to and from the caller of the scan
353 *
354 * This function returns a code that indicates whether the scan should continue
355 * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
356 * in main memory (%LPT_SCAN_ADD), or whether the scan should stop
357 * (%LPT_SCAN_STOP).
358 */
359static int scan_for_free_cb(struct ubifs_info *c,
360 const struct ubifs_lprops *lprops, int in_tree,
361 struct scan_data *data)
362{
363 int ret = LPT_SCAN_CONTINUE;
364
365 /* Exclude LEBs that are currently in use */
366 if (lprops->flags & LPROPS_TAKEN)
367 return LPT_SCAN_CONTINUE;
368 /* Determine whether to add these LEB properties to the tree */
369 if (!in_tree && valuable(c, lprops))
370 ret |= LPT_SCAN_ADD;
371 /* Exclude index LEBs */
372 if (lprops->flags & LPROPS_INDEX)
373 return ret;
374 /* Exclude LEBs with too little space */
375 if (lprops->free < data->min_space)
376 return ret;
377 /* If specified, exclude empty LEBs */
378 if (!data->pick_free && lprops->free == c->leb_size)
379 return ret;
380 /*
381 * LEBs that have only free and dirty space must not be allocated
382 * because they may have been unmapped already or they may have data
383 * that is obsolete only because of nodes that are still sitting in a
384 * wbuf.
385 */
386 if (lprops->free + lprops->dirty == c->leb_size && lprops->dirty > 0)
387 return ret;
388 /* Finally we found space */
389 data->lnum = lprops->lnum;
390 return LPT_SCAN_ADD | LPT_SCAN_STOP;
391}
392
393/**
394 * do_find_free_space - find a data LEB with free space.
395 * @c: the UBIFS file-system description object
396 * @min_space: minimum amount of free space required
397 * @pick_free: whether it is OK to scan for empty LEBs
398 * @squeeze: whether to try to find space in a non-empty LEB first
399 *
400 * This function returns a pointer to the LEB properties found or a negative
401 * error code.
402 */
403static
404const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
405 int min_space, int pick_free,
406 int squeeze)
407{
408 const struct ubifs_lprops *lprops;
409 struct ubifs_lpt_heap *heap;
410 struct scan_data data;
411 int err, i;
412
413 if (squeeze) {
414 lprops = ubifs_fast_find_free(c);
415 if (lprops && lprops->free >= min_space)
416 return lprops;
417 }
418 if (pick_free) {
419 lprops = ubifs_fast_find_empty(c);
420 if (lprops)
421 return lprops;
422 }
423 if (!squeeze) {
424 lprops = ubifs_fast_find_free(c);
425 if (lprops && lprops->free >= min_space)
426 return lprops;
427 }
428 /* There may be an LEB with enough free space on the dirty heap */
429 heap = &c->lpt_heap[LPROPS_DIRTY - 1];
430 for (i = 0; i < heap->cnt; i++) {
431 lprops = heap->arr[i];
432 if (lprops->free >= min_space)
433 return lprops;
434 }
435 /*
436 * A LEB may have fallen off of the bottom of the free heap, and ended
437 * up as uncategorized even though it has enough free space for us now,
438 * so check the uncategorized list. N.B. neither empty nor freeable LEBs
439 * can end up as uncategorized because they are kept on lists not
440 * finite-sized heaps.
441 */
442 list_for_each_entry(lprops, &c->uncat_list, list) {
443 if (lprops->flags & LPROPS_TAKEN)
444 continue;
445 if (lprops->flags & LPROPS_INDEX)
446 continue;
447 if (lprops->free >= min_space)
448 return lprops;
449 }
450 /* We have looked everywhere in main memory, now scan the flash */
451 if (c->pnodes_have >= c->pnode_cnt)
452 /* All pnodes are in memory, so skip scan */
453 return ERR_PTR(-ENOSPC);
454 data.min_space = min_space;
455 data.pick_free = pick_free;
456 data.lnum = -1;
457 err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
458 (ubifs_lpt_scan_callback)scan_for_free_cb,
459 &data);
460 if (err)
461 return ERR_PTR(err);
462 ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt);
463 c->lscan_lnum = data.lnum;
464 lprops = ubifs_lpt_lookup_dirty(c, data.lnum);
465 if (IS_ERR(lprops))
466 return lprops;
467 ubifs_assert(lprops->lnum == data.lnum);
468 ubifs_assert(lprops->free >= min_space);
469 ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
470 ubifs_assert(!(lprops->flags & LPROPS_INDEX));
471 return lprops;
472}
473
474/**
475 * ubifs_find_free_space - find a data LEB with free space.
476 * @c: the UBIFS file-system description object
477 * @min_space: minimum amount of required free space
478 * @free: contains amount of free space in the LEB on exit
479 * @squeeze: whether to try to find space in a non-empty LEB first
480 *
481 * This function looks for an LEB with at least @min_space bytes of free space.
482 * It tries to find an empty LEB if possible. If no empty LEBs are available,
483 * this function searches for a non-empty data LEB. The returned LEB is marked
484 * as "taken".
485 *
486 * This function returns found LEB number in case of success, %-ENOSPC if it
487 * failed to find a LEB with @min_space bytes of free space and other a negative
488 * error codes in case of failure.
489 */
490int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
491 int squeeze)
492{
493 const struct ubifs_lprops *lprops;
494 int lebs, rsvd_idx_lebs, pick_free = 0, err, lnum, flags;
495
496 dbg_find("min_space %d", min_space);
497 ubifs_get_lprops(c);
498
499 /* Check if there are enough empty LEBs for commit */
500 spin_lock(&c->space_lock);
501 if (c->min_idx_lebs > c->lst.idx_lebs)
502 rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs;
503 else
504 rsvd_idx_lebs = 0;
505 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
506 c->lst.taken_empty_lebs;
507 ubifs_assert(lebs + c->lst.idx_lebs >= c->min_idx_lebs);
508 if (rsvd_idx_lebs < lebs)
509 /*
510 * OK to allocate an empty LEB, but we still don't want to go
511 * looking for one if there aren't any.
512 */
513 if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) {
514 pick_free = 1;
515 /*
516 * Because we release the space lock, we must account
517 * for this allocation here. After the LEB properties
518 * flags have been updated, we subtract one. Note, the
519 * result of this is that lprops also decreases
520 * @taken_empty_lebs in 'ubifs_change_lp()', so it is
521 * off by one for a short period of time which may
522 * introduce a small disturbance to budgeting
523 * calculations, but this is harmless because at the
524 * worst case this would make the budgeting subsystem
525 * be more pessimistic than needed.
526 *
527 * Fundamentally, this is about serialization of the
528 * budgeting and lprops subsystems. We could make the
529 * @space_lock a mutex and avoid dropping it before
530 * calling 'ubifs_change_lp()', but mutex is more
531 * heavy-weight, and we want budgeting to be as fast as
532 * possible.
533 */
534 c->lst.taken_empty_lebs += 1;
535 }
536 spin_unlock(&c->space_lock);
537
538 lprops = do_find_free_space(c, min_space, pick_free, squeeze);
539 if (IS_ERR(lprops)) {
540 err = PTR_ERR(lprops);
541 goto out;
542 }
543
544 lnum = lprops->lnum;
545 flags = lprops->flags | LPROPS_TAKEN;
546
547 lprops = ubifs_change_lp(c, lprops, LPROPS_NC, LPROPS_NC, flags, 0);
548 if (IS_ERR(lprops)) {
549 err = PTR_ERR(lprops);
550 goto out;
551 }
552
553 if (pick_free) {
554 spin_lock(&c->space_lock);
555 c->lst.taken_empty_lebs -= 1;
556 spin_unlock(&c->space_lock);
557 }
558
559 *free = lprops->free;
560 ubifs_release_lprops(c);
561
562 if (*free == c->leb_size) {
563 /*
564 * Ensure that empty LEBs have been unmapped. They may not have
565 * been, for example, because of an unclean unmount. Also
566 * LEBs that were freeable LEBs (free + dirty == leb_size) will
567 * not have been unmapped.
568 */
569 err = ubifs_leb_unmap(c, lnum);
570 if (err)
571 return err;
572 }
573
574 dbg_find("found LEB %d, free %d", lnum, *free);
575 ubifs_assert(*free >= min_space);
576 return lnum;
577
578out:
579 if (pick_free) {
580 spin_lock(&c->space_lock);
581 c->lst.taken_empty_lebs -= 1;
582 spin_unlock(&c->space_lock);
583 }
584 ubifs_release_lprops(c);
585 return err;
586}
587
588/**
589 * scan_for_idx_cb - callback used by the scan for a free LEB for the index.
590 * @c: the UBIFS file-system description object
591 * @lprops: LEB properties to scan
592 * @in_tree: whether the LEB properties are in main memory
593 * @data: information passed to and from the caller of the scan
594 *
595 * This function returns a code that indicates whether the scan should continue
596 * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
597 * in main memory (%LPT_SCAN_ADD), or whether the scan should stop
598 * (%LPT_SCAN_STOP).
599 */
600static int scan_for_idx_cb(struct ubifs_info *c,
601 const struct ubifs_lprops *lprops, int in_tree,
602 struct scan_data *data)
603{
604 int ret = LPT_SCAN_CONTINUE;
605
606 /* Exclude LEBs that are currently in use */
607 if (lprops->flags & LPROPS_TAKEN)
608 return LPT_SCAN_CONTINUE;
609 /* Determine whether to add these LEB properties to the tree */
610 if (!in_tree && valuable(c, lprops))
611 ret |= LPT_SCAN_ADD;
612 /* Exclude index LEBS */
613 if (lprops->flags & LPROPS_INDEX)
614 return ret;
615 /* Exclude LEBs that cannot be made empty */
616 if (lprops->free + lprops->dirty != c->leb_size)
617 return ret;
618 /*
619 * We are allocating for the index so it is safe to allocate LEBs with
620 * only free and dirty space, because write buffers are sync'd at commit
621 * start.
622 */
623 data->lnum = lprops->lnum;
624 return LPT_SCAN_ADD | LPT_SCAN_STOP;
625}
626
627/**
628 * scan_for_leb_for_idx - scan for a free LEB for the index.
629 * @c: the UBIFS file-system description object
630 */
631static const struct ubifs_lprops *scan_for_leb_for_idx(struct ubifs_info *c)
632{
633 struct ubifs_lprops *lprops;
634 struct scan_data data;
635 int err;
636
637 data.lnum = -1;
638 err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
639 (ubifs_lpt_scan_callback)scan_for_idx_cb,
640 &data);
641 if (err)
642 return ERR_PTR(err);
643 ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt);
644 c->lscan_lnum = data.lnum;
645 lprops = ubifs_lpt_lookup_dirty(c, data.lnum);
646 if (IS_ERR(lprops))
647 return lprops;
648 ubifs_assert(lprops->lnum == data.lnum);
649 ubifs_assert(lprops->free + lprops->dirty == c->leb_size);
650 ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
651 ubifs_assert(!(lprops->flags & LPROPS_INDEX));
652 return lprops;
653}
654
655/**
656 * ubifs_find_free_leb_for_idx - find a free LEB for the index.
657 * @c: the UBIFS file-system description object
658 *
659 * This function looks for a free LEB and returns that LEB number. The returned
660 * LEB is marked as "taken", "index".
661 *
662 * Only empty LEBs are allocated. This is for two reasons. First, the commit
663 * calculates the number of LEBs to allocate based on the assumption that they
664 * will be empty. Secondly, free space at the end of an index LEB is not
665 * guaranteed to be empty because it may have been used by the in-the-gaps
666 * method prior to an unclean unmount.
667 *
668 * If no LEB is found %-ENOSPC is returned. For other failures another negative
669 * error code is returned.
670 */
671int ubifs_find_free_leb_for_idx(struct ubifs_info *c)
672{
673 const struct ubifs_lprops *lprops;
674 int lnum = -1, err, flags;
675
676 ubifs_get_lprops(c);
677
678 lprops = ubifs_fast_find_empty(c);
679 if (!lprops) {
680 lprops = ubifs_fast_find_freeable(c);
681 if (!lprops) {
682 ubifs_assert(c->freeable_cnt == 0);
683 if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) {
684 lprops = scan_for_leb_for_idx(c);
685 if (IS_ERR(lprops)) {
686 err = PTR_ERR(lprops);
687 goto out;
688 }
689 }
690 }
691 }
692
693 if (!lprops) {
694 err = -ENOSPC;
695 goto out;
696 }
697
698 lnum = lprops->lnum;
699
700 dbg_find("found LEB %d, free %d, dirty %d, flags %#x",
701 lnum, lprops->free, lprops->dirty, lprops->flags);
702
703 flags = lprops->flags | LPROPS_TAKEN | LPROPS_INDEX;
704 lprops = ubifs_change_lp(c, lprops, c->leb_size, 0, flags, 0);
705 if (IS_ERR(lprops)) {
706 err = PTR_ERR(lprops);
707 goto out;
708 }
709
710 ubifs_release_lprops(c);
711
712 /*
713 * Ensure that empty LEBs have been unmapped. They may not have been,
714 * for example, because of an unclean unmount. Also LEBs that were
715 * freeable LEBs (free + dirty == leb_size) will not have been unmapped.
716 */
717 err = ubifs_leb_unmap(c, lnum);
718 if (err) {
719 ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
720 LPROPS_TAKEN | LPROPS_INDEX, 0);
721 return err;
722 }
723
724 return lnum;
725
726out:
727 ubifs_release_lprops(c);
728 return err;
729}
730
731static int cmp_dirty_idx(const struct ubifs_lprops **a,
732 const struct ubifs_lprops **b)
733{
734 const struct ubifs_lprops *lpa = *a;
735 const struct ubifs_lprops *lpb = *b;
736
737 return lpa->dirty + lpa->free - lpb->dirty - lpb->free;
738}
739
/*
 * Swap callback handed to 'sort()' by 'ubifs_save_dirty_idx_lnums()'. It
 * exchanges the two LEB properties pointers; @size is not used.
 */
static void swap_dirty_idx(struct ubifs_lprops **a, struct ubifs_lprops **b,
			   int size)
{
	struct ubifs_lprops *saved_b = *b;

	*b = *a;
	*a = saved_b;
}
748
749/**
750 * ubifs_save_dirty_idx_lnums - save an array of the most dirty index LEB nos.
751 * @c: the UBIFS file-system description object
752 *
753 * This function is called each commit to create an array of LEB numbers of
754 * dirty index LEBs sorted in order of dirty and free space. This is used by
755 * the in-the-gaps method of TNC commit.
756 */
757int ubifs_save_dirty_idx_lnums(struct ubifs_info *c)
758{
759 int i;
760
761 ubifs_get_lprops(c);
762 /* Copy the LPROPS_DIRTY_IDX heap */
763 c->dirty_idx.cnt = c->lpt_heap[LPROPS_DIRTY_IDX - 1].cnt;
764 memcpy(c->dirty_idx.arr, c->lpt_heap[LPROPS_DIRTY_IDX - 1].arr,
765 sizeof(void *) * c->dirty_idx.cnt);
766 /* Sort it so that the dirtiest is now at the end */
767 sort(c->dirty_idx.arr, c->dirty_idx.cnt, sizeof(void *),
768 (int (*)(const void *, const void *))cmp_dirty_idx,
769 (void (*)(void *, void *, int))swap_dirty_idx);
770 dbg_find("found %d dirty index LEBs", c->dirty_idx.cnt);
771 if (c->dirty_idx.cnt)
772 dbg_find("dirtiest index LEB is %d with dirty %d and free %d",
773 c->dirty_idx.arr[c->dirty_idx.cnt - 1]->lnum,
774 c->dirty_idx.arr[c->dirty_idx.cnt - 1]->dirty,
775 c->dirty_idx.arr[c->dirty_idx.cnt - 1]->free);
776 /* Replace the lprops pointers with LEB numbers */
777 for (i = 0; i < c->dirty_idx.cnt; i++)
778 c->dirty_idx.arr[i] = (void *)(size_t)c->dirty_idx.arr[i]->lnum;
779 ubifs_release_lprops(c);
780 return 0;
781}
782
783/**
784 * scan_dirty_idx_cb - callback used by the scan for a dirty index LEB.
785 * @c: the UBIFS file-system description object
786 * @lprops: LEB properties to scan
787 * @in_tree: whether the LEB properties are in main memory
788 * @data: information passed to and from the caller of the scan
789 *
790 * This function returns a code that indicates whether the scan should continue
791 * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
792 * in main memory (%LPT_SCAN_ADD), or whether the scan should stop
793 * (%LPT_SCAN_STOP).
794 */
795static int scan_dirty_idx_cb(struct ubifs_info *c,
796 const struct ubifs_lprops *lprops, int in_tree,
797 struct scan_data *data)
798{
799 int ret = LPT_SCAN_CONTINUE;
800
801 /* Exclude LEBs that are currently in use */
802 if (lprops->flags & LPROPS_TAKEN)
803 return LPT_SCAN_CONTINUE;
804 /* Determine whether to add these LEB properties to the tree */
805 if (!in_tree && valuable(c, lprops))
806 ret |= LPT_SCAN_ADD;
807 /* Exclude non-index LEBs */
808 if (!(lprops->flags & LPROPS_INDEX))
809 return ret;
810 /* Exclude LEBs with too little space */
811 if (lprops->free + lprops->dirty < c->min_idx_node_sz)
812 return ret;
813 /* Finally we found space */
814 data->lnum = lprops->lnum;
815 return LPT_SCAN_ADD | LPT_SCAN_STOP;
816}
817
818/**
819 * find_dirty_idx_leb - find a dirty index LEB.
820 * @c: the UBIFS file-system description object
821 *
822 * This function returns LEB number upon success and a negative error code upon
823 * failure. In particular, -ENOSPC is returned if a dirty index LEB is not
824 * found.
825 *
826 * Note that this function scans the entire LPT but it is called very rarely.
827 */
828static int find_dirty_idx_leb(struct ubifs_info *c)
829{
830 const struct ubifs_lprops *lprops;
831 struct ubifs_lpt_heap *heap;
832 struct scan_data data;
833 int err, i, ret;
834
835 /* Check all structures in memory first */
836 data.lnum = -1;
837 heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
838 for (i = 0; i < heap->cnt; i++) {
839 lprops = heap->arr[i];
840 ret = scan_dirty_idx_cb(c, lprops, 1, &data);
841 if (ret & LPT_SCAN_STOP)
842 goto found;
843 }
844 list_for_each_entry(lprops, &c->frdi_idx_list, list) {
845 ret = scan_dirty_idx_cb(c, lprops, 1, &data);
846 if (ret & LPT_SCAN_STOP)
847 goto found;
848 }
849 list_for_each_entry(lprops, &c->uncat_list, list) {
850 ret = scan_dirty_idx_cb(c, lprops, 1, &data);
851 if (ret & LPT_SCAN_STOP)
852 goto found;
853 }
854 if (c->pnodes_have >= c->pnode_cnt)
855 /* All pnodes are in memory, so skip scan */
856 return -ENOSPC;
857 err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
858 (ubifs_lpt_scan_callback)scan_dirty_idx_cb,
859 &data);
860 if (err)
861 return err;
862found:
863 ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt);
864 c->lscan_lnum = data.lnum;
865 lprops = ubifs_lpt_lookup_dirty(c, data.lnum);
866 if (IS_ERR(lprops))
867 return PTR_ERR(lprops);
868 ubifs_assert(lprops->lnum == data.lnum);
869 ubifs_assert(lprops->free + lprops->dirty >= c->min_idx_node_sz);
870 ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
871 ubifs_assert((lprops->flags & LPROPS_INDEX));
872
873 dbg_find("found dirty LEB %d, free %d, dirty %d, flags %#x",
874 lprops->lnum, lprops->free, lprops->dirty, lprops->flags);
875
876 lprops = ubifs_change_lp(c, lprops, LPROPS_NC, LPROPS_NC,
877 lprops->flags | LPROPS_TAKEN, 0);
878 if (IS_ERR(lprops))
879 return PTR_ERR(lprops);
880
881 return lprops->lnum;
882}
883
884/**
885 * get_idx_gc_leb - try to get a LEB number from trivial GC.
886 * @c: the UBIFS file-system description object
887 */
888static int get_idx_gc_leb(struct ubifs_info *c)
889{
890 const struct ubifs_lprops *lp;
891 int err, lnum;
892
893 err = ubifs_get_idx_gc_leb(c);
894 if (err < 0)
895 return err;
896 lnum = err;
897 /*
898 * The LEB was due to be unmapped after the commit but
899 * it is needed now for this commit.
900 */
901 lp = ubifs_lpt_lookup_dirty(c, lnum);
902 if (unlikely(IS_ERR(lp)))
903 return PTR_ERR(lp);
904 lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC,
905 lp->flags | LPROPS_INDEX, -1);
906 if (unlikely(IS_ERR(lp)))
907 return PTR_ERR(lp);
908 dbg_find("LEB %d, dirty %d and free %d flags %#x",
909 lp->lnum, lp->dirty, lp->free, lp->flags);
910 return lnum;
911}
912
913/**
914 * find_dirtiest_idx_leb - find dirtiest index LEB from dirtiest array.
915 * @c: the UBIFS file-system description object
916 */
917static int find_dirtiest_idx_leb(struct ubifs_info *c)
918{
919 const struct ubifs_lprops *lp;
920 int lnum;
921
922 while (1) {
923 if (!c->dirty_idx.cnt)
924 return -ENOSPC;
925 /* The lprops pointers were replaced by LEB numbers */
926 lnum = (size_t)c->dirty_idx.arr[--c->dirty_idx.cnt];
927 lp = ubifs_lpt_lookup(c, lnum);
928 if (IS_ERR(lp))
929 return PTR_ERR(lp);
930 if ((lp->flags & LPROPS_TAKEN) || !(lp->flags & LPROPS_INDEX))
931 continue;
932 lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC,
933 lp->flags | LPROPS_TAKEN, 0);
934 if (IS_ERR(lp))
935 return PTR_ERR(lp);
936 break;
937 }
938 dbg_find("LEB %d, dirty %d and free %d flags %#x", lp->lnum, lp->dirty,
939 lp->free, lp->flags);
940 ubifs_assert(lp->flags | LPROPS_TAKEN);
941 ubifs_assert(lp->flags | LPROPS_INDEX);
942 return lnum;
943}
944
945/**
946 * ubifs_find_dirty_idx_leb - try to find dirtiest index LEB as at last commit.
947 * @c: the UBIFS file-system description object
948 *
949 * This function attempts to find an untaken index LEB with the most free and
950 * dirty space that can be used without overwriting index nodes that were in the
951 * last index committed.
952 */
953int ubifs_find_dirty_idx_leb(struct ubifs_info *c)
954{
955 int err;
956
957 ubifs_get_lprops(c);
958
959 /*
960 * We made an array of the dirtiest index LEB numbers as at the start of
961 * last commit. Try that array first.
962 */
963 err = find_dirtiest_idx_leb(c);
964
965 /* Next try scanning the entire LPT */
966 if (err == -ENOSPC)
967 err = find_dirty_idx_leb(c);
968
969 /* Finally take any index LEBs awaiting trivial GC */
970 if (err == -ENOSPC)
971 err = get_idx_gc_leb(c);
972
973 ubifs_release_lprops(c);
974 return err;
975}
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
new file mode 100644
index 000000000000..d0f3dac29081
--- /dev/null
+++ b/fs/ubifs/gc.c
@@ -0,0 +1,773 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Adrian Hunter
20 * Artem Bityutskiy (Битюцкий Артём)
21 */
22
23/*
24 * This file implements garbage collection. The procedure for garbage collection
25 * is different depending on whether a LEB is an index LEB (contains index
26 * nodes) or not. For non-index LEBs, garbage collection finds a LEB which
27 * contains a lot of dirty space (obsolete nodes), and copies the non-obsolete
28 * nodes to the journal, at which point the garbage-collected LEB is free to be
29 * reused. For index LEBs, garbage collection marks the non-obsolete index nodes
30 * dirty in the TNC, and after the next commit, the garbage-collected LEB is
31 * to be reused. Garbage collection will cause the number of dirty index nodes
32 * to grow, however sufficient space is reserved for the index to ensure the
33 * commit will never run out of space.
34 */
35
36#include <linux/pagemap.h>
37#include "ubifs.h"
38
39/*
40 * GC tries to optimize the way it fits nodes to available space, and it sorts
41 * nodes a little. The below constants are watermarks which define "large",
42 * "medium", and "small" nodes.
43 */
44#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4)
45#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ
46
47/*
48 * GC may need to move more than one LEB to make progress. The below constants
49 * define "soft" and "hard" limits on the number of LEBs the garbage collector
50 * may move.
51 */
52#define SOFT_LEBS_LIMIT 4
53#define HARD_LEBS_LIMIT 32
54
55/**
56 * switch_gc_head - switch the garbage collection journal head.
57 * @c: UBIFS file-system description object
58 * @buf: buffer to write
59 * @len: length of the buffer to write
60 * @lnum: LEB number written is returned here
61 * @offs: offset written is returned here
62 *
63 * This function switch the GC head to the next LEB which is reserved in
64 * @c->gc_lnum. Returns %0 in case of success, %-EAGAIN if commit is required,
65 * and other negative error code in case of failures.
66 */
67static int switch_gc_head(struct ubifs_info *c)
68{
69 int err, gc_lnum = c->gc_lnum;
70 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
71
72 ubifs_assert(gc_lnum != -1);
73 dbg_gc("switch GC head from LEB %d:%d to LEB %d (waste %d bytes)",
74 wbuf->lnum, wbuf->offs + wbuf->used, gc_lnum,
75 c->leb_size - wbuf->offs - wbuf->used);
76
77 err = ubifs_wbuf_sync_nolock(wbuf);
78 if (err)
79 return err;
80
81 /*
82 * The GC write-buffer was synchronized, we may safely unmap
83 * 'c->gc_lnum'.
84 */
85 err = ubifs_leb_unmap(c, gc_lnum);
86 if (err)
87 return err;
88
89 err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0);
90 if (err)
91 return err;
92
93 c->gc_lnum = -1;
94 err = ubifs_wbuf_seek_nolock(wbuf, gc_lnum, 0, UBI_LONGTERM);
95 return err;
96}
97
98/**
99 * move_nodes - move nodes.
100 * @c: UBIFS file-system description object
101 * @sleb: describes nodes to move
102 *
103 * This function moves valid nodes from data LEB described by @sleb to the GC
104 * journal head. The obsolete nodes are dropped.
105 *
106 * When moving nodes we have to deal with classical bin-packing problem: the
107 * space in the current GC journal head LEB and in @c->gc_lnum are the "bins",
108 * where the nodes in the @sleb->nodes list are the elements which should be
109 * fit optimally to the bins. This function uses the "first fit decreasing"
110 * strategy, although it does not really sort the nodes but just split them on
111 * 3 classes - large, medium, and small, so they are roughly sorted.
112 *
113 * This function returns zero in case of success, %-EAGAIN if commit is
114 * required, and other negative error codes in case of other failures.
115 */
116static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
117{
118 struct ubifs_scan_node *snod, *tmp;
119 struct list_head large, medium, small;
120 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
121 int avail, err, min = INT_MAX;
122
123 INIT_LIST_HEAD(&large);
124 INIT_LIST_HEAD(&medium);
125 INIT_LIST_HEAD(&small);
126
127 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
128 struct list_head *lst;
129
130 ubifs_assert(snod->type != UBIFS_IDX_NODE);
131 ubifs_assert(snod->type != UBIFS_REF_NODE);
132 ubifs_assert(snod->type != UBIFS_CS_NODE);
133
134 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
135 snod->offs, 0);
136 if (err < 0)
137 goto out;
138
139 lst = &snod->list;
140 list_del(lst);
141 if (!err) {
142 /* The node is obsolete, remove it from the list */
143 kfree(snod);
144 continue;
145 }
146
147 /*
148 * Sort the list of nodes so that large nodes go first, and
149 * small nodes go last.
150 */
151 if (snod->len > MEDIUM_NODE_WM)
152 list_add(lst, &large);
153 else if (snod->len > SMALL_NODE_WM)
154 list_add(lst, &medium);
155 else
156 list_add(lst, &small);
157
158 /* And find the smallest node */
159 if (snod->len < min)
160 min = snod->len;
161 }
162
163 /*
164 * Join the tree lists so that we'd have one roughly sorted list
165 * ('large' will be the head of the joined list).
166 */
167 list_splice(&medium, large.prev);
168 list_splice(&small, large.prev);
169
170 if (wbuf->lnum == -1) {
171 /*
172 * The GC journal head is not set, because it is the first GC
173 * invocation since mount.
174 */
175 err = switch_gc_head(c);
176 if (err)
177 goto out;
178 }
179
180 /* Write nodes to their new location. Use the first-fit strategy */
181 while (1) {
182 avail = c->leb_size - wbuf->offs - wbuf->used;
183 list_for_each_entry_safe(snod, tmp, &large, list) {
184 int new_lnum, new_offs;
185
186 if (avail < min)
187 break;
188
189 if (snod->len > avail)
190 /* This node does not fit */
191 continue;
192
193 cond_resched();
194
195 new_lnum = wbuf->lnum;
196 new_offs = wbuf->offs + wbuf->used;
197 err = ubifs_wbuf_write_nolock(wbuf, snod->node,
198 snod->len);
199 if (err)
200 goto out;
201 err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
202 snod->offs, new_lnum, new_offs,
203 snod->len);
204 if (err)
205 goto out;
206
207 avail = c->leb_size - wbuf->offs - wbuf->used;
208 list_del(&snod->list);
209 kfree(snod);
210 }
211
212 if (list_empty(&large))
213 break;
214
215 /*
216 * Waste the rest of the space in the LEB and switch to the
217 * next LEB.
218 */
219 err = switch_gc_head(c);
220 if (err)
221 goto out;
222 }
223
224 return 0;
225
226out:
227 list_for_each_entry_safe(snod, tmp, &large, list) {
228 list_del(&snod->list);
229 kfree(snod);
230 }
231 return err;
232}
233
234/**
235 * gc_sync_wbufs - sync write-buffers for GC.
236 * @c: UBIFS file-system description object
237 *
238 * We must guarantee that obsoleting nodes are on flash. Unfortunately they may
239 * be in a write-buffer instead. That is, a node could be written to a
240 * write-buffer, obsoleting another node in a LEB that is GC'd. If that LEB is
241 * erased before the write-buffer is sync'd and then there is an unclean
242 * unmount, then an existing node is lost. To avoid this, we sync all
243 * write-buffers.
244 *
245 * This function returns %0 on success or a negative error code on failure.
246 */
247static int gc_sync_wbufs(struct ubifs_info *c)
248{
249 int err, i;
250
251 for (i = 0; i < c->jhead_cnt; i++) {
252 if (i == GCHD)
253 continue;
254 err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
255 if (err)
256 return err;
257 }
258 return 0;
259}
260
261/**
262 * ubifs_garbage_collect_leb - garbage-collect a logical eraseblock.
263 * @c: UBIFS file-system description object
264 * @lp: describes the LEB to garbage collect
265 *
266 * This function garbage-collects an LEB and returns one of the @LEB_FREED,
267 * @LEB_RETAINED, etc positive codes in case of success, %-EAGAIN if commit is
268 * required, and other negative error codes in case of failures.
269 */
270int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
271{
272 struct ubifs_scan_leb *sleb;
273 struct ubifs_scan_node *snod;
274 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
275 int err = 0, lnum = lp->lnum;
276
277 ubifs_assert(c->gc_lnum != -1 || wbuf->offs + wbuf->used == 0 ||
278 c->need_recovery);
279 ubifs_assert(c->gc_lnum != lnum);
280 ubifs_assert(wbuf->lnum != lnum);
281
282 /*
283 * We scan the entire LEB even though we only really need to scan up to
284 * (c->leb_size - lp->free).
285 */
286 sleb = ubifs_scan(c, lnum, 0, c->sbuf);
287 if (IS_ERR(sleb))
288 return PTR_ERR(sleb);
289
290 ubifs_assert(!list_empty(&sleb->nodes));
291 snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);
292
293 if (snod->type == UBIFS_IDX_NODE) {
294 struct ubifs_gced_idx_leb *idx_gc;
295
296 dbg_gc("indexing LEB %d (free %d, dirty %d)",
297 lnum, lp->free, lp->dirty);
298 list_for_each_entry(snod, &sleb->nodes, list) {
299 struct ubifs_idx_node *idx = snod->node;
300 int level = le16_to_cpu(idx->level);
301
302 ubifs_assert(snod->type == UBIFS_IDX_NODE);
303 key_read(c, ubifs_idx_key(c, idx), &snod->key);
304 err = ubifs_dirty_idx_node(c, &snod->key, level, lnum,
305 snod->offs);
306 if (err)
307 goto out;
308 }
309
310 idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS);
311 if (!idx_gc) {
312 err = -ENOMEM;
313 goto out;
314 }
315
316 idx_gc->lnum = lnum;
317 idx_gc->unmap = 0;
318 list_add(&idx_gc->list, &c->idx_gc);
319
320 /*
321 * Don't release the LEB until after the next commit, because
322 * it may contain date which is needed for recovery. So
323 * although we freed this LEB, it will become usable only after
324 * the commit.
325 */
326 err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0,
327 LPROPS_INDEX, 1);
328 if (err)
329 goto out;
330 err = LEB_FREED_IDX;
331 } else {
332 dbg_gc("data LEB %d (free %d, dirty %d)",
333 lnum, lp->free, lp->dirty);
334
335 err = move_nodes(c, sleb);
336 if (err)
337 goto out;
338
339 err = gc_sync_wbufs(c);
340 if (err)
341 goto out;
342
343 err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0);
344 if (err)
345 goto out;
346
347 if (c->gc_lnum == -1) {
348 c->gc_lnum = lnum;
349 err = LEB_RETAINED;
350 } else {
351 err = ubifs_wbuf_sync_nolock(wbuf);
352 if (err)
353 goto out;
354
355 err = ubifs_leb_unmap(c, lnum);
356 if (err)
357 goto out;
358
359 err = LEB_FREED;
360 }
361 }
362
363out:
364 ubifs_scan_destroy(sleb);
365 return err;
366}
367
368/**
369 * ubifs_garbage_collect - UBIFS garbage collector.
370 * @c: UBIFS file-system description object
371 * @anyway: do GC even if there are free LEBs
372 *
373 * This function does out-of-place garbage collection. The return codes are:
374 * o positive LEB number if the LEB has been freed and may be used;
375 * o %-EAGAIN if the caller has to run commit;
376 * o %-ENOSPC if GC failed to make any progress;
377 * o other negative error codes in case of other errors.
378 *
379 * Garbage collector writes data to the journal when GC'ing data LEBs, and just
380 * marking indexing nodes dirty when GC'ing indexing LEBs. Thus, at some point
381 * commit may be required. But commit cannot be run from inside GC, because the
382 * caller might be holding the commit lock, so %-EAGAIN is returned instead;
383 * And this error code means that the caller has to run commit, and re-run GC
384 * if there is still no free space.
385 *
386 * There are many reasons why this function may return %-EAGAIN:
387 * o the log is full and there is no space to write an LEB reference for
388 * @c->gc_lnum;
389 * o the journal is too large and exceeds size limitations;
390 * o GC moved indexing LEBs, but they can be used only after the commit;
391 * o the shrinker fails to find clean znodes to free and requests the commit;
392 * o etc.
393 *
394 * Note, if the file-system is close to be full, this function may return
395 * %-EAGAIN infinitely, so the caller has to limit amount of re-invocations of
396 * the function. E.g., this happens if the limits on the journal size are too
397 * tough and GC writes too much to the journal before an LEB is freed. This
398 * might also mean that the journal is too large, and the TNC becomes to big,
399 * so that the shrinker is constantly called, finds not clean znodes to free,
400 * and requests commit. Well, this may also happen if the journal is all right,
401 * but another kernel process consumes too much memory. Anyway, infinite
402 * %-EAGAIN may happen, but in some extreme/misconfiguration cases.
403 */
404int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
405{
406 int i, err, ret, min_space = c->dead_wm;
407 struct ubifs_lprops lp;
408 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
409
410 ubifs_assert_cmt_locked(c);
411
412 if (ubifs_gc_should_commit(c))
413 return -EAGAIN;
414
415 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
416
417 if (c->ro_media) {
418 ret = -EROFS;
419 goto out_unlock;
420 }
421
422 /* We expect the write-buffer to be empty on entry */
423 ubifs_assert(!wbuf->used);
424
425 for (i = 0; ; i++) {
426 int space_before = c->leb_size - wbuf->offs - wbuf->used;
427 int space_after;
428
429 cond_resched();
430
431 /* Give the commit an opportunity to run */
432 if (ubifs_gc_should_commit(c)) {
433 ret = -EAGAIN;
434 break;
435 }
436
437 if (i > SOFT_LEBS_LIMIT && !list_empty(&c->idx_gc)) {
438 /*
439 * We've done enough iterations. Indexing LEBs were
440 * moved and will be available after the commit.
441 */
442 dbg_gc("soft limit, some index LEBs GC'ed, -EAGAIN");
443 ubifs_commit_required(c);
444 ret = -EAGAIN;
445 break;
446 }
447
448 if (i > HARD_LEBS_LIMIT) {
449 /*
450 * We've moved too many LEBs and have not made
451 * progress, give up.
452 */
453 dbg_gc("hard limit, -ENOSPC");
454 ret = -ENOSPC;
455 break;
456 }
457
458 /*
459 * Empty and freeable LEBs can turn up while we waited for
460 * the wbuf lock, or while we have been running GC. In that
461 * case, we should just return one of those instead of
462 * continuing to GC dirty LEBs. Hence we request
463 * 'ubifs_find_dirty_leb()' to return an empty LEB if it can.
464 */
465 ret = ubifs_find_dirty_leb(c, &lp, min_space, anyway ? 0 : 1);
466 if (ret) {
467 if (ret == -ENOSPC)
468 dbg_gc("no more dirty LEBs");
469 break;
470 }
471
472 dbg_gc("found LEB %d: free %d, dirty %d, sum %d "
473 "(min. space %d)", lp.lnum, lp.free, lp.dirty,
474 lp.free + lp.dirty, min_space);
475
476 if (lp.free + lp.dirty == c->leb_size) {
477 /* An empty LEB was returned */
478 dbg_gc("LEB %d is free, return it", lp.lnum);
479 /*
480 * ubifs_find_dirty_leb() doesn't return freeable index
481 * LEBs.
482 */
483 ubifs_assert(!(lp.flags & LPROPS_INDEX));
484 if (lp.free != c->leb_size) {
485 /*
486 * Write buffers must be sync'd before
487 * unmapping freeable LEBs, because one of them
488 * may contain data which obsoletes something
489 * in 'lp.pnum'.
490 */
491 ret = gc_sync_wbufs(c);
492 if (ret)
493 goto out;
494 ret = ubifs_change_one_lp(c, lp.lnum,
495 c->leb_size, 0, 0, 0,
496 0);
497 if (ret)
498 goto out;
499 }
500 ret = ubifs_leb_unmap(c, lp.lnum);
501 if (ret)
502 goto out;
503 ret = lp.lnum;
504 break;
505 }
506
507 space_before = c->leb_size - wbuf->offs - wbuf->used;
508 if (wbuf->lnum == -1)
509 space_before = 0;
510
511 ret = ubifs_garbage_collect_leb(c, &lp);
512 if (ret < 0) {
513 if (ret == -EAGAIN || ret == -ENOSPC) {
514 /*
515 * These codes are not errors, so we have to
516 * return the LEB to lprops. But if the
517 * 'ubifs_return_leb()' function fails, its
518 * failure code is propagated to the caller
519 * instead of the original '-EAGAIN' or
520 * '-ENOSPC'.
521 */
522 err = ubifs_return_leb(c, lp.lnum);
523 if (err)
524 ret = err;
525 break;
526 }
527 goto out;
528 }
529
530 if (ret == LEB_FREED) {
531 /* An LEB has been freed and is ready for use */
532 dbg_gc("LEB %d freed, return", lp.lnum);
533 ret = lp.lnum;
534 break;
535 }
536
537 if (ret == LEB_FREED_IDX) {
538 /*
539 * This was an indexing LEB and it cannot be
540 * immediately used. And instead of requesting the
541 * commit straight away, we try to garbage collect some
542 * more.
543 */
544 dbg_gc("indexing LEB %d freed, continue", lp.lnum);
545 continue;
546 }
547
548 ubifs_assert(ret == LEB_RETAINED);
549 space_after = c->leb_size - wbuf->offs - wbuf->used;
550 dbg_gc("LEB %d retained, freed %d bytes", lp.lnum,
551 space_after - space_before);
552
553 if (space_after > space_before) {
554 /* GC makes progress, keep working */
555 min_space >>= 1;
556 if (min_space < c->dead_wm)
557 min_space = c->dead_wm;
558 continue;
559 }
560
561 dbg_gc("did not make progress");
562
563 /*
564 * GC moved an LEB bud have not done any progress. This means
565 * that the previous GC head LEB contained too few free space
566 * and the LEB which was GC'ed contained only large nodes which
567 * did not fit that space.
568 *
569 * We can do 2 things:
570 * 1. pick another LEB in a hope it'll contain a small node
571 * which will fit the space we have at the end of current GC
572 * head LEB, but there is no guarantee, so we try this out
573 * unless we have already been working for too long;
574 * 2. request an LEB with more dirty space, which will force
575 * 'ubifs_find_dirty_leb()' to start scanning the lprops
576 * table, instead of just picking one from the heap
577 * (previously it already picked the dirtiest LEB).
578 */
579 if (i < SOFT_LEBS_LIMIT) {
580 dbg_gc("try again");
581 continue;
582 }
583
584 min_space <<= 1;
585 if (min_space > c->dark_wm)
586 min_space = c->dark_wm;
587 dbg_gc("set min. space to %d", min_space);
588 }
589
590 if (ret == -ENOSPC && !list_empty(&c->idx_gc)) {
591 dbg_gc("no space, some index LEBs GC'ed, -EAGAIN");
592 ubifs_commit_required(c);
593 ret = -EAGAIN;
594 }
595
596 err = ubifs_wbuf_sync_nolock(wbuf);
597 if (!err)
598 err = ubifs_leb_unmap(c, c->gc_lnum);
599 if (err) {
600 ret = err;
601 goto out;
602 }
603out_unlock:
604 mutex_unlock(&wbuf->io_mutex);
605 return ret;
606
607out:
608 ubifs_assert(ret < 0);
609 ubifs_assert(ret != -ENOSPC && ret != -EAGAIN);
610 ubifs_ro_mode(c, ret);
611 ubifs_wbuf_sync_nolock(wbuf);
612 mutex_unlock(&wbuf->io_mutex);
613 ubifs_return_leb(c, lp.lnum);
614 return ret;
615}
616
617/**
618 * ubifs_gc_start_commit - garbage collection at start of commit.
619 * @c: UBIFS file-system description object
620 *
621 * If a LEB has only dirty and free space, then we may safely unmap it and make
622 * it free. Note, we cannot do this with indexing LEBs because dirty space may
623 * correspond index nodes that are required for recovery. In that case, the
624 * LEB cannot be unmapped until after the next commit.
625 *
626 * This function returns %0 upon success and a negative error code upon failure.
627 */
628int ubifs_gc_start_commit(struct ubifs_info *c)
629{
630 struct ubifs_gced_idx_leb *idx_gc;
631 const struct ubifs_lprops *lp;
632 int err = 0, flags;
633
634 ubifs_get_lprops(c);
635
636 /*
637 * Unmap (non-index) freeable LEBs. Note that recovery requires that all
638 * wbufs are sync'd before this, which is done in 'do_commit()'.
639 */
640 while (1) {
641 lp = ubifs_fast_find_freeable(c);
642 if (unlikely(IS_ERR(lp))) {
643 err = PTR_ERR(lp);
644 goto out;
645 }
646 if (!lp)
647 break;
648 ubifs_assert(!(lp->flags & LPROPS_TAKEN));
649 ubifs_assert(!(lp->flags & LPROPS_INDEX));
650 err = ubifs_leb_unmap(c, lp->lnum);
651 if (err)
652 goto out;
653 lp = ubifs_change_lp(c, lp, c->leb_size, 0, lp->flags, 0);
654 if (unlikely(IS_ERR(lp))) {
655 err = PTR_ERR(lp);
656 goto out;
657 }
658 ubifs_assert(!(lp->flags & LPROPS_TAKEN));
659 ubifs_assert(!(lp->flags & LPROPS_INDEX));
660 }
661
662 /* Mark GC'd index LEBs OK to unmap after this commit finishes */
663 list_for_each_entry(idx_gc, &c->idx_gc, list)
664 idx_gc->unmap = 1;
665
666 /* Record index freeable LEBs for unmapping after commit */
667 while (1) {
668 lp = ubifs_fast_find_frdi_idx(c);
669 if (unlikely(IS_ERR(lp))) {
670 err = PTR_ERR(lp);
671 goto out;
672 }
673 if (!lp)
674 break;
675 idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS);
676 if (!idx_gc) {
677 err = -ENOMEM;
678 goto out;
679 }
680 ubifs_assert(!(lp->flags & LPROPS_TAKEN));
681 ubifs_assert(lp->flags & LPROPS_INDEX);
682 /* Don't release the LEB until after the next commit */
683 flags = (lp->flags | LPROPS_TAKEN) ^ LPROPS_INDEX;
684 lp = ubifs_change_lp(c, lp, c->leb_size, 0, flags, 1);
685 if (unlikely(IS_ERR(lp))) {
686 err = PTR_ERR(lp);
687 kfree(idx_gc);
688 goto out;
689 }
690 ubifs_assert(lp->flags & LPROPS_TAKEN);
691 ubifs_assert(!(lp->flags & LPROPS_INDEX));
692 idx_gc->lnum = lp->lnum;
693 idx_gc->unmap = 1;
694 list_add(&idx_gc->list, &c->idx_gc);
695 }
696out:
697 ubifs_release_lprops(c);
698 return err;
699}
700
701/**
702 * ubifs_gc_end_commit - garbage collection at end of commit.
703 * @c: UBIFS file-system description object
704 *
705 * This function completes out-of-place garbage collection of index LEBs.
706 */
707int ubifs_gc_end_commit(struct ubifs_info *c)
708{
709 struct ubifs_gced_idx_leb *idx_gc, *tmp;
710 struct ubifs_wbuf *wbuf;
711 int err = 0;
712
713 wbuf = &c->jheads[GCHD].wbuf;
714 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
715 list_for_each_entry_safe(idx_gc, tmp, &c->idx_gc, list)
716 if (idx_gc->unmap) {
717 dbg_gc("LEB %d", idx_gc->lnum);
718 err = ubifs_leb_unmap(c, idx_gc->lnum);
719 if (err)
720 goto out;
721 err = ubifs_change_one_lp(c, idx_gc->lnum, LPROPS_NC,
722 LPROPS_NC, 0, LPROPS_TAKEN, -1);
723 if (err)
724 goto out;
725 list_del(&idx_gc->list);
726 kfree(idx_gc);
727 }
728out:
729 mutex_unlock(&wbuf->io_mutex);
730 return err;
731}
732
733/**
734 * ubifs_destroy_idx_gc - destroy idx_gc list.
735 * @c: UBIFS file-system description object
736 *
737 * This function destroys the idx_gc list. It is called when unmounting or
738 * remounting read-only so locks are not needed.
739 */
740void ubifs_destroy_idx_gc(struct ubifs_info *c)
741{
742 while (!list_empty(&c->idx_gc)) {
743 struct ubifs_gced_idx_leb *idx_gc;
744
745 idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb,
746 list);
747 c->idx_gc_cnt -= 1;
748 list_del(&idx_gc->list);
749 kfree(idx_gc);
750 }
751
752}
753
754/**
755 * ubifs_get_idx_gc_leb - get a LEB from GC'd index LEB list.
756 * @c: UBIFS file-system description object
757 *
758 * Called during start commit so locks are not needed.
759 */
760int ubifs_get_idx_gc_leb(struct ubifs_info *c)
761{
762 struct ubifs_gced_idx_leb *idx_gc;
763 int lnum;
764
765 if (list_empty(&c->idx_gc))
766 return -ENOSPC;
767 idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb, list);
768 lnum = idx_gc->lnum;
769 /* c->idx_gc_cnt is updated by the caller when lprops are updated */
770 list_del(&idx_gc->list);
771 kfree(idx_gc);
772 return lnum;
773}
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
new file mode 100644
index 000000000000..3374f91b6709
--- /dev/null
+++ b/fs/ubifs/io.c
@@ -0,0 +1,914 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 * Copyright (C) 2006, 2007 University of Szeged, Hungary
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License version 2 as published by
9 * the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc., 51
18 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Authors: Artem Bityutskiy (Битюцкий Артём)
21 * Adrian Hunter
22 * Zoltan Sogor
23 */
24
25/*
26 * This file implements UBIFS I/O subsystem which provides various I/O-related
27 * helper functions (reading/writing/checking/validating nodes) and implements
28 * write-buffering support. Write buffers help to save space which otherwise
29 * would have been wasted for padding to the nearest minimal I/O unit boundary.
30 * Instead, data first goes to the write-buffer and is flushed when the
31 * buffer is full or when it is not used for some time (by timer). This is
32 * similar to the mechanism used by JFFS2.
33 *
34 * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by
35 * mutexes defined inside these objects. Since sometimes upper-level code
36 * has to lock the write-buffer (e.g. journal space reservation code), many
37 * functions related to write-buffers have "nolock" suffix which means that the
38 * caller has to lock the write-buffer before calling this function.
39 *
40 * UBIFS stores nodes at 64 bit-aligned addresses. If the node length is not
41 * aligned, UBIFS starts the next node from the aligned address, and the padded
42 * bytes may contain any rubbish. In other words, UBIFS does not put padding
43 * bytes in those small gaps. Common headers of nodes store real node lengths,
44 * not aligned lengths. Indexing nodes also store real lengths in branches.
45 *
46 * UBIFS uses padding when it pads to the next min. I/O unit. In this case it
47 * uses padding nodes or padding bytes, if the padding node does not fit.
48 *
49 * All UBIFS nodes are protected by CRC checksums and UBIFS checks all nodes
50 * every time they are read from the flash media.
51 */
52
53#include <linux/crc32.h>
54#include "ubifs.h"
55
56/**
57 * ubifs_check_node - check node.
58 * @c: UBIFS file-system description object
59 * @buf: node to check
60 * @lnum: logical eraseblock number
61 * @offs: offset within the logical eraseblock
62 * @quiet: print no messages
63 *
64 * This function checks node magic number and CRC checksum. This function also
65 * validates node length to prevent UBIFS from becoming crazy when an attacker
66 * feeds it a file-system image with incorrect nodes. For example, too large
67 * node length in the common header could cause UBIFS to read memory outside of
68 * allocated buffer when checking the CRC checksum.
69 *
70 * This function returns zero in case of success %-EUCLEAN in case of bad CRC
71 * or magic.
72 */
73int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
74 int offs, int quiet)
75{
76 int err = -EINVAL, type, node_len;
77 uint32_t crc, node_crc, magic;
78 const struct ubifs_ch *ch = buf;
79
80 ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
81 ubifs_assert(!(offs & 7) && offs < c->leb_size);
82
83 magic = le32_to_cpu(ch->magic);
84 if (magic != UBIFS_NODE_MAGIC) {
85 if (!quiet)
86 ubifs_err("bad magic %#08x, expected %#08x",
87 magic, UBIFS_NODE_MAGIC);
88 err = -EUCLEAN;
89 goto out;
90 }
91
92 type = ch->node_type;
93 if (type < 0 || type >= UBIFS_NODE_TYPES_CNT) {
94 if (!quiet)
95 ubifs_err("bad node type %d", type);
96 goto out;
97 }
98
99 node_len = le32_to_cpu(ch->len);
100 if (node_len + offs > c->leb_size)
101 goto out_len;
102
103 if (c->ranges[type].max_len == 0) {
104 if (node_len != c->ranges[type].len)
105 goto out_len;
106 } else if (node_len < c->ranges[type].min_len ||
107 node_len > c->ranges[type].max_len)
108 goto out_len;
109
110 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
111 node_crc = le32_to_cpu(ch->crc);
112 if (crc != node_crc) {
113 if (!quiet)
114 ubifs_err("bad CRC: calculated %#08x, read %#08x",
115 crc, node_crc);
116 err = -EUCLEAN;
117 goto out;
118 }
119
120 return 0;
121
122out_len:
123 if (!quiet)
124 ubifs_err("bad node length %d", node_len);
125out:
126 if (!quiet) {
127 ubifs_err("bad node at LEB %d:%d", lnum, offs);
128 dbg_dump_node(c, buf);
129 dbg_dump_stack();
130 }
131 return err;
132}
133
134/**
135 * ubifs_pad - pad flash space.
136 * @c: UBIFS file-system description object
137 * @buf: buffer to put padding to
138 * @pad: how many bytes to pad
139 *
140 * The flash media obliges us to write only in chunks of %c->min_io_size and
141 * when we have to write less data we add padding node to the write-buffer and
142 * pad it to the next minimal I/O unit's boundary. Padding nodes help when the
143 * media is being scanned. If the amount of wasted space is not enough to fit a
144 * padding node which takes %UBIFS_PAD_NODE_SZ bytes, we write padding bytes
145 * pattern (%UBIFS_PADDING_BYTE).
146 *
147 * Padding nodes are also used to fill gaps when the "commit-in-gaps" method is
148 * used.
149 */
150void ubifs_pad(const struct ubifs_info *c, void *buf, int pad)
151{
152 uint32_t crc;
153
154 ubifs_assert(pad >= 0 && !(pad & 7));
155
156 if (pad >= UBIFS_PAD_NODE_SZ) {
157 struct ubifs_ch *ch = buf;
158 struct ubifs_pad_node *pad_node = buf;
159
160 ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC);
161 ch->node_type = UBIFS_PAD_NODE;
162 ch->group_type = UBIFS_NO_NODE_GROUP;
163 ch->padding[0] = ch->padding[1] = 0;
164 ch->sqnum = 0;
165 ch->len = cpu_to_le32(UBIFS_PAD_NODE_SZ);
166 pad -= UBIFS_PAD_NODE_SZ;
167 pad_node->pad_len = cpu_to_le32(pad);
168 crc = crc32(UBIFS_CRC32_INIT, buf + 8, UBIFS_PAD_NODE_SZ - 8);
169 ch->crc = cpu_to_le32(crc);
170 memset(buf + UBIFS_PAD_NODE_SZ, 0, pad);
171 } else if (pad > 0)
172 /* Too little space, padding node won't fit */
173 memset(buf, UBIFS_PADDING_BYTE, pad);
174}
175
176/**
177 * next_sqnum - get next sequence number.
178 * @c: UBIFS file-system description object
179 */
180static unsigned long long next_sqnum(struct ubifs_info *c)
181{
182 unsigned long long sqnum;
183
184 spin_lock(&c->cnt_lock);
185 sqnum = ++c->max_sqnum;
186 spin_unlock(&c->cnt_lock);
187
188 if (unlikely(sqnum >= SQNUM_WARN_WATERMARK)) {
189 if (sqnum >= SQNUM_WATERMARK) {
190 ubifs_err("sequence number overflow %llu, end of life",
191 sqnum);
192 ubifs_ro_mode(c, -EINVAL);
193 }
194 ubifs_warn("running out of sequence numbers, end of life soon");
195 }
196
197 return sqnum;
198}
199
200/**
201 * ubifs_prepare_node - prepare node to be written to flash.
202 * @c: UBIFS file-system description object
203 * @node: the node to pad
204 * @len: node length
205 * @pad: if the buffer has to be padded
206 *
207 * This function prepares node at @node to be written to the media - it
208 * calculates node CRC, fills the common header, and adds proper padding up to
209 * the next minimum I/O unit if @pad is not zero.
210 */
211void ubifs_prepare_node(struct ubifs_info *c, void *node, int len, int pad)
212{
213 uint32_t crc;
214 struct ubifs_ch *ch = node;
215 unsigned long long sqnum = next_sqnum(c);
216
217 ubifs_assert(len >= UBIFS_CH_SZ);
218
219 ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC);
220 ch->len = cpu_to_le32(len);
221 ch->group_type = UBIFS_NO_NODE_GROUP;
222 ch->sqnum = cpu_to_le64(sqnum);
223 ch->padding[0] = ch->padding[1] = 0;
224 crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8);
225 ch->crc = cpu_to_le32(crc);
226
227 if (pad) {
228 len = ALIGN(len, 8);
229 pad = ALIGN(len, c->min_io_size) - len;
230 ubifs_pad(c, node + len, pad);
231 }
232}
233
234/**
235 * ubifs_prep_grp_node - prepare node of a group to be written to flash.
236 * @c: UBIFS file-system description object
237 * @node: the node to pad
238 * @len: node length
239 * @last: indicates the last node of the group
240 *
241 * This function prepares node at @node to be written to the media - it
242 * calculates node CRC and fills the common header.
243 */
244void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last)
245{
246 uint32_t crc;
247 struct ubifs_ch *ch = node;
248 unsigned long long sqnum = next_sqnum(c);
249
250 ubifs_assert(len >= UBIFS_CH_SZ);
251
252 ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC);
253 ch->len = cpu_to_le32(len);
254 if (last)
255 ch->group_type = UBIFS_LAST_OF_NODE_GROUP;
256 else
257 ch->group_type = UBIFS_IN_NODE_GROUP;
258 ch->sqnum = cpu_to_le64(sqnum);
259 ch->padding[0] = ch->padding[1] = 0;
260 crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8);
261 ch->crc = cpu_to_le32(crc);
262}
263
264/**
265 * wbuf_timer_callback - write-buffer timer callback function.
266 * @data: timer data (write-buffer descriptor)
267 *
268 * This function is called when the write-buffer timer expires.
269 */
270static void wbuf_timer_callback_nolock(unsigned long data)
271{
272 struct ubifs_wbuf *wbuf = (struct ubifs_wbuf *)data;
273
274 wbuf->need_sync = 1;
275 wbuf->c->need_wbuf_sync = 1;
276 ubifs_wake_up_bgt(wbuf->c);
277}
278
279/**
280 * new_wbuf_timer - start new write-buffer timer.
281 * @wbuf: write-buffer descriptor
282 */
283static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
284{
285 ubifs_assert(!timer_pending(&wbuf->timer));
286
287 if (!wbuf->timeout)
288 return;
289
290 wbuf->timer.expires = jiffies + wbuf->timeout;
291 add_timer(&wbuf->timer);
292}
293
294/**
295 * cancel_wbuf_timer - cancel write-buffer timer.
296 * @wbuf: write-buffer descriptor
297 */
298static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
299{
300 /*
301 * If the syncer is waiting for the lock (from the background thread's
302 * context) and another task is changing write-buffer then the syncing
303 * should be canceled.
304 */
305 wbuf->need_sync = 0;
306 del_timer(&wbuf->timer);
307}
308
309/**
310 * ubifs_wbuf_sync_nolock - synchronize write-buffer.
311 * @wbuf: write-buffer to synchronize
312 *
313 * This function synchronizes write-buffer @buf and returns zero in case of
314 * success or a negative error code in case of failure.
315 */
316int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
317{
318 struct ubifs_info *c = wbuf->c;
319 int err, dirt;
320
321 cancel_wbuf_timer_nolock(wbuf);
322 if (!wbuf->used || wbuf->lnum == -1)
323 /* Write-buffer is empty or not seeked */
324 return 0;
325
326 dbg_io("LEB %d:%d, %d bytes",
327 wbuf->lnum, wbuf->offs, wbuf->used);
328 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
329 ubifs_assert(!(wbuf->avail & 7));
330 ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);
331
332 if (c->ro_media)
333 return -EROFS;
334
335 ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail);
336 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
337 c->min_io_size, wbuf->dtype);
338 if (err) {
339 ubifs_err("cannot write %d bytes to LEB %d:%d",
340 c->min_io_size, wbuf->lnum, wbuf->offs);
341 dbg_dump_stack();
342 return err;
343 }
344
345 dirt = wbuf->avail;
346
347 spin_lock(&wbuf->lock);
348 wbuf->offs += c->min_io_size;
349 wbuf->avail = c->min_io_size;
350 wbuf->used = 0;
351 wbuf->next_ino = 0;
352 spin_unlock(&wbuf->lock);
353
354 if (wbuf->sync_callback)
355 err = wbuf->sync_callback(c, wbuf->lnum,
356 c->leb_size - wbuf->offs, dirt);
357 return err;
358}
359
360/**
361 * ubifs_wbuf_seek_nolock - seek write-buffer.
362 * @wbuf: write-buffer
363 * @lnum: logical eraseblock number to seek to
364 * @offs: logical eraseblock offset to seek to
365 * @dtype: data type
366 *
367 * This function targets the write buffer to logical eraseblock @lnum:@offs.
368 * The write-buffer is synchronized if it is not empty. Returns zero in case of
369 * success and a negative error code in case of failure.
370 */
371int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
372 int dtype)
373{
374 const struct ubifs_info *c = wbuf->c;
375
376 dbg_io("LEB %d:%d", lnum, offs);
377 ubifs_assert(lnum >= 0 && lnum < c->leb_cnt);
378 ubifs_assert(offs >= 0 && offs <= c->leb_size);
379 ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7));
380 ubifs_assert(lnum != wbuf->lnum);
381
382 if (wbuf->used > 0) {
383 int err = ubifs_wbuf_sync_nolock(wbuf);
384
385 if (err)
386 return err;
387 }
388
389 spin_lock(&wbuf->lock);
390 wbuf->lnum = lnum;
391 wbuf->offs = offs;
392 wbuf->avail = c->min_io_size;
393 wbuf->used = 0;
394 spin_unlock(&wbuf->lock);
395 wbuf->dtype = dtype;
396
397 return 0;
398}
399
400/**
401 * ubifs_bg_wbufs_sync - synchronize write-buffers.
402 * @c: UBIFS file-system description object
403 *
404 * This function is called by background thread to synchronize write-buffers.
405 * Returns zero in case of success and a negative error code in case of
406 * failure.
407 */
408int ubifs_bg_wbufs_sync(struct ubifs_info *c)
409{
410 int err, i;
411
412 if (!c->need_wbuf_sync)
413 return 0;
414 c->need_wbuf_sync = 0;
415
416 if (c->ro_media) {
417 err = -EROFS;
418 goto out_timers;
419 }
420
421 dbg_io("synchronize");
422 for (i = 0; i < c->jhead_cnt; i++) {
423 struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf;
424
425 cond_resched();
426
427 /*
428 * If the mutex is locked then wbuf is being changed, so
429 * synchronization is not necessary.
430 */
431 if (mutex_is_locked(&wbuf->io_mutex))
432 continue;
433
434 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
435 if (!wbuf->need_sync) {
436 mutex_unlock(&wbuf->io_mutex);
437 continue;
438 }
439
440 err = ubifs_wbuf_sync_nolock(wbuf);
441 mutex_unlock(&wbuf->io_mutex);
442 if (err) {
443 ubifs_err("cannot sync write-buffer, error %d", err);
444 ubifs_ro_mode(c, err);
445 goto out_timers;
446 }
447 }
448
449 return 0;
450
451out_timers:
452 /* Cancel all timers to prevent repeated errors */
453 for (i = 0; i < c->jhead_cnt; i++) {
454 struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf;
455
456 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
457 cancel_wbuf_timer_nolock(wbuf);
458 mutex_unlock(&wbuf->io_mutex);
459 }
460 return err;
461}
462
463/**
464 * ubifs_wbuf_write_nolock - write data to flash via write-buffer.
465 * @wbuf: write-buffer
466 * @buf: node to write
467 * @len: node length
468 *
469 * This function writes data to flash via write-buffer @wbuf. This means that
470 * the last piece of the node won't reach the flash media immediately if it
471 * does not take whole minimal I/O unit. Instead, the node will sit in RAM
472 * until the write-buffer is synchronized (e.g., by timer).
473 *
474 * This function returns zero in case of success and a negative error code in
475 * case of failure. If the node cannot be written because there is no more
476 * space in this logical eraseblock, %-ENOSPC is returned.
477 */
478int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
479{
480 struct ubifs_info *c = wbuf->c;
481 int err, written, n, aligned_len = ALIGN(len, 8), offs;
482
483 dbg_io("%d bytes (%s) to wbuf at LEB %d:%d", len,
484 dbg_ntype(((struct ubifs_ch *)buf)->node_type), wbuf->lnum,
485 wbuf->offs + wbuf->used);
486 ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt);
487 ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0);
488 ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
489 ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size);
490 ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
491
492 if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
493 err = -ENOSPC;
494 goto out;
495 }
496
497 cancel_wbuf_timer_nolock(wbuf);
498
499 if (c->ro_media)
500 return -EROFS;
501
502 if (aligned_len <= wbuf->avail) {
503 /*
504 * The node is not very large and fits entirely within
505 * write-buffer.
506 */
507 memcpy(wbuf->buf + wbuf->used, buf, len);
508
509 if (aligned_len == wbuf->avail) {
510 dbg_io("flush wbuf to LEB %d:%d", wbuf->lnum,
511 wbuf->offs);
512 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf,
513 wbuf->offs, c->min_io_size,
514 wbuf->dtype);
515 if (err)
516 goto out;
517
518 spin_lock(&wbuf->lock);
519 wbuf->offs += c->min_io_size;
520 wbuf->avail = c->min_io_size;
521 wbuf->used = 0;
522 wbuf->next_ino = 0;
523 spin_unlock(&wbuf->lock);
524 } else {
525 spin_lock(&wbuf->lock);
526 wbuf->avail -= aligned_len;
527 wbuf->used += aligned_len;
528 spin_unlock(&wbuf->lock);
529 }
530
531 goto exit;
532 }
533
534 /*
535 * The node is large enough and does not fit entirely within current
536 * minimal I/O unit. We have to fill and flush write-buffer and switch
537 * to the next min. I/O unit.
538 */
539 dbg_io("flush wbuf to LEB %d:%d", wbuf->lnum, wbuf->offs);
540 memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
541 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
542 c->min_io_size, wbuf->dtype);
543 if (err)
544 goto out;
545
546 offs = wbuf->offs + c->min_io_size;
547 len -= wbuf->avail;
548 aligned_len -= wbuf->avail;
549 written = wbuf->avail;
550
551 /*
552 * The remaining data may take more whole min. I/O units, so write the
553 * remains multiple to min. I/O unit size directly to the flash media.
554 * We align node length to 8-byte boundary because we anyway flash wbuf
555 * if the remaining space is less than 8 bytes.
556 */
557 n = aligned_len >> c->min_io_shift;
558 if (n) {
559 n <<= c->min_io_shift;
560 dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs);
561 err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n,
562 wbuf->dtype);
563 if (err)
564 goto out;
565 offs += n;
566 aligned_len -= n;
567 len -= n;
568 written += n;
569 }
570
571 spin_lock(&wbuf->lock);
572 if (aligned_len)
573 /*
574 * And now we have what's left and what does not take whole
575 * min. I/O unit, so write it to the write-buffer and we are
576 * done.
577 */
578 memcpy(wbuf->buf, buf + written, len);
579
580 wbuf->offs = offs;
581 wbuf->used = aligned_len;
582 wbuf->avail = c->min_io_size - aligned_len;
583 wbuf->next_ino = 0;
584 spin_unlock(&wbuf->lock);
585
586exit:
587 if (wbuf->sync_callback) {
588 int free = c->leb_size - wbuf->offs - wbuf->used;
589
590 err = wbuf->sync_callback(c, wbuf->lnum, free, 0);
591 if (err)
592 goto out;
593 }
594
595 if (wbuf->used)
596 new_wbuf_timer_nolock(wbuf);
597
598 return 0;
599
600out:
601 ubifs_err("cannot write %d bytes to LEB %d:%d, error %d",
602 len, wbuf->lnum, wbuf->offs, err);
603 dbg_dump_node(c, buf);
604 dbg_dump_stack();
605 dbg_dump_leb(c, wbuf->lnum);
606 return err;
607}
608
609/**
610 * ubifs_write_node - write node to the media.
611 * @c: UBIFS file-system description object
612 * @buf: the node to write
613 * @len: node length
614 * @lnum: logical eraseblock number
615 * @offs: offset within the logical eraseblock
616 * @dtype: node life-time hint (%UBI_LONGTERM, %UBI_SHORTTERM, %UBI_UNKNOWN)
617 *
618 * This function automatically fills node magic number, assigns sequence
619 * number, and calculates node CRC checksum. The length of the @buf buffer has
620 * to be aligned to the minimal I/O unit size. This function automatically
621 * appends padding node and padding bytes if needed. Returns zero in case of
622 * success and a negative error code in case of failure.
623 */
624int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
625 int offs, int dtype)
626{
627 int err, buf_len = ALIGN(len, c->min_io_size);
628
629 dbg_io("LEB %d:%d, %s, length %d (aligned %d)",
630 lnum, offs, dbg_ntype(((struct ubifs_ch *)buf)->node_type), len,
631 buf_len);
632 ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
633 ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size);
634
635 if (c->ro_media)
636 return -EROFS;
637
638 ubifs_prepare_node(c, buf, len, 1);
639 err = ubi_leb_write(c->ubi, lnum, buf, offs, buf_len, dtype);
640 if (err) {
641 ubifs_err("cannot write %d bytes to LEB %d:%d, error %d",
642 buf_len, lnum, offs, err);
643 dbg_dump_node(c, buf);
644 dbg_dump_stack();
645 }
646
647 return err;
648}
649
650/**
651 * ubifs_read_node_wbuf - read node from the media or write-buffer.
652 * @wbuf: wbuf to check for un-written data
653 * @buf: buffer to read to
654 * @type: node type
655 * @len: node length
656 * @lnum: logical eraseblock number
657 * @offs: offset within the logical eraseblock
658 *
659 * This function reads a node of known type and length, checks it and stores
660 * in @buf. If the node partially or fully sits in the write-buffer, this
661 * function takes data from the buffer, otherwise it reads the flash media.
662 * Returns zero in case of success, %-EUCLEAN if CRC mismatched and a negative
663 * error code in case of failure.
664 */
665int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
666 int lnum, int offs)
667{
668 const struct ubifs_info *c = wbuf->c;
669 int err, rlen, overlap;
670 struct ubifs_ch *ch = buf;
671
672 dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len);
673 ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
674 ubifs_assert(!(offs & 7) && offs < c->leb_size);
675 ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT);
676
677 spin_lock(&wbuf->lock);
678 overlap = (lnum == wbuf->lnum && offs + len > wbuf->offs);
679 if (!overlap) {
680 /* We may safely unlock the write-buffer and read the data */
681 spin_unlock(&wbuf->lock);
682 return ubifs_read_node(c, buf, type, len, lnum, offs);
683 }
684
685 /* Don't read under wbuf */
686 rlen = wbuf->offs - offs;
687 if (rlen < 0)
688 rlen = 0;
689
690 /* Copy the rest from the write-buffer */
691 memcpy(buf + rlen, wbuf->buf + offs + rlen - wbuf->offs, len - rlen);
692 spin_unlock(&wbuf->lock);
693
694 if (rlen > 0) {
695 /* Read everything that goes before write-buffer */
696 err = ubi_read(c->ubi, lnum, buf, offs, rlen);
697 if (err && err != -EBADMSG) {
698 ubifs_err("failed to read node %d from LEB %d:%d, "
699 "error %d", type, lnum, offs, err);
700 dbg_dump_stack();
701 return err;
702 }
703 }
704
705 if (type != ch->node_type) {
706 ubifs_err("bad node type (%d but expected %d)",
707 ch->node_type, type);
708 goto out;
709 }
710
711 err = ubifs_check_node(c, buf, lnum, offs, 0);
712 if (err) {
713 ubifs_err("expected node type %d", type);
714 return err;
715 }
716
717 rlen = le32_to_cpu(ch->len);
718 if (rlen != len) {
719 ubifs_err("bad node length %d, expected %d", rlen, len);
720 goto out;
721 }
722
723 return 0;
724
725out:
726 ubifs_err("bad node at LEB %d:%d", lnum, offs);
727 dbg_dump_node(c, buf);
728 dbg_dump_stack();
729 return -EINVAL;
730}
731
732/**
733 * ubifs_read_node - read node.
734 * @c: UBIFS file-system description object
735 * @buf: buffer to read to
736 * @type: node type
737 * @len: node length (not aligned)
738 * @lnum: logical eraseblock number
739 * @offs: offset within the logical eraseblock
740 *
741 * This function reads a node of known type and and length, checks it and
742 * stores in @buf. Returns zero in case of success, %-EUCLEAN if CRC mismatched
743 * and a negative error code in case of failure.
744 */
745int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
746 int lnum, int offs)
747{
748 int err, l;
749 struct ubifs_ch *ch = buf;
750
751 dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len);
752 ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
753 ubifs_assert(len >= UBIFS_CH_SZ && offs + len <= c->leb_size);
754 ubifs_assert(!(offs & 7) && offs < c->leb_size);
755 ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT);
756
757 err = ubi_read(c->ubi, lnum, buf, offs, len);
758 if (err && err != -EBADMSG) {
759 ubifs_err("cannot read node %d from LEB %d:%d, error %d",
760 type, lnum, offs, err);
761 return err;
762 }
763
764 if (type != ch->node_type) {
765 ubifs_err("bad node type (%d but expected %d)",
766 ch->node_type, type);
767 goto out;
768 }
769
770 err = ubifs_check_node(c, buf, lnum, offs, 0);
771 if (err) {
772 ubifs_err("expected node type %d", type);
773 return err;
774 }
775
776 l = le32_to_cpu(ch->len);
777 if (l != len) {
778 ubifs_err("bad node length %d, expected %d", l, len);
779 goto out;
780 }
781
782 return 0;
783
784out:
785 ubifs_err("bad node at LEB %d:%d", lnum, offs);
786 dbg_dump_node(c, buf);
787 dbg_dump_stack();
788 return -EINVAL;
789}
790
791/**
792 * ubifs_wbuf_init - initialize write-buffer.
793 * @c: UBIFS file-system description object
794 * @wbuf: write-buffer to initialize
795 *
796 * This function initializes write buffer. Returns zero in case of success
797 * %-ENOMEM in case of failure.
798 */
799int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
800{
801 size_t size;
802
803 wbuf->buf = kmalloc(c->min_io_size, GFP_KERNEL);
804 if (!wbuf->buf)
805 return -ENOMEM;
806
807 size = (c->min_io_size / UBIFS_CH_SZ + 1) * sizeof(ino_t);
808 wbuf->inodes = kmalloc(size, GFP_KERNEL);
809 if (!wbuf->inodes) {
810 kfree(wbuf->buf);
811 wbuf->buf = NULL;
812 return -ENOMEM;
813 }
814
815 wbuf->used = 0;
816 wbuf->lnum = wbuf->offs = -1;
817 wbuf->avail = c->min_io_size;
818 wbuf->dtype = UBI_UNKNOWN;
819 wbuf->sync_callback = NULL;
820 mutex_init(&wbuf->io_mutex);
821 spin_lock_init(&wbuf->lock);
822
823 wbuf->c = c;
824 init_timer(&wbuf->timer);
825 wbuf->timer.function = wbuf_timer_callback_nolock;
826 wbuf->timer.data = (unsigned long)wbuf;
827 wbuf->timeout = DEFAULT_WBUF_TIMEOUT;
828 wbuf->next_ino = 0;
829
830 return 0;
831}
832
833/**
834 * ubifs_wbuf_add_ino_nolock - add an inode number into the wbuf inode array.
835 * @wbuf: the write-buffer whereto add
836 * @inum: the inode number
837 *
838 * This function adds an inode number to the inode array of the write-buffer.
839 */
840void ubifs_wbuf_add_ino_nolock(struct ubifs_wbuf *wbuf, ino_t inum)
841{
842 if (!wbuf->buf)
843 /* NOR flash or something similar */
844 return;
845
846 spin_lock(&wbuf->lock);
847 if (wbuf->used)
848 wbuf->inodes[wbuf->next_ino++] = inum;
849 spin_unlock(&wbuf->lock);
850}
851
852/**
853 * wbuf_has_ino - returns if the wbuf contains data from the inode.
854 * @wbuf: the write-buffer
855 * @inum: the inode number
856 *
857 * This function returns with %1 if the write-buffer contains some data from the
858 * given inode otherwise it returns with %0.
859 */
860static int wbuf_has_ino(struct ubifs_wbuf *wbuf, ino_t inum)
861{
862 int i, ret = 0;
863
864 spin_lock(&wbuf->lock);
865 for (i = 0; i < wbuf->next_ino; i++)
866 if (inum == wbuf->inodes[i]) {
867 ret = 1;
868 break;
869 }
870 spin_unlock(&wbuf->lock);
871
872 return ret;
873}
874
875/**
876 * ubifs_sync_wbufs_by_inode - synchronize write-buffers for an inode.
877 * @c: UBIFS file-system description object
878 * @inode: inode to synchronize
879 *
880 * This function synchronizes write-buffers which contain nodes belonging to
881 * @inode. Returns zero in case of success and a negative error code in case of
882 * failure.
883 */
884int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode)
885{
886 int i, err = 0;
887
888 for (i = 0; i < c->jhead_cnt; i++) {
889 struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf;
890
891 if (i == GCHD)
892 /*
893 * GC head is special, do not look at it. Even if the
894 * head contains something related to this inode, it is
895 * a _copy_ of corresponding on-flash node which sits
896 * somewhere else.
897 */
898 continue;
899
900 if (!wbuf_has_ino(wbuf, inode->i_ino))
901 continue;
902
903 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
904 if (wbuf_has_ino(wbuf, inode->i_ino))
905 err = ubifs_wbuf_sync_nolock(wbuf);
906 mutex_unlock(&wbuf->io_mutex);
907
908 if (err) {
909 ubifs_ro_mode(c, err);
910 return err;
911 }
912 }
913 return 0;
914}
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
new file mode 100644
index 000000000000..5e82cffe9695
--- /dev/null
+++ b/fs/ubifs/ioctl.c
@@ -0,0 +1,204 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 * Copyright (C) 2006, 2007 University of Szeged, Hungary
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License version 2 as published by
9 * the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc., 51
18 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Authors: Zoltan Sogor
21 * Artem Bityutskiy (Битюцкий Артём)
22 * Adrian Hunter
23 */
24
25/* This file implements EXT2-compatible extended attribute ioctl() calls */
26
27#include <linux/compat.h>
28#include <linux/smp_lock.h>
29#include <linux/mount.h>
30#include "ubifs.h"
31
32/**
33 * ubifs_set_inode_flags - set VFS inode flags.
34 * @inode: VFS inode to set flags for
35 *
36 * This function propagates flags from UBIFS inode object to VFS inode object.
37 */
38void ubifs_set_inode_flags(struct inode *inode)
39{
40 unsigned int flags = ubifs_inode(inode)->flags;
41
42 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_DIRSYNC);
43 if (flags & UBIFS_SYNC_FL)
44 inode->i_flags |= S_SYNC;
45 if (flags & UBIFS_APPEND_FL)
46 inode->i_flags |= S_APPEND;
47 if (flags & UBIFS_IMMUTABLE_FL)
48 inode->i_flags |= S_IMMUTABLE;
49 if (flags & UBIFS_DIRSYNC_FL)
50 inode->i_flags |= S_DIRSYNC;
51}
52
53/*
54 * ioctl2ubifs - convert ioctl inode flags to UBIFS inode flags.
55 * @ioctl_flags: flags to convert
56 *
57 * This function convert ioctl flags (@FS_COMPR_FL, etc) to UBIFS inode flags
58 * (@UBIFS_COMPR_FL, etc).
59 */
60static int ioctl2ubifs(int ioctl_flags)
61{
62 int ubifs_flags = 0;
63
64 if (ioctl_flags & FS_COMPR_FL)
65 ubifs_flags |= UBIFS_COMPR_FL;
66 if (ioctl_flags & FS_SYNC_FL)
67 ubifs_flags |= UBIFS_SYNC_FL;
68 if (ioctl_flags & FS_APPEND_FL)
69 ubifs_flags |= UBIFS_APPEND_FL;
70 if (ioctl_flags & FS_IMMUTABLE_FL)
71 ubifs_flags |= UBIFS_IMMUTABLE_FL;
72 if (ioctl_flags & FS_DIRSYNC_FL)
73 ubifs_flags |= UBIFS_DIRSYNC_FL;
74
75 return ubifs_flags;
76}
77
78/*
79 * ubifs2ioctl - convert UBIFS inode flags to ioctl inode flags.
80 * @ubifs_flags: flags to convert
81 *
82 * This function convert UBIFS (@UBIFS_COMPR_FL, etc) to ioctl flags
83 * (@FS_COMPR_FL, etc).
84 */
85static int ubifs2ioctl(int ubifs_flags)
86{
87 int ioctl_flags = 0;
88
89 if (ubifs_flags & UBIFS_COMPR_FL)
90 ioctl_flags |= FS_COMPR_FL;
91 if (ubifs_flags & UBIFS_SYNC_FL)
92 ioctl_flags |= FS_SYNC_FL;
93 if (ubifs_flags & UBIFS_APPEND_FL)
94 ioctl_flags |= FS_APPEND_FL;
95 if (ubifs_flags & UBIFS_IMMUTABLE_FL)
96 ioctl_flags |= FS_IMMUTABLE_FL;
97 if (ubifs_flags & UBIFS_DIRSYNC_FL)
98 ioctl_flags |= FS_DIRSYNC_FL;
99
100 return ioctl_flags;
101}
102
103static int setflags(struct inode *inode, int flags)
104{
105 int oldflags, err, release;
106 struct ubifs_inode *ui = ubifs_inode(inode);
107 struct ubifs_info *c = inode->i_sb->s_fs_info;
108 struct ubifs_budget_req req = { .dirtied_ino = 1,
109 .dirtied_ino_d = ui->data_len };
110
111 err = ubifs_budget_space(c, &req);
112 if (err)
113 return err;
114
115 /*
116 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
117 * the relevant capability.
118 */
119 mutex_lock(&ui->ui_mutex);
120 oldflags = ubifs2ioctl(ui->flags);
121 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
122 if (!capable(CAP_LINUX_IMMUTABLE)) {
123 err = -EPERM;
124 goto out_unlock;
125 }
126 }
127
128 ui->flags = ioctl2ubifs(flags);
129 ubifs_set_inode_flags(inode);
130 inode->i_ctime = ubifs_current_time(inode);
131 release = ui->dirty;
132 mark_inode_dirty_sync(inode);
133 mutex_unlock(&ui->ui_mutex);
134
135 if (release)
136 ubifs_release_budget(c, &req);
137 if (IS_SYNC(inode))
138 err = write_inode_now(inode, 1);
139 return err;
140
141out_unlock:
142 ubifs_err("can't modify inode %lu attributes", inode->i_ino);
143 mutex_unlock(&ui->ui_mutex);
144 ubifs_release_budget(c, &req);
145 return err;
146}
147
148long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
149{
150 int flags, err;
151 struct inode *inode = file->f_path.dentry->d_inode;
152
153 switch (cmd) {
154 case FS_IOC_GETFLAGS:
155 flags = ubifs2ioctl(ubifs_inode(inode)->flags);
156
157 return put_user(flags, (int __user *) arg);
158
159 case FS_IOC_SETFLAGS: {
160 if (IS_RDONLY(inode))
161 return -EROFS;
162
163 if (!is_owner_or_cap(inode))
164 return -EACCES;
165
166 if (get_user(flags, (int __user *) arg))
167 return -EFAULT;
168
169 if (!S_ISDIR(inode->i_mode))
170 flags &= ~FS_DIRSYNC_FL;
171
172 /*
173 * Make sure the file-system is read-write and make sure it
174 * will not become read-only while we are changing the flags.
175 */
176 err = mnt_want_write(file->f_path.mnt);
177 if (err)
178 return err;
179 err = setflags(inode, flags);
180 mnt_drop_write(file->f_path.mnt);
181 return err;
182 }
183
184 default:
185 return -ENOTTY;
186 }
187}
188
#ifdef CONFIG_COMPAT
/**
 * ubifs_compat_ioctl - UBIFS compat ioctl handler.
 * @file: file the ioctl was issued on
 * @cmd: compat ioctl command
 * @arg: compat ioctl argument
 *
 * This function translates the compat flavors of the get/set flags commands
 * to the native ones and passes them to 'ubifs_ioctl()'. Returns
 * %-ENOIOCTLCMD for any other command.
 */
long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	if (cmd == FS_IOC32_GETFLAGS)
		cmd = FS_IOC_GETFLAGS;
	else if (cmd == FS_IOC32_SETFLAGS)
		cmd = FS_IOC_SETFLAGS;
	else
		return -ENOIOCTLCMD;

	return ubifs_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
}
#endif
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
new file mode 100644
index 000000000000..283155abe5f5
--- /dev/null
+++ b/fs/ubifs/journal.c
@@ -0,0 +1,1387 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/*
24 * This file implements UBIFS journal.
25 *
26 * The journal consists of 2 parts - the log and bud LEBs. The log has fixed
27 * length and position, while a bud logical eraseblock is any LEB in the main
28 * area. Buds contain file system data - data nodes, inode nodes, etc. The log
29 * contains only references to buds and some other stuff like commit
30 * start node. The idea is that when we commit the journal, we do
31 * not copy the data, the buds just become indexed. Since after the commit the
32 * nodes in bud eraseblocks become leaf nodes of the file system index tree, we
33 * use the term "bud". The analogy is obvious: bud eraseblocks contain nodes
34 * which will become leaves in the future.
35 *
36 * The journal is multi-headed because we want to write data to the journal as
37 * optimally as possible. It is nice to have nodes belonging to the same inode
38 * in one LEB, so we may write data owned by different inodes to different
39 * journal heads, although at present only one data head is used.
40 *
41 * For recovery reasons, the base head contains all inode nodes, all directory
42 * entry nodes and all truncate nodes. This means that the other heads contain
43 * only data nodes.
44 *
45 * Bud LEBs may be half-indexed. For example, if the bud was not full at the
46 * time of commit, the bud is retained to continue to be used in the journal,
47 * even though the "front" of the LEB is now indexed. In that case, the log
48 * reference contains the offset where the bud starts for the purposes of the
49 * journal.
50 *
51 * The journal size has to be limited, because the larger is the journal, the
52 * longer it takes to mount UBIFS (scanning the journal) and the more memory it
53 * takes (indexing in the TNC).
54 *
55 * All the journal write operations like 'ubifs_jnl_update()' here, which write
56 * multiple UBIFS nodes to the journal at one go, are atomic with respect to
57 * unclean reboots. Should the unclean reboot happen, the recovery code drops
58 * all the nodes.
59 */
60
61#include "ubifs.h"
62
63/**
64 * zero_ino_node_unused - zero out unused fields of an on-flash inode node.
65 * @ino: the inode to zero out
66 */
67static inline void zero_ino_node_unused(struct ubifs_ino_node *ino)
68{
69 memset(ino->padding1, 0, 4);
70 memset(ino->padding2, 0, 26);
71}
72
73/**
74 * zero_dent_node_unused - zero out unused fields of an on-flash directory
75 * entry node.
76 * @dent: the directory entry to zero out
77 */
78static inline void zero_dent_node_unused(struct ubifs_dent_node *dent)
79{
80 dent->padding1 = 0;
81 memset(dent->padding2, 0, 4);
82}
83
84/**
85 * zero_data_node_unused - zero out unused fields of an on-flash data node.
86 * @data: the data node to zero out
87 */
88static inline void zero_data_node_unused(struct ubifs_data_node *data)
89{
90 memset(data->padding, 0, 2);
91}
92
93/**
94 * zero_trun_node_unused - zero out unused fields of an on-flash truncation
95 * node.
96 * @trun: the truncation node to zero out
97 */
98static inline void zero_trun_node_unused(struct ubifs_trun_node *trun)
99{
100 memset(trun->padding, 0, 12);
101}
102
103/**
104 * reserve_space - reserve space in the journal.
105 * @c: UBIFS file-system description object
106 * @jhead: journal head number
107 * @len: node length
108 *
109 * This function reserves space in journal head @head. If the reservation
110 * succeeded, the journal head stays locked and later has to be unlocked using
111 * 'release_head()'. 'write_node()' and 'write_head()' functions also unlock
112 * it. Returns zero in case of success, %-EAGAIN if commit has to be done, and
113 * other negative error codes in case of other failures.
114 */
115static int reserve_space(struct ubifs_info *c, int jhead, int len)
116{
117 int err = 0, err1, retries = 0, avail, lnum, offs, free, squeeze;
118 struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
119
120 /*
121 * Typically, the base head has smaller nodes written to it, so it is
122 * better to try to allocate space at the ends of eraseblocks. This is
123 * what the squeeze parameter does.
124 */
125 squeeze = (jhead == BASEHD);
126again:
127 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
128
129 if (c->ro_media) {
130 err = -EROFS;
131 goto out_unlock;
132 }
133
134 avail = c->leb_size - wbuf->offs - wbuf->used;
135 if (wbuf->lnum != -1 && avail >= len)
136 return 0;
137
138 /*
139 * Write buffer wasn't seek'ed or there is no enough space - look for an
140 * LEB with some empty space.
141 */
142 lnum = ubifs_find_free_space(c, len, &free, squeeze);
143 if (lnum >= 0) {
144 /* Found an LEB, add it to the journal head */
145 offs = c->leb_size - free;
146 err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
147 if (err)
148 goto out_return;
149 /* A new bud was successfully allocated and added to the log */
150 goto out;
151 }
152
153 err = lnum;
154 if (err != -ENOSPC)
155 goto out_unlock;
156
157 /*
158 * No free space, we have to run garbage collector to make
159 * some. But the write-buffer mutex has to be unlocked because
160 * GC also takes it.
161 */
162 dbg_jnl("no free space jhead %d, run GC", jhead);
163 mutex_unlock(&wbuf->io_mutex);
164
165 lnum = ubifs_garbage_collect(c, 0);
166 if (lnum < 0) {
167 err = lnum;
168 if (err != -ENOSPC)
169 return err;
170
171 /*
172 * GC could not make a free LEB. But someone else may
173 * have allocated new bud for this journal head,
174 * because we dropped @wbuf->io_mutex, so try once
175 * again.
176 */
177 dbg_jnl("GC couldn't make a free LEB for jhead %d", jhead);
178 if (retries++ < 2) {
179 dbg_jnl("retry (%d)", retries);
180 goto again;
181 }
182
183 dbg_jnl("return -ENOSPC");
184 return err;
185 }
186
187 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
188 dbg_jnl("got LEB %d for jhead %d", lnum, jhead);
189 avail = c->leb_size - wbuf->offs - wbuf->used;
190
191 if (wbuf->lnum != -1 && avail >= len) {
192 /*
193 * Someone else has switched the journal head and we have
194 * enough space now. This happens when more then one process is
195 * trying to write to the same journal head at the same time.
196 */
197 dbg_jnl("return LEB %d back, already have LEB %d:%d",
198 lnum, wbuf->lnum, wbuf->offs + wbuf->used);
199 err = ubifs_return_leb(c, lnum);
200 if (err)
201 goto out_unlock;
202 return 0;
203 }
204
205 err = ubifs_add_bud_to_log(c, jhead, lnum, 0);
206 if (err)
207 goto out_return;
208 offs = 0;
209
210out:
211 err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, UBI_SHORTTERM);
212 if (err)
213 goto out_unlock;
214
215 return 0;
216
217out_unlock:
218 mutex_unlock(&wbuf->io_mutex);
219 return err;
220
221out_return:
222 /* An error occurred and the LEB has to be returned to lprops */
223 ubifs_assert(err < 0);
224 err1 = ubifs_return_leb(c, lnum);
225 if (err1 && err == -EAGAIN)
226 /*
227 * Return original error code only if it is not %-EAGAIN,
228 * which is not really an error. Otherwise, return the error
229 * code of 'ubifs_return_leb()'.
230 */
231 err = err1;
232 mutex_unlock(&wbuf->io_mutex);
233 return err;
234}
235
236/**
237 * write_node - write node to a journal head.
238 * @c: UBIFS file-system description object
239 * @jhead: journal head
240 * @node: node to write
241 * @len: node length
242 * @lnum: LEB number written is returned here
243 * @offs: offset written is returned here
244 *
245 * This function writes a node to reserved space of journal head @jhead.
246 * Returns zero in case of success and a negative error code in case of
247 * failure.
248 */
249static int write_node(struct ubifs_info *c, int jhead, void *node, int len,
250 int *lnum, int *offs)
251{
252 struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
253
254 ubifs_assert(jhead != GCHD);
255
256 *lnum = c->jheads[jhead].wbuf.lnum;
257 *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used;
258
259 dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len);
260 ubifs_prepare_node(c, node, len, 0);
261
262 return ubifs_wbuf_write_nolock(wbuf, node, len);
263}
264
265/**
266 * write_head - write data to a journal head.
267 * @c: UBIFS file-system description object
268 * @jhead: journal head
269 * @buf: buffer to write
270 * @len: length to write
271 * @lnum: LEB number written is returned here
272 * @offs: offset written is returned here
273 * @sync: non-zero if the write-buffer has to by synchronized
274 *
275 * This function is the same as 'write_node()' but it does not assume the
276 * buffer it is writing is a node, so it does not prepare it (which means
277 * initializing common header and calculating CRC).
278 */
279static int write_head(struct ubifs_info *c, int jhead, void *buf, int len,
280 int *lnum, int *offs, int sync)
281{
282 int err;
283 struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
284
285 ubifs_assert(jhead != GCHD);
286
287 *lnum = c->jheads[jhead].wbuf.lnum;
288 *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used;
289 dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len);
290
291 err = ubifs_wbuf_write_nolock(wbuf, buf, len);
292 if (err)
293 return err;
294 if (sync)
295 err = ubifs_wbuf_sync_nolock(wbuf);
296 return err;
297}
298
299/**
300 * make_reservation - reserve journal space.
301 * @c: UBIFS file-system description object
302 * @jhead: journal head
303 * @len: how many bytes to reserve
304 *
305 * This function makes space reservation in journal head @jhead. The function
306 * takes the commit lock and locks the journal head, and the caller has to
307 * unlock the head and finish the reservation with 'finish_reservation()'.
308 * Returns zero in case of success and a negative error code in case of
309 * failure.
310 *
311 * Note, the journal head may be unlocked as soon as the data is written, while
312 * the commit lock has to be released after the data has been added to the
313 * TNC.
314 */
315static int make_reservation(struct ubifs_info *c, int jhead, int len)
316{
317 int err, cmt_retries = 0, nospc_retries = 0;
318
319again:
320 down_read(&c->commit_sem);
321 err = reserve_space(c, jhead, len);
322 if (!err)
323 return 0;
324 up_read(&c->commit_sem);
325
326 if (err == -ENOSPC) {
327 /*
328 * GC could not make any progress. We should try to commit
329 * once because it could make some dirty space and GC would
330 * make progress, so make the error -EAGAIN so that the below
331 * will commit and re-try.
332 */
333 if (nospc_retries++ < 2) {
334 dbg_jnl("no space, retry");
335 err = -EAGAIN;
336 }
337
338 /*
339 * This means that the budgeting is incorrect. We always have
340 * to be able to write to the media, because all operations are
341 * budgeted. Deletions are not budgeted, though, but we reserve
342 * an extra LEB for them.
343 */
344 }
345
346 if (err != -EAGAIN)
347 goto out;
348
349 /*
350 * -EAGAIN means that the journal is full or too large, or the above
351 * code wants to do one commit. Do this and re-try.
352 */
353 if (cmt_retries > 128) {
354 /*
355 * This should not happen unless the journal size limitations
356 * are too tough.
357 */
358 ubifs_err("stuck in space allocation");
359 err = -ENOSPC;
360 goto out;
361 } else if (cmt_retries > 32)
362 ubifs_warn("too many space allocation re-tries (%d)",
363 cmt_retries);
364
365 dbg_jnl("-EAGAIN, commit and retry (retried %d times)",
366 cmt_retries);
367 cmt_retries += 1;
368
369 err = ubifs_run_commit(c);
370 if (err)
371 return err;
372 goto again;
373
374out:
375 ubifs_err("cannot reserve %d bytes in jhead %d, error %d",
376 len, jhead, err);
377 if (err == -ENOSPC) {
378 /* This are some budgeting problems, print useful information */
379 down_write(&c->commit_sem);
380 spin_lock(&c->space_lock);
381 dbg_dump_stack();
382 dbg_dump_budg(c);
383 spin_unlock(&c->space_lock);
384 dbg_dump_lprops(c);
385 cmt_retries = dbg_check_lprops(c);
386 up_write(&c->commit_sem);
387 }
388 return err;
389}
390
391/**
392 * release_head - release a journal head.
393 * @c: UBIFS file-system description object
394 * @jhead: journal head
395 *
396 * This function releases journal head @jhead which was locked by
397 * the 'make_reservation()' function. It has to be called after each successful
398 * 'make_reservation()' invocation.
399 */
400static inline void release_head(struct ubifs_info *c, int jhead)
401{
402 mutex_unlock(&c->jheads[jhead].wbuf.io_mutex);
403}
404
405/**
406 * finish_reservation - finish a reservation.
407 * @c: UBIFS file-system description object
408 *
409 * This function finishes journal space reservation. It must be called after
410 * 'make_reservation()'.
411 */
412static void finish_reservation(struct ubifs_info *c)
413{
414 up_read(&c->commit_sem);
415}
416
417/**
418 * get_dent_type - translate VFS inode mode to UBIFS directory entry type.
419 * @mode: inode mode
420 */
421static int get_dent_type(int mode)
422{
423 switch (mode & S_IFMT) {
424 case S_IFREG:
425 return UBIFS_ITYPE_REG;
426 case S_IFDIR:
427 return UBIFS_ITYPE_DIR;
428 case S_IFLNK:
429 return UBIFS_ITYPE_LNK;
430 case S_IFBLK:
431 return UBIFS_ITYPE_BLK;
432 case S_IFCHR:
433 return UBIFS_ITYPE_CHR;
434 case S_IFIFO:
435 return UBIFS_ITYPE_FIFO;
436 case S_IFSOCK:
437 return UBIFS_ITYPE_SOCK;
438 default:
439 BUG();
440 }
441 return 0;
442}
443
444/**
445 * pack_inode - pack an inode node.
446 * @c: UBIFS file-system description object
447 * @ino: buffer in which to pack inode node
448 * @inode: inode to pack
449 * @last: indicates the last node of the group
450 * @last_reference: non-zero if this is a deletion inode
451 */
452static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino,
453 const struct inode *inode, int last,
454 int last_reference)
455{
456 int data_len = 0;
457 struct ubifs_inode *ui = ubifs_inode(inode);
458
459 ino->ch.node_type = UBIFS_INO_NODE;
460 ino_key_init_flash(c, &ino->key, inode->i_ino);
461 ino->creat_sqnum = cpu_to_le64(ui->creat_sqnum);
462 ino->atime_sec = cpu_to_le64(inode->i_atime.tv_sec);
463 ino->atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
464 ino->ctime_sec = cpu_to_le64(inode->i_ctime.tv_sec);
465 ino->ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
466 ino->mtime_sec = cpu_to_le64(inode->i_mtime.tv_sec);
467 ino->mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
468 ino->uid = cpu_to_le32(inode->i_uid);
469 ino->gid = cpu_to_le32(inode->i_gid);
470 ino->mode = cpu_to_le32(inode->i_mode);
471 ino->flags = cpu_to_le32(ui->flags);
472 ino->size = cpu_to_le64(ui->ui_size);
473 ino->nlink = cpu_to_le32(inode->i_nlink);
474 ino->compr_type = cpu_to_le16(ui->compr_type);
475 ino->data_len = cpu_to_le32(ui->data_len);
476 ino->xattr_cnt = cpu_to_le32(ui->xattr_cnt);
477 ino->xattr_size = cpu_to_le32(ui->xattr_size);
478 ino->xattr_names = cpu_to_le32(ui->xattr_names);
479 zero_ino_node_unused(ino);
480
481 /*
482 * Drop the attached data if this is a deletion inode, the data is not
483 * needed anymore.
484 */
485 if (!last_reference) {
486 memcpy(ino->data, ui->data, ui->data_len);
487 data_len = ui->data_len;
488 }
489
490 ubifs_prep_grp_node(c, ino, UBIFS_INO_NODE_SZ + data_len, last);
491}
492
493/**
494 * mark_inode_clean - mark UBIFS inode as clean.
495 * @c: UBIFS file-system description object
496 * @ui: UBIFS inode to mark as clean
497 *
498 * This helper function marks UBIFS inode @ui as clean by cleaning the
499 * @ui->dirty flag and releasing its budget. Note, VFS may still treat the
500 * inode as dirty and try to write it back, but 'ubifs_write_inode()' would
501 * just do nothing.
502 */
503static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui)
504{
505 if (ui->dirty)
506 ubifs_release_dirty_inode_budget(c, ui);
507 ui->dirty = 0;
508}
509
510/**
511 * ubifs_jnl_update - update inode.
512 * @c: UBIFS file-system description object
513 * @dir: parent inode or host inode in case of extended attributes
514 * @nm: directory entry name
515 * @inode: inode to update
516 * @deletion: indicates a directory entry deletion i.e unlink or rmdir
517 * @xent: non-zero if the directory entry is an extended attribute entry
518 *
519 * This function updates an inode by writing a directory entry (or extended
520 * attribute entry), the inode itself, and the parent directory inode (or the
521 * host inode) to the journal.
522 *
523 * The function writes the host inode @dir last, which is important in case of
524 * extended attributes. Indeed, then we guarantee that if the host inode gets
525 * synchronized (with 'fsync()'), and the write-buffer it sits in gets flushed,
526 * the extended attribute inode gets flushed too. And this is exactly what the
527 * user expects - synchronizing the host inode synchronizes its extended
528 * attributes. Similarly, this guarantees that if @dir is synchronized, its
529 * directory entry corresponding to @nm gets synchronized too.
530 *
531 * If the inode (@inode) or the parent directory (@dir) are synchronous, this
532 * function synchronizes the write-buffer.
533 *
534 * This function marks the @dir and @inode inodes as clean and returns zero on
535 * success. In case of failure, a negative error code is returned.
536 */
537int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
538 const struct qstr *nm, const struct inode *inode,
539 int deletion, int xent)
540{
541 int err, dlen, ilen, len, lnum, ino_offs, dent_offs;
542 int aligned_dlen, aligned_ilen, sync = IS_DIRSYNC(dir);
543 int last_reference = !!(deletion && inode->i_nlink == 0);
544 struct ubifs_inode *ui = ubifs_inode(inode);
545 struct ubifs_inode *dir_ui = ubifs_inode(dir);
546 struct ubifs_dent_node *dent;
547 struct ubifs_ino_node *ino;
548 union ubifs_key dent_key, ino_key;
549
550 dbg_jnl("ino %lu, dent '%.*s', data len %d in dir ino %lu",
551 inode->i_ino, nm->len, nm->name, ui->data_len, dir->i_ino);
552 ubifs_assert(dir_ui->data_len == 0);
553 ubifs_assert(mutex_is_locked(&dir_ui->ui_mutex));
554
555 dlen = UBIFS_DENT_NODE_SZ + nm->len + 1;
556 ilen = UBIFS_INO_NODE_SZ;
557
558 /*
559 * If the last reference to the inode is being deleted, then there is
560 * no need to attach and write inode data, it is being deleted anyway.
561 * And if the inode is being deleted, no need to synchronize
562 * write-buffer even if the inode is synchronous.
563 */
564 if (!last_reference) {
565 ilen += ui->data_len;
566 sync |= IS_SYNC(inode);
567 }
568
569 aligned_dlen = ALIGN(dlen, 8);
570 aligned_ilen = ALIGN(ilen, 8);
571 len = aligned_dlen + aligned_ilen + UBIFS_INO_NODE_SZ;
572 dent = kmalloc(len, GFP_NOFS);
573 if (!dent)
574 return -ENOMEM;
575
576 /* Make reservation before allocating sequence numbers */
577 err = make_reservation(c, BASEHD, len);
578 if (err)
579 goto out_free;
580
581 if (!xent) {
582 dent->ch.node_type = UBIFS_DENT_NODE;
583 dent_key_init(c, &dent_key, dir->i_ino, nm);
584 } else {
585 dent->ch.node_type = UBIFS_XENT_NODE;
586 xent_key_init(c, &dent_key, dir->i_ino, nm);
587 }
588
589 key_write(c, &dent_key, dent->key);
590 dent->inum = deletion ? 0 : cpu_to_le64(inode->i_ino);
591 dent->type = get_dent_type(inode->i_mode);
592 dent->nlen = cpu_to_le16(nm->len);
593 memcpy(dent->name, nm->name, nm->len);
594 dent->name[nm->len] = '\0';
595 zero_dent_node_unused(dent);
596 ubifs_prep_grp_node(c, dent, dlen, 0);
597
598 ino = (void *)dent + aligned_dlen;
599 pack_inode(c, ino, inode, 0, last_reference);
600 ino = (void *)ino + aligned_ilen;
601 pack_inode(c, ino, dir, 1, 0);
602
603 if (last_reference) {
604 err = ubifs_add_orphan(c, inode->i_ino);
605 if (err) {
606 release_head(c, BASEHD);
607 goto out_finish;
608 }
609 }
610
611 err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync);
612 if (err)
613 goto out_release;
614 if (!sync) {
615 struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf;
616
617 ubifs_wbuf_add_ino_nolock(wbuf, inode->i_ino);
618 ubifs_wbuf_add_ino_nolock(wbuf, dir->i_ino);
619 }
620 release_head(c, BASEHD);
621 kfree(dent);
622
623 if (deletion) {
624 err = ubifs_tnc_remove_nm(c, &dent_key, nm);
625 if (err)
626 goto out_ro;
627 err = ubifs_add_dirt(c, lnum, dlen);
628 } else
629 err = ubifs_tnc_add_nm(c, &dent_key, lnum, dent_offs, dlen, nm);
630 if (err)
631 goto out_ro;
632
633 /*
634 * Note, we do not remove the inode from TNC even if the last reference
635 * to it has just been deleted, because the inode may still be opened.
636 * Instead, the inode has been added to orphan lists and the orphan
637 * subsystem will take further care about it.
638 */
639 ino_key_init(c, &ino_key, inode->i_ino);
640 ino_offs = dent_offs + aligned_dlen;
641 err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, ilen);
642 if (err)
643 goto out_ro;
644
645 ino_key_init(c, &ino_key, dir->i_ino);
646 ino_offs += aligned_ilen;
647 err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, UBIFS_INO_NODE_SZ);
648 if (err)
649 goto out_ro;
650
651 finish_reservation(c);
652 spin_lock(&ui->ui_lock);
653 ui->synced_i_size = ui->ui_size;
654 spin_unlock(&ui->ui_lock);
655 mark_inode_clean(c, ui);
656 mark_inode_clean(c, dir_ui);
657 return 0;
658
659out_finish:
660 finish_reservation(c);
661out_free:
662 kfree(dent);
663 return err;
664
665out_release:
666 release_head(c, BASEHD);
667out_ro:
668 ubifs_ro_mode(c, err);
669 if (last_reference)
670 ubifs_delete_orphan(c, inode->i_ino);
671 finish_reservation(c);
672 return err;
673}
674
675/**
676 * ubifs_jnl_write_data - write a data node to the journal.
677 * @c: UBIFS file-system description object
678 * @inode: inode the data node belongs to
679 * @key: node key
680 * @buf: buffer to write
681 * @len: data length (must not exceed %UBIFS_BLOCK_SIZE)
682 *
683 * This function writes a data node to the journal. Returns %0 if the data node
684 * was successfully written, and a negative error code in case of failure.
685 */
686int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
687 const union ubifs_key *key, const void *buf, int len)
688{
689 struct ubifs_data_node *data;
690 int err, lnum, offs, compr_type, out_len;
691 int dlen = UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR;
692 struct ubifs_inode *ui = ubifs_inode(inode);
693
694 dbg_jnl("ino %lu, blk %u, len %d, key %s", key_inum(c, key),
695 key_block(c, key), len, DBGKEY(key));
696 ubifs_assert(len <= UBIFS_BLOCK_SIZE);
697
698 data = kmalloc(dlen, GFP_NOFS);
699 if (!data)
700 return -ENOMEM;
701
702 data->ch.node_type = UBIFS_DATA_NODE;
703 key_write(c, key, &data->key);
704 data->size = cpu_to_le32(len);
705 zero_data_node_unused(data);
706
707 if (!(ui->flags && UBIFS_COMPR_FL))
708 /* Compression is disabled for this inode */
709 compr_type = UBIFS_COMPR_NONE;
710 else
711 compr_type = ui->compr_type;
712
713 out_len = dlen - UBIFS_DATA_NODE_SZ;
714 ubifs_compress(buf, len, &data->data, &out_len, &compr_type);
715 ubifs_assert(out_len <= UBIFS_BLOCK_SIZE);
716
717 dlen = UBIFS_DATA_NODE_SZ + out_len;
718 data->compr_type = cpu_to_le16(compr_type);
719
720 /* Make reservation before allocating sequence numbers */
721 err = make_reservation(c, DATAHD, dlen);
722 if (err)
723 goto out_free;
724
725 err = write_node(c, DATAHD, data, dlen, &lnum, &offs);
726 if (err)
727 goto out_release;
728 ubifs_wbuf_add_ino_nolock(&c->jheads[DATAHD].wbuf, key_inum(c, key));
729 release_head(c, DATAHD);
730
731 err = ubifs_tnc_add(c, key, lnum, offs, dlen);
732 if (err)
733 goto out_ro;
734
735 finish_reservation(c);
736 kfree(data);
737 return 0;
738
739out_release:
740 release_head(c, DATAHD);
741out_ro:
742 ubifs_ro_mode(c, err);
743 finish_reservation(c);
744out_free:
745 kfree(data);
746 return err;
747}
748
749/**
750 * ubifs_jnl_write_inode - flush inode to the journal.
751 * @c: UBIFS file-system description object
752 * @inode: inode to flush
753 * @deletion: inode has been deleted
754 *
755 * This function writes inode @inode to the journal. If the inode is
756 * synchronous, it also synchronizes the write-buffer. Returns zero in case of
757 * success and a negative error code in case of failure.
758 */
759int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode,
760 int deletion)
761{
762 int err, len, lnum, offs, sync = 0;
763 struct ubifs_ino_node *ino;
764 struct ubifs_inode *ui = ubifs_inode(inode);
765
766 dbg_jnl("ino %lu%s", inode->i_ino,
767 deletion ? " (last reference)" : "");
768 if (deletion)
769 ubifs_assert(inode->i_nlink == 0);
770
771 len = UBIFS_INO_NODE_SZ;
772 /*
773 * If the inode is being deleted, do not write the attached data. No
774 * need to synchronize the write-buffer either.
775 */
776 if (!deletion) {
777 len += ui->data_len;
778 sync = IS_SYNC(inode);
779 }
780 ino = kmalloc(len, GFP_NOFS);
781 if (!ino)
782 return -ENOMEM;
783
784 /* Make reservation before allocating sequence numbers */
785 err = make_reservation(c, BASEHD, len);
786 if (err)
787 goto out_free;
788
789 pack_inode(c, ino, inode, 1, deletion);
790 err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync);
791 if (err)
792 goto out_release;
793 if (!sync)
794 ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf,
795 inode->i_ino);
796 release_head(c, BASEHD);
797
798 if (deletion) {
799 err = ubifs_tnc_remove_ino(c, inode->i_ino);
800 if (err)
801 goto out_ro;
802 ubifs_delete_orphan(c, inode->i_ino);
803 err = ubifs_add_dirt(c, lnum, len);
804 } else {
805 union ubifs_key key;
806
807 ino_key_init(c, &key, inode->i_ino);
808 err = ubifs_tnc_add(c, &key, lnum, offs, len);
809 }
810 if (err)
811 goto out_ro;
812
813 finish_reservation(c);
814 spin_lock(&ui->ui_lock);
815 ui->synced_i_size = ui->ui_size;
816 spin_unlock(&ui->ui_lock);
817 kfree(ino);
818 return 0;
819
820out_release:
821 release_head(c, BASEHD);
822out_ro:
823 ubifs_ro_mode(c, err);
824 finish_reservation(c);
825out_free:
826 kfree(ino);
827 return err;
828}
829
830/**
831 * ubifs_jnl_rename - rename a directory entry.
832 * @c: UBIFS file-system description object
833 * @old_dir: parent inode of directory entry to rename
834 * @old_dentry: directory entry to rename
835 * @new_dir: parent inode of directory entry to rename
836 * @new_dentry: new directory entry (or directory entry to replace)
837 * @sync: non-zero if the write-buffer has to be synchronized
838 *
839 * This function implements the re-name operation which may involve writing up
840 * to 3 inodes and 2 directory entries. It marks the written inodes as clean
841 * and returns zero on success. In case of failure, a negative error code is
842 * returned.
843 */
844int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
845 const struct dentry *old_dentry,
846 const struct inode *new_dir,
847 const struct dentry *new_dentry, int sync)
848{
849 void *p;
850 union ubifs_key key;
851 struct ubifs_dent_node *dent, *dent2;
852 int err, dlen1, dlen2, ilen, lnum, offs, len;
853 const struct inode *old_inode = old_dentry->d_inode;
854 const struct inode *new_inode = new_dentry->d_inode;
855 int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ;
856 int last_reference = !!(new_inode && new_inode->i_nlink == 0);
857 int move = (old_dir != new_dir);
858 struct ubifs_inode *uninitialized_var(new_ui);
859
860 dbg_jnl("dent '%.*s' in dir ino %lu to dent '%.*s' in dir ino %lu",
861 old_dentry->d_name.len, old_dentry->d_name.name,
862 old_dir->i_ino, new_dentry->d_name.len,
863 new_dentry->d_name.name, new_dir->i_ino);
864 ubifs_assert(ubifs_inode(old_dir)->data_len == 0);
865 ubifs_assert(ubifs_inode(new_dir)->data_len == 0);
866 ubifs_assert(mutex_is_locked(&ubifs_inode(old_dir)->ui_mutex));
867 ubifs_assert(mutex_is_locked(&ubifs_inode(new_dir)->ui_mutex));
868
869 dlen1 = UBIFS_DENT_NODE_SZ + new_dentry->d_name.len + 1;
870 dlen2 = UBIFS_DENT_NODE_SZ + old_dentry->d_name.len + 1;
871 if (new_inode) {
872 new_ui = ubifs_inode(new_inode);
873 ubifs_assert(mutex_is_locked(&new_ui->ui_mutex));
874 ilen = UBIFS_INO_NODE_SZ;
875 if (!last_reference)
876 ilen += new_ui->data_len;
877 } else
878 ilen = 0;
879
880 aligned_dlen1 = ALIGN(dlen1, 8);
881 aligned_dlen2 = ALIGN(dlen2, 8);
882 len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8);
883 if (old_dir != new_dir)
884 len += plen;
885 dent = kmalloc(len, GFP_NOFS);
886 if (!dent)
887 return -ENOMEM;
888
889 /* Make reservation before allocating sequence numbers */
890 err = make_reservation(c, BASEHD, len);
891 if (err)
892 goto out_free;
893
894 /* Make new dent */
895 dent->ch.node_type = UBIFS_DENT_NODE;
896 dent_key_init_flash(c, &dent->key, new_dir->i_ino, &new_dentry->d_name);
897 dent->inum = cpu_to_le64(old_inode->i_ino);
898 dent->type = get_dent_type(old_inode->i_mode);
899 dent->nlen = cpu_to_le16(new_dentry->d_name.len);
900 memcpy(dent->name, new_dentry->d_name.name, new_dentry->d_name.len);
901 dent->name[new_dentry->d_name.len] = '\0';
902 zero_dent_node_unused(dent);
903 ubifs_prep_grp_node(c, dent, dlen1, 0);
904
905 /* Make deletion dent */
906 dent2 = (void *)dent + aligned_dlen1;
907 dent2->ch.node_type = UBIFS_DENT_NODE;
908 dent_key_init_flash(c, &dent2->key, old_dir->i_ino,
909 &old_dentry->d_name);
910 dent2->inum = 0;
911 dent2->type = DT_UNKNOWN;
912 dent2->nlen = cpu_to_le16(old_dentry->d_name.len);
913 memcpy(dent2->name, old_dentry->d_name.name, old_dentry->d_name.len);
914 dent2->name[old_dentry->d_name.len] = '\0';
915 zero_dent_node_unused(dent2);
916 ubifs_prep_grp_node(c, dent2, dlen2, 0);
917
918 p = (void *)dent2 + aligned_dlen2;
919 if (new_inode) {
920 pack_inode(c, p, new_inode, 0, last_reference);
921 p += ALIGN(ilen, 8);
922 }
923
924 if (!move)
925 pack_inode(c, p, old_dir, 1, 0);
926 else {
927 pack_inode(c, p, old_dir, 0, 0);
928 p += ALIGN(plen, 8);
929 pack_inode(c, p, new_dir, 1, 0);
930 }
931
932 if (last_reference) {
933 err = ubifs_add_orphan(c, new_inode->i_ino);
934 if (err) {
935 release_head(c, BASEHD);
936 goto out_finish;
937 }
938 }
939
940 err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync);
941 if (err)
942 goto out_release;
943 if (!sync) {
944 struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf;
945
946 ubifs_wbuf_add_ino_nolock(wbuf, new_dir->i_ino);
947 ubifs_wbuf_add_ino_nolock(wbuf, old_dir->i_ino);
948 if (new_inode)
949 ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf,
950 new_inode->i_ino);
951 }
952 release_head(c, BASEHD);
953
954 dent_key_init(c, &key, new_dir->i_ino, &new_dentry->d_name);
955 err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, &new_dentry->d_name);
956 if (err)
957 goto out_ro;
958
959 err = ubifs_add_dirt(c, lnum, dlen2);
960 if (err)
961 goto out_ro;
962
963 dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name);
964 err = ubifs_tnc_remove_nm(c, &key, &old_dentry->d_name);
965 if (err)
966 goto out_ro;
967
968 offs += aligned_dlen1 + aligned_dlen2;
969 if (new_inode) {
970 ino_key_init(c, &key, new_inode->i_ino);
971 err = ubifs_tnc_add(c, &key, lnum, offs, ilen);
972 if (err)
973 goto out_ro;
974 offs += ALIGN(ilen, 8);
975 }
976
977 ino_key_init(c, &key, old_dir->i_ino);
978 err = ubifs_tnc_add(c, &key, lnum, offs, plen);
979 if (err)
980 goto out_ro;
981
982 if (old_dir != new_dir) {
983 offs += ALIGN(plen, 8);
984 ino_key_init(c, &key, new_dir->i_ino);
985 err = ubifs_tnc_add(c, &key, lnum, offs, plen);
986 if (err)
987 goto out_ro;
988 }
989
990 finish_reservation(c);
991 if (new_inode) {
992 mark_inode_clean(c, new_ui);
993 spin_lock(&new_ui->ui_lock);
994 new_ui->synced_i_size = new_ui->ui_size;
995 spin_unlock(&new_ui->ui_lock);
996 }
997 mark_inode_clean(c, ubifs_inode(old_dir));
998 if (move)
999 mark_inode_clean(c, ubifs_inode(new_dir));
1000 kfree(dent);
1001 return 0;
1002
1003out_release:
1004 release_head(c, BASEHD);
1005out_ro:
1006 ubifs_ro_mode(c, err);
1007 if (last_reference)
1008 ubifs_delete_orphan(c, new_inode->i_ino);
1009out_finish:
1010 finish_reservation(c);
1011out_free:
1012 kfree(dent);
1013 return err;
1014}
1015
1016/**
1017 * recomp_data_node - re-compress a truncated data node.
1018 * @dn: data node to re-compress
1019 * @new_len: new length
1020 *
1021 * This function is used when an inode is truncated and the last data node of
1022 * the inode has to be re-compressed and re-written.
1023 */
1024static int recomp_data_node(struct ubifs_data_node *dn, int *new_len)
1025{
1026 void *buf;
1027 int err, len, compr_type, out_len;
1028
1029 out_len = le32_to_cpu(dn->size);
1030 buf = kmalloc(out_len * WORST_COMPR_FACTOR, GFP_NOFS);
1031 if (!buf)
1032 return -ENOMEM;
1033
1034 len = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
1035 compr_type = le16_to_cpu(dn->compr_type);
1036 err = ubifs_decompress(&dn->data, len, buf, &out_len, compr_type);
1037 if (err)
1038 goto out;
1039
1040 ubifs_compress(buf, *new_len, &dn->data, &out_len, &compr_type);
1041 ubifs_assert(out_len <= UBIFS_BLOCK_SIZE);
1042 dn->compr_type = cpu_to_le16(compr_type);
1043 dn->size = cpu_to_le32(*new_len);
1044 *new_len = UBIFS_DATA_NODE_SZ + out_len;
1045out:
1046 kfree(buf);
1047 return err;
1048}
1049
1050/**
1051 * ubifs_jnl_truncate - update the journal for a truncation.
1052 * @c: UBIFS file-system description object
1053 * @inode: inode to truncate
1054 * @old_size: old size
1055 * @new_size: new size
1056 *
1057 * When the size of a file decreases due to truncation, a truncation node is
1058 * written, the journal tree is updated, and the last data block is re-written
1059 * if it has been affected. The inode is also updated in order to synchronize
1060 * the new inode size.
1061 *
1062 * This function marks the inode as clean and returns zero on success. In case
1063 * of failure, a negative error code is returned.
1064 */
1065int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
1066 loff_t old_size, loff_t new_size)
1067{
1068 union ubifs_key key, to_key;
1069 struct ubifs_ino_node *ino;
1070 struct ubifs_trun_node *trun;
1071 struct ubifs_data_node *uninitialized_var(dn);
1072 int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode);
1073 struct ubifs_inode *ui = ubifs_inode(inode);
1074 ino_t inum = inode->i_ino;
1075 unsigned int blk;
1076
1077 dbg_jnl("ino %lu, size %lld -> %lld", inum, old_size, new_size);
1078 ubifs_assert(!ui->data_len);
1079 ubifs_assert(S_ISREG(inode->i_mode));
1080 ubifs_assert(mutex_is_locked(&ui->ui_mutex));
1081
1082 sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ +
1083 UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR;
1084 ino = kmalloc(sz, GFP_NOFS);
1085 if (!ino)
1086 return -ENOMEM;
1087
1088 trun = (void *)ino + UBIFS_INO_NODE_SZ;
1089 trun->ch.node_type = UBIFS_TRUN_NODE;
1090 trun->inum = cpu_to_le32(inum);
1091 trun->old_size = cpu_to_le64(old_size);
1092 trun->new_size = cpu_to_le64(new_size);
1093 zero_trun_node_unused(trun);
1094
1095 dlen = new_size & (UBIFS_BLOCK_SIZE - 1);
1096 if (dlen) {
1097 /* Get last data block so it can be truncated */
1098 dn = (void *)trun + UBIFS_TRUN_NODE_SZ;
1099 blk = new_size >> UBIFS_BLOCK_SHIFT;
1100 data_key_init(c, &key, inum, blk);
1101 dbg_jnl("last block key %s", DBGKEY(&key));
1102 err = ubifs_tnc_lookup(c, &key, dn);
1103 if (err == -ENOENT)
1104 dlen = 0; /* Not found (so it is a hole) */
1105 else if (err)
1106 goto out_free;
1107 else {
1108 if (le32_to_cpu(dn->size) <= dlen)
1109 dlen = 0; /* Nothing to do */
1110 else {
1111 int compr_type = le16_to_cpu(dn->compr_type);
1112
1113 if (compr_type != UBIFS_COMPR_NONE) {
1114 err = recomp_data_node(dn, &dlen);
1115 if (err)
1116 goto out_free;
1117 } else {
1118 dn->size = cpu_to_le32(dlen);
1119 dlen += UBIFS_DATA_NODE_SZ;
1120 }
1121 zero_data_node_unused(dn);
1122 }
1123 }
1124 }
1125
1126 /* Must make reservation before allocating sequence numbers */
1127 len = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ;
1128 if (dlen)
1129 len += dlen;
1130 err = make_reservation(c, BASEHD, len);
1131 if (err)
1132 goto out_free;
1133
1134 pack_inode(c, ino, inode, 0, 0);
1135 ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 0 : 1);
1136 if (dlen)
1137 ubifs_prep_grp_node(c, dn, dlen, 1);
1138
1139 err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync);
1140 if (err)
1141 goto out_release;
1142 if (!sync)
1143 ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, inum);
1144 release_head(c, BASEHD);
1145
1146 if (dlen) {
1147 sz = offs + UBIFS_INO_NODE_SZ + UBIFS_TRUN_NODE_SZ;
1148 err = ubifs_tnc_add(c, &key, lnum, sz, dlen);
1149 if (err)
1150 goto out_ro;
1151 }
1152
1153 ino_key_init(c, &key, inum);
1154 err = ubifs_tnc_add(c, &key, lnum, offs, UBIFS_INO_NODE_SZ);
1155 if (err)
1156 goto out_ro;
1157
1158 err = ubifs_add_dirt(c, lnum, UBIFS_TRUN_NODE_SZ);
1159 if (err)
1160 goto out_ro;
1161
1162 bit = new_size & (UBIFS_BLOCK_SIZE - 1);
1163 blk = (new_size >> UBIFS_BLOCK_SHIFT) + (bit ? 1 : 0);
1164 data_key_init(c, &key, inum, blk);
1165
1166 bit = old_size & (UBIFS_BLOCK_SIZE - 1);
1167 blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0: 1);
1168 data_key_init(c, &to_key, inum, blk);
1169
1170 err = ubifs_tnc_remove_range(c, &key, &to_key);
1171 if (err)
1172 goto out_ro;
1173
1174 finish_reservation(c);
1175 spin_lock(&ui->ui_lock);
1176 ui->synced_i_size = ui->ui_size;
1177 spin_unlock(&ui->ui_lock);
1178 mark_inode_clean(c, ui);
1179 kfree(ino);
1180 return 0;
1181
1182out_release:
1183 release_head(c, BASEHD);
1184out_ro:
1185 ubifs_ro_mode(c, err);
1186 finish_reservation(c);
1187out_free:
1188 kfree(ino);
1189 return err;
1190}
1191
1192#ifdef CONFIG_UBIFS_FS_XATTR
1193
1194/**
1195 * ubifs_jnl_delete_xattr - delete an extended attribute.
1196 * @c: UBIFS file-system description object
1197 * @host: host inode
1198 * @inode: extended attribute inode
1199 * @nm: extended attribute entry name
1200 *
1201 * This function delete an extended attribute which is very similar to
1202 * un-linking regular files - it writes a deletion xentry, a deletion inode and
1203 * updates the target inode. Returns zero in case of success and a negative
1204 * error code in case of failure.
1205 */
1206int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
1207 const struct inode *inode, const struct qstr *nm)
1208{
1209 int err, xlen, hlen, len, lnum, xent_offs, aligned_xlen;
1210 struct ubifs_dent_node *xent;
1211 struct ubifs_ino_node *ino;
1212 union ubifs_key xent_key, key1, key2;
1213 int sync = IS_DIRSYNC(host);
1214 struct ubifs_inode *host_ui = ubifs_inode(host);
1215
1216 dbg_jnl("host %lu, xattr ino %lu, name '%s', data len %d",
1217 host->i_ino, inode->i_ino, nm->name,
1218 ubifs_inode(inode)->data_len);
1219 ubifs_assert(inode->i_nlink == 0);
1220 ubifs_assert(mutex_is_locked(&host_ui->ui_mutex));
1221
1222 /*
1223 * Since we are deleting the inode, we do not bother to attach any data
1224 * to it and assume its length is %UBIFS_INO_NODE_SZ.
1225 */
1226 xlen = UBIFS_DENT_NODE_SZ + nm->len + 1;
1227 aligned_xlen = ALIGN(xlen, 8);
1228 hlen = host_ui->data_len + UBIFS_INO_NODE_SZ;
1229 len = aligned_xlen + UBIFS_INO_NODE_SZ + ALIGN(hlen, 8);
1230
1231 xent = kmalloc(len, GFP_NOFS);
1232 if (!xent)
1233 return -ENOMEM;
1234
1235 /* Make reservation before allocating sequence numbers */
1236 err = make_reservation(c, BASEHD, len);
1237 if (err) {
1238 kfree(xent);
1239 return err;
1240 }
1241
1242 xent->ch.node_type = UBIFS_XENT_NODE;
1243 xent_key_init(c, &xent_key, host->i_ino, nm);
1244 key_write(c, &xent_key, xent->key);
1245 xent->inum = 0;
1246 xent->type = get_dent_type(inode->i_mode);
1247 xent->nlen = cpu_to_le16(nm->len);
1248 memcpy(xent->name, nm->name, nm->len);
1249 xent->name[nm->len] = '\0';
1250 zero_dent_node_unused(xent);
1251 ubifs_prep_grp_node(c, xent, xlen, 0);
1252
1253 ino = (void *)xent + aligned_xlen;
1254 pack_inode(c, ino, inode, 0, 1);
1255 ino = (void *)ino + UBIFS_INO_NODE_SZ;
1256 pack_inode(c, ino, host, 1, 0);
1257
1258 err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync);
1259 if (!sync && !err)
1260 ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, host->i_ino);
1261 release_head(c, BASEHD);
1262 kfree(xent);
1263 if (err)
1264 goto out_ro;
1265
1266 /* Remove the extended attribute entry from TNC */
1267 err = ubifs_tnc_remove_nm(c, &xent_key, nm);
1268 if (err)
1269 goto out_ro;
1270 err = ubifs_add_dirt(c, lnum, xlen);
1271 if (err)
1272 goto out_ro;
1273
1274 /*
1275 * Remove all nodes belonging to the extended attribute inode from TNC.
1276 * Well, there actually must be only one node - the inode itself.
1277 */
1278 lowest_ino_key(c, &key1, inode->i_ino);
1279 highest_ino_key(c, &key2, inode->i_ino);
1280 err = ubifs_tnc_remove_range(c, &key1, &key2);
1281 if (err)
1282 goto out_ro;
1283 err = ubifs_add_dirt(c, lnum, UBIFS_INO_NODE_SZ);
1284 if (err)
1285 goto out_ro;
1286
1287 /* And update TNC with the new host inode position */
1288 ino_key_init(c, &key1, host->i_ino);
1289 err = ubifs_tnc_add(c, &key1, lnum, xent_offs + len - hlen, hlen);
1290 if (err)
1291 goto out_ro;
1292
1293 finish_reservation(c);
1294 spin_lock(&host_ui->ui_lock);
1295 host_ui->synced_i_size = host_ui->ui_size;
1296 spin_unlock(&host_ui->ui_lock);
1297 mark_inode_clean(c, host_ui);
1298 return 0;
1299
1300out_ro:
1301 ubifs_ro_mode(c, err);
1302 finish_reservation(c);
1303 return err;
1304}
1305
1306/**
1307 * ubifs_jnl_change_xattr - change an extended attribute.
1308 * @c: UBIFS file-system description object
1309 * @inode: extended attribute inode
1310 * @host: host inode
1311 *
1312 * This function writes the updated version of an extended attribute inode and
1313 * the host inode tho the journal (to the base head). The host inode is written
1314 * after the extended attribute inode in order to guarantee that the extended
1315 * attribute will be flushed when the inode is synchronized by 'fsync()' and
1316 * consequently, the write-buffer is synchronized. This function returns zero
1317 * in case of success and a negative error code in case of failure.
1318 */
1319int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode,
1320 const struct inode *host)
1321{
1322 int err, len1, len2, aligned_len, aligned_len1, lnum, offs;
1323 struct ubifs_inode *host_ui = ubifs_inode(inode);
1324 struct ubifs_ino_node *ino;
1325 union ubifs_key key;
1326 int sync = IS_DIRSYNC(host);
1327
1328 dbg_jnl("ino %lu, ino %lu", host->i_ino, inode->i_ino);
1329 ubifs_assert(host->i_nlink > 0);
1330 ubifs_assert(inode->i_nlink > 0);
1331 ubifs_assert(mutex_is_locked(&host_ui->ui_mutex));
1332
1333 len1 = UBIFS_INO_NODE_SZ + host_ui->data_len;
1334 len2 = UBIFS_INO_NODE_SZ + ubifs_inode(inode)->data_len;
1335 aligned_len1 = ALIGN(len1, 8);
1336 aligned_len = aligned_len1 + ALIGN(len2, 8);
1337
1338 ino = kmalloc(aligned_len, GFP_NOFS);
1339 if (!ino)
1340 return -ENOMEM;
1341
1342 /* Make reservation before allocating sequence numbers */
1343 err = make_reservation(c, BASEHD, aligned_len);
1344 if (err)
1345 goto out_free;
1346
1347 pack_inode(c, ino, host, 0, 0);
1348 pack_inode(c, (void *)ino + aligned_len1, inode, 1, 0);
1349
1350 err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0);
1351 if (!sync && !err) {
1352 struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf;
1353
1354 ubifs_wbuf_add_ino_nolock(wbuf, host->i_ino);
1355 ubifs_wbuf_add_ino_nolock(wbuf, inode->i_ino);
1356 }
1357 release_head(c, BASEHD);
1358 if (err)
1359 goto out_ro;
1360
1361 ino_key_init(c, &key, host->i_ino);
1362 err = ubifs_tnc_add(c, &key, lnum, offs, len1);
1363 if (err)
1364 goto out_ro;
1365
1366 ino_key_init(c, &key, inode->i_ino);
1367 err = ubifs_tnc_add(c, &key, lnum, offs + aligned_len1, len2);
1368 if (err)
1369 goto out_ro;
1370
1371 finish_reservation(c);
1372 spin_lock(&host_ui->ui_lock);
1373 host_ui->synced_i_size = host_ui->ui_size;
1374 spin_unlock(&host_ui->ui_lock);
1375 mark_inode_clean(c, host_ui);
1376 kfree(ino);
1377 return 0;
1378
1379out_ro:
1380 ubifs_ro_mode(c, err);
1381 finish_reservation(c);
1382out_free:
1383 kfree(ino);
1384 return err;
1385}
1386
1387#endif /* CONFIG_UBIFS_FS_XATTR */
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
new file mode 100644
index 000000000000..8f7476007549
--- /dev/null
+++ b/fs/ubifs/key.h
@@ -0,0 +1,533 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/*
24 * This header contains various key-related definitions and helper functions.
25 * UBIFS allows several key schemes, so we access key fields only via these
26 * helpers. At the moment only one key scheme is supported.
27 *
28 * Simple key scheme
29 * ~~~~~~~~~~~~~~~~~
30 *
31 * Keys are 64-bits long. First 32-bits are inode number (parent inode number
32 * in case of direntry key). Next 3 bits are node type. The last 29 bits are
33 * 4KiB offset in case of inode node, and direntry hash in case of a direntry
34 * node. We use "r5" hash borrowed from reiserfs.
35 */
36
37#ifndef __UBIFS_KEY_H__
38#define __UBIFS_KEY_H__
39
40/**
41 * key_r5_hash - R5 hash function (borrowed from reiserfs).
42 * @s: direntry name
43 * @len: name length
44 */
45static inline uint32_t key_r5_hash(const char *s, int len)
46{
47 uint32_t a = 0;
48 const signed char *str = (const signed char *)s;
49
50 while (*str) {
51 a += *str << 4;
52 a += *str >> 4;
53 a *= 11;
54 str++;
55 }
56
57 a &= UBIFS_S_KEY_HASH_MASK;
58
59 /*
60 * We use hash values as offset in directories, so values %0 and %1 are
61 * reserved for "." and "..". %2 is reserved for "end of readdir"
62 * marker.
63 */
64 if (unlikely(a >= 0 && a <= 2))
65 a += 3;
66 return a;
67}
68
69/**
70 * key_test_hash - testing hash function.
71 * @str: direntry name
72 * @len: name length
73 */
74static inline uint32_t key_test_hash(const char *str, int len)
75{
76 uint32_t a = 0;
77
78 len = min_t(uint32_t, len, 4);
79 memcpy(&a, str, len);
80 a &= UBIFS_S_KEY_HASH_MASK;
81 if (unlikely(a >= 0 && a <= 2))
82 a += 3;
83 return a;
84}
85
86/**
87 * ino_key_init - initialize inode key.
88 * @c: UBIFS file-system description object
89 * @key: key to initialize
90 * @inum: inode number
91 */
92static inline void ino_key_init(const struct ubifs_info *c,
93 union ubifs_key *key, ino_t inum)
94{
95 key->u32[0] = inum;
96 key->u32[1] = UBIFS_INO_KEY << UBIFS_S_KEY_BLOCK_BITS;
97}
98
99/**
100 * ino_key_init_flash - initialize on-flash inode key.
101 * @c: UBIFS file-system description object
102 * @k: key to initialize
103 * @inum: inode number
104 */
105static inline void ino_key_init_flash(const struct ubifs_info *c, void *k,
106 ino_t inum)
107{
108 union ubifs_key *key = k;
109
110 key->j32[0] = cpu_to_le32(inum);
111 key->j32[1] = cpu_to_le32(UBIFS_INO_KEY << UBIFS_S_KEY_BLOCK_BITS);
112 memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
113}
114
115/**
116 * lowest_ino_key - get the lowest possible inode key.
117 * @c: UBIFS file-system description object
118 * @key: key to initialize
119 * @inum: inode number
120 */
121static inline void lowest_ino_key(const struct ubifs_info *c,
122 union ubifs_key *key, ino_t inum)
123{
124 key->u32[0] = inum;
125 key->u32[1] = 0;
126}
127
128/**
129 * highest_ino_key - get the highest possible inode key.
130 * @c: UBIFS file-system description object
131 * @key: key to initialize
132 * @inum: inode number
133 */
134static inline void highest_ino_key(const struct ubifs_info *c,
135 union ubifs_key *key, ino_t inum)
136{
137 key->u32[0] = inum;
138 key->u32[1] = 0xffffffff;
139}
140
141/**
142 * dent_key_init - initialize directory entry key.
143 * @c: UBIFS file-system description object
144 * @key: key to initialize
145 * @inum: parent inode number
146 * @nm: direntry name and length
147 */
148static inline void dent_key_init(const struct ubifs_info *c,
149 union ubifs_key *key, ino_t inum,
150 const struct qstr *nm)
151{
152 uint32_t hash = c->key_hash(nm->name, nm->len);
153
154 ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
155 key->u32[0] = inum;
156 key->u32[1] = hash | (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS);
157}
158
159/**
160 * dent_key_init_hash - initialize directory entry key without re-calculating
161 * hash function.
162 * @c: UBIFS file-system description object
163 * @key: key to initialize
164 * @inum: parent inode number
165 * @hash: direntry name hash
166 */
167static inline void dent_key_init_hash(const struct ubifs_info *c,
168 union ubifs_key *key, ino_t inum,
169 uint32_t hash)
170{
171 ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
172 key->u32[0] = inum;
173 key->u32[1] = hash | (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS);
174}
175
176/**
177 * dent_key_init_flash - initialize on-flash directory entry key.
178 * @c: UBIFS file-system description object
179 * @k: key to initialize
180 * @inum: parent inode number
181 * @nm: direntry name and length
182 */
183static inline void dent_key_init_flash(const struct ubifs_info *c, void *k,
184 ino_t inum, const struct qstr *nm)
185{
186 union ubifs_key *key = k;
187 uint32_t hash = c->key_hash(nm->name, nm->len);
188
189 ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
190 key->j32[0] = cpu_to_le32(inum);
191 key->j32[1] = cpu_to_le32(hash |
192 (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS));
193 memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
194}
195
196/**
197 * lowest_dent_key - get the lowest possible directory entry key.
198 * @c: UBIFS file-system description object
199 * @key: where to store the lowest key
200 * @inum: parent inode number
201 */
202static inline void lowest_dent_key(const struct ubifs_info *c,
203 union ubifs_key *key, ino_t inum)
204{
205 key->u32[0] = inum;
206 key->u32[1] = UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS;
207}
208
209/**
210 * xent_key_init - initialize extended attribute entry key.
211 * @c: UBIFS file-system description object
212 * @key: key to initialize
213 * @inum: host inode number
214 * @nm: extended attribute entry name and length
215 */
216static inline void xent_key_init(const struct ubifs_info *c,
217 union ubifs_key *key, ino_t inum,
218 const struct qstr *nm)
219{
220 uint32_t hash = c->key_hash(nm->name, nm->len);
221
222 ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
223 key->u32[0] = inum;
224 key->u32[1] = hash | (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS);
225}
226
227/**
228 * xent_key_init_hash - initialize extended attribute entry key without
229 * re-calculating hash function.
230 * @c: UBIFS file-system description object
231 * @key: key to initialize
232 * @inum: host inode number
233 * @hash: extended attribute entry name hash
234 */
235static inline void xent_key_init_hash(const struct ubifs_info *c,
236 union ubifs_key *key, ino_t inum,
237 uint32_t hash)
238{
239 ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
240 key->u32[0] = inum;
241 key->u32[1] = hash | (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS);
242}
243
244/**
245 * xent_key_init_flash - initialize on-flash extended attribute entry key.
246 * @c: UBIFS file-system description object
247 * @k: key to initialize
248 * @inum: host inode number
249 * @nm: extended attribute entry name and length
250 */
251static inline void xent_key_init_flash(const struct ubifs_info *c, void *k,
252 ino_t inum, const struct qstr *nm)
253{
254 union ubifs_key *key = k;
255 uint32_t hash = c->key_hash(nm->name, nm->len);
256
257 ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
258 key->j32[0] = cpu_to_le32(inum);
259 key->j32[1] = cpu_to_le32(hash |
260 (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS));
261 memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
262}
263
264/**
265 * lowest_xent_key - get the lowest possible extended attribute entry key.
266 * @c: UBIFS file-system description object
267 * @key: where to store the lowest key
268 * @inum: host inode number
269 */
270static inline void lowest_xent_key(const struct ubifs_info *c,
271 union ubifs_key *key, ino_t inum)
272{
273 key->u32[0] = inum;
274 key->u32[1] = UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS;
275}
276
277/**
278 * data_key_init - initialize data key.
279 * @c: UBIFS file-system description object
280 * @key: key to initialize
281 * @inum: inode number
282 * @block: block number
283 */
284static inline void data_key_init(const struct ubifs_info *c,
285 union ubifs_key *key, ino_t inum,
286 unsigned int block)
287{
288 ubifs_assert(!(block & ~UBIFS_S_KEY_BLOCK_MASK));
289 key->u32[0] = inum;
290 key->u32[1] = block | (UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS);
291}
292
293/**
294 * data_key_init_flash - initialize on-flash data key.
295 * @c: UBIFS file-system description object
296 * @k: key to initialize
297 * @inum: inode number
298 * @block: block number
299 */
300static inline void data_key_init_flash(const struct ubifs_info *c, void *k,
301 ino_t inum, unsigned int block)
302{
303 union ubifs_key *key = k;
304
305 ubifs_assert(!(block & ~UBIFS_S_KEY_BLOCK_MASK));
306 key->j32[0] = cpu_to_le32(inum);
307 key->j32[1] = cpu_to_le32(block |
308 (UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS));
309 memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
310}
311
312/**
313 * trun_key_init - initialize truncation node key.
314 * @c: UBIFS file-system description object
315 * @key: key to initialize
316 * @inum: inode number
317 *
318 * Note, UBIFS does not have truncation keys on the media and this function is
319 * only used for purposes of replay.
320 */
321static inline void trun_key_init(const struct ubifs_info *c,
322 union ubifs_key *key, ino_t inum)
323{
324 key->u32[0] = inum;
325 key->u32[1] = UBIFS_TRUN_KEY << UBIFS_S_KEY_BLOCK_BITS;
326}
327
328/**
329 * key_type - get key type.
330 * @c: UBIFS file-system description object
331 * @key: key to get type of
332 */
333static inline int key_type(const struct ubifs_info *c,
334 const union ubifs_key *key)
335{
336 return key->u32[1] >> UBIFS_S_KEY_BLOCK_BITS;
337}
338
339/**
340 * key_type_flash - get type of a on-flash formatted key.
341 * @c: UBIFS file-system description object
342 * @k: key to get type of
343 */
344static inline int key_type_flash(const struct ubifs_info *c, const void *k)
345{
346 const union ubifs_key *key = k;
347
348 return le32_to_cpu(key->u32[1]) >> UBIFS_S_KEY_BLOCK_BITS;
349}
350
351/**
352 * key_inum - fetch inode number from key.
353 * @c: UBIFS file-system description object
354 * @k: key to fetch inode number from
355 */
356static inline ino_t key_inum(const struct ubifs_info *c, const void *k)
357{
358 const union ubifs_key *key = k;
359
360 return key->u32[0];
361}
362
363/**
364 * key_inum_flash - fetch inode number from an on-flash formatted key.
365 * @c: UBIFS file-system description object
366 * @k: key to fetch inode number from
367 */
368static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k)
369{
370 const union ubifs_key *key = k;
371
372 return le32_to_cpu(key->j32[0]);
373}
374
375/**
376 * key_hash - get directory entry hash.
377 * @c: UBIFS file-system description object
378 * @key: the key to get hash from
379 */
380static inline int key_hash(const struct ubifs_info *c,
381 const union ubifs_key *key)
382{
383 return key->u32[1] & UBIFS_S_KEY_HASH_MASK;
384}
385
386/**
387 * key_hash_flash - get directory entry hash from an on-flash formatted key.
388 * @c: UBIFS file-system description object
389 * @k: the key to get hash from
390 */
391static inline int key_hash_flash(const struct ubifs_info *c, const void *k)
392{
393 const union ubifs_key *key = k;
394
395 return le32_to_cpu(key->j32[1]) & UBIFS_S_KEY_HASH_MASK;
396}
397
398/**
399 * key_block - get data block number.
400 * @c: UBIFS file-system description object
401 * @key: the key to get the block number from
402 */
403static inline unsigned int key_block(const struct ubifs_info *c,
404 const union ubifs_key *key)
405{
406 return key->u32[1] & UBIFS_S_KEY_BLOCK_MASK;
407}
408
409/**
410 * key_block_flash - get data block number from an on-flash formatted key.
411 * @c: UBIFS file-system description object
412 * @k: the key to get the block number from
413 */
414static inline unsigned int key_block_flash(const struct ubifs_info *c,
415 const void *k)
416{
417 const union ubifs_key *key = k;
418
419 return le32_to_cpu(key->u32[1]) & UBIFS_S_KEY_BLOCK_MASK;
420}
421
422/**
423 * key_read - transform a key to in-memory format.
424 * @c: UBIFS file-system description object
425 * @from: the key to transform
426 * @to: the key to store the result
427 */
428static inline void key_read(const struct ubifs_info *c, const void *from,
429 union ubifs_key *to)
430{
431 const union ubifs_key *f = from;
432
433 to->u32[0] = le32_to_cpu(f->j32[0]);
434 to->u32[1] = le32_to_cpu(f->j32[1]);
435}
436
437/**
438 * key_write - transform a key from in-memory format.
439 * @c: UBIFS file-system description object
440 * @from: the key to transform
441 * @to: the key to store the result
442 */
443static inline void key_write(const struct ubifs_info *c,
444 const union ubifs_key *from, void *to)
445{
446 union ubifs_key *t = to;
447
448 t->j32[0] = cpu_to_le32(from->u32[0]);
449 t->j32[1] = cpu_to_le32(from->u32[1]);
450 memset(to + 8, 0, UBIFS_MAX_KEY_LEN - 8);
451}
452
453/**
454 * key_write_idx - transform a key from in-memory format for the index.
455 * @c: UBIFS file-system description object
456 * @from: the key to transform
457 * @to: the key to store the result
458 */
459static inline void key_write_idx(const struct ubifs_info *c,
460 const union ubifs_key *from, void *to)
461{
462 union ubifs_key *t = to;
463
464 t->j32[0] = cpu_to_le32(from->u32[0]);
465 t->j32[1] = cpu_to_le32(from->u32[1]);
466}
467
468/**
469 * key_copy - copy a key.
470 * @c: UBIFS file-system description object
471 * @from: the key to copy from
472 * @to: the key to copy to
473 */
474static inline void key_copy(const struct ubifs_info *c,
475 const union ubifs_key *from, union ubifs_key *to)
476{
477 to->u64[0] = from->u64[0];
478}
479
480/**
481 * keys_cmp - compare keys.
482 * @c: UBIFS file-system description object
483 * @key1: the first key to compare
484 * @key2: the second key to compare
485 *
486 * This function compares 2 keys and returns %-1 if @key1 is less than
487 * @key2, 0 if the keys are equivalent and %1 if @key1 is greater than @key2.
488 */
489static inline int keys_cmp(const struct ubifs_info *c,
490 const union ubifs_key *key1,
491 const union ubifs_key *key2)
492{
493 if (key1->u32[0] < key2->u32[0])
494 return -1;
495 if (key1->u32[0] > key2->u32[0])
496 return 1;
497 if (key1->u32[1] < key2->u32[1])
498 return -1;
499 if (key1->u32[1] > key2->u32[1])
500 return 1;
501
502 return 0;
503}
504
505/**
506 * is_hash_key - is a key vulnerable to hash collisions.
507 * @c: UBIFS file-system description object
508 * @key: key
509 *
510 * This function returns %1 if @key is a hashed key or %0 otherwise.
511 */
512static inline int is_hash_key(const struct ubifs_info *c,
513 const union ubifs_key *key)
514{
515 int type = key_type(c, key);
516
517 return type == UBIFS_DENT_KEY || type == UBIFS_XENT_KEY;
518}
519
520/**
521 * key_max_inode_size - get maximum file size allowed by current key format.
522 * @c: UBIFS file-system description object
523 */
524static inline unsigned long long key_max_inode_size(const struct ubifs_info *c)
525{
526 switch (c->key_fmt) {
527 case UBIFS_SIMPLE_KEY_FMT:
528 return (1ULL << UBIFS_S_KEY_BLOCK_BITS) * UBIFS_BLOCK_SIZE;
529 default:
530 return 0;
531 }
532}
533#endif /* !__UBIFS_KEY_H__ */
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
new file mode 100644
index 000000000000..36857b9ed59e
--- /dev/null
+++ b/fs/ubifs/log.c
@@ -0,0 +1,805 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/*
24 * This file is a part of UBIFS journal implementation and contains various
25 * functions which manipulate the log. The log is a fixed area on the flash
26 * which does not contain any data but refers to buds. The log is a part of the
27 * journal.
28 */
29
30#include "ubifs.h"
31
32#ifdef CONFIG_UBIFS_FS_DEBUG
33static int dbg_check_bud_bytes(struct ubifs_info *c);
34#else
35#define dbg_check_bud_bytes(c) 0
36#endif
37
38/**
39 * ubifs_search_bud - search bud LEB.
40 * @c: UBIFS file-system description object
41 * @lnum: logical eraseblock number to search
42 *
43 * This function searches bud LEB @lnum. Returns bud description object in case
44 * of success and %NULL if there is no bud with this LEB number.
45 */
46struct ubifs_bud *ubifs_search_bud(struct ubifs_info *c, int lnum)
47{
48 struct rb_node *p;
49 struct ubifs_bud *bud;
50
51 spin_lock(&c->buds_lock);
52 p = c->buds.rb_node;
53 while (p) {
54 bud = rb_entry(p, struct ubifs_bud, rb);
55 if (lnum < bud->lnum)
56 p = p->rb_left;
57 else if (lnum > bud->lnum)
58 p = p->rb_right;
59 else {
60 spin_unlock(&c->buds_lock);
61 return bud;
62 }
63 }
64 spin_unlock(&c->buds_lock);
65 return NULL;
66}
67
68/**
69 * ubifs_get_wbuf - get the wbuf associated with a LEB, if there is one.
70 * @c: UBIFS file-system description object
71 * @lnum: logical eraseblock number to search
72 *
73 * This functions returns the wbuf for @lnum or %NULL if there is not one.
74 */
75struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum)
76{
77 struct rb_node *p;
78 struct ubifs_bud *bud;
79 int jhead;
80
81 if (!c->jheads)
82 return NULL;
83
84 spin_lock(&c->buds_lock);
85 p = c->buds.rb_node;
86 while (p) {
87 bud = rb_entry(p, struct ubifs_bud, rb);
88 if (lnum < bud->lnum)
89 p = p->rb_left;
90 else if (lnum > bud->lnum)
91 p = p->rb_right;
92 else {
93 jhead = bud->jhead;
94 spin_unlock(&c->buds_lock);
95 return &c->jheads[jhead].wbuf;
96 }
97 }
98 spin_unlock(&c->buds_lock);
99 return NULL;
100}
101
102/**
103 * next_log_lnum - switch to the next log LEB.
104 * @c: UBIFS file-system description object
105 * @lnum: current log LEB
106 */
107static inline int next_log_lnum(const struct ubifs_info *c, int lnum)
108{
109 lnum += 1;
110 if (lnum > c->log_last)
111 lnum = UBIFS_LOG_LNUM;
112
113 return lnum;
114}
115
116/**
117 * empty_log_bytes - calculate amount of empty space in the log.
118 * @c: UBIFS file-system description object
119 */
120static inline long long empty_log_bytes(const struct ubifs_info *c)
121{
122 long long h, t;
123
124 h = (long long)c->lhead_lnum * c->leb_size + c->lhead_offs;
125 t = (long long)c->ltail_lnum * c->leb_size;
126
127 if (h >= t)
128 return c->log_bytes - h + t;
129 else
130 return t - h;
131}
132
133/**
134 * ubifs_add_bud - add bud LEB to the tree of buds and its journal head list.
135 * @c: UBIFS file-system description object
136 * @bud: the bud to add
137 */
138void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud)
139{
140 struct rb_node **p, *parent = NULL;
141 struct ubifs_bud *b;
142 struct ubifs_jhead *jhead;
143
144 spin_lock(&c->buds_lock);
145 p = &c->buds.rb_node;
146 while (*p) {
147 parent = *p;
148 b = rb_entry(parent, struct ubifs_bud, rb);
149 ubifs_assert(bud->lnum != b->lnum);
150 if (bud->lnum < b->lnum)
151 p = &(*p)->rb_left;
152 else
153 p = &(*p)->rb_right;
154 }
155
156 rb_link_node(&bud->rb, parent, p);
157 rb_insert_color(&bud->rb, &c->buds);
158 if (c->jheads) {
159 jhead = &c->jheads[bud->jhead];
160 list_add_tail(&bud->list, &jhead->buds_list);
161 } else
162 ubifs_assert(c->replaying && (c->vfs_sb->s_flags & MS_RDONLY));
163
164 /*
165 * Note, although this is a new bud, we anyway account this space now,
166 * before any data has been written to it, because this is about to
167 * guarantee fixed mount time, and this bud will anyway be read and
168 * scanned.
169 */
170 c->bud_bytes += c->leb_size - bud->start;
171
172 dbg_log("LEB %d:%d, jhead %d, bud_bytes %lld", bud->lnum,
173 bud->start, bud->jhead, c->bud_bytes);
174 spin_unlock(&c->buds_lock);
175}
176
177/**
178 * ubifs_create_buds_lists - create journal head buds lists for remount rw.
179 * @c: UBIFS file-system description object
180 */
181void ubifs_create_buds_lists(struct ubifs_info *c)
182{
183 struct rb_node *p;
184
185 spin_lock(&c->buds_lock);
186 p = rb_first(&c->buds);
187 while (p) {
188 struct ubifs_bud *bud = rb_entry(p, struct ubifs_bud, rb);
189 struct ubifs_jhead *jhead = &c->jheads[bud->jhead];
190
191 list_add_tail(&bud->list, &jhead->buds_list);
192 p = rb_next(p);
193 }
194 spin_unlock(&c->buds_lock);
195}
196
197/**
198 * ubifs_add_bud_to_log - add a new bud to the log.
199 * @c: UBIFS file-system description object
200 * @jhead: journal head the bud belongs to
201 * @lnum: LEB number of the bud
202 * @offs: starting offset of the bud
203 *
204 * This function writes reference node for the new bud LEB @lnum it to the log,
205 * and adds it to the buds tress. It also makes sure that log size does not
206 * exceed the 'c->max_bud_bytes' limit. Returns zero in case of success,
207 * %-EAGAIN if commit is required, and a negative error codes in case of
208 * failure.
209 */
210int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
211{
212 int err;
213 struct ubifs_bud *bud;
214 struct ubifs_ref_node *ref;
215
216 bud = kmalloc(sizeof(struct ubifs_bud), GFP_NOFS);
217 if (!bud)
218 return -ENOMEM;
219 ref = kzalloc(c->ref_node_alsz, GFP_NOFS);
220 if (!ref) {
221 kfree(bud);
222 return -ENOMEM;
223 }
224
225 mutex_lock(&c->log_mutex);
226
227 if (c->ro_media) {
228 err = -EROFS;
229 goto out_unlock;
230 }
231
232 /* Make sure we have enough space in the log */
233 if (empty_log_bytes(c) - c->ref_node_alsz < c->min_log_bytes) {
234 dbg_log("not enough log space - %lld, required %d",
235 empty_log_bytes(c), c->min_log_bytes);
236 ubifs_commit_required(c);
237 err = -EAGAIN;
238 goto out_unlock;
239 }
240
241 /*
242 * Make sure the the amount of space in buds will not exceed
243 * 'c->max_bud_bytes' limit, because we want to guarantee mount time
244 * limits.
245 *
246 * It is not necessary to hold @c->buds_lock when reading @c->bud_bytes
247 * because we are holding @c->log_mutex. All @c->bud_bytes take place
248 * when both @c->log_mutex and @c->bud_bytes are locked.
249 */
250 if (c->bud_bytes + c->leb_size - offs > c->max_bud_bytes) {
251 dbg_log("bud bytes %lld (%lld max), require commit",
252 c->bud_bytes, c->max_bud_bytes);
253 ubifs_commit_required(c);
254 err = -EAGAIN;
255 goto out_unlock;
256 }
257
258 /*
259 * If the journal is full enough - start background commit. Note, it is
260 * OK to read 'c->cmt_state' without spinlock because integer reads
261 * are atomic in the kernel.
262 */
263 if (c->bud_bytes >= c->bg_bud_bytes &&
264 c->cmt_state == COMMIT_RESTING) {
265 dbg_log("bud bytes %lld (%lld max), initiate BG commit",
266 c->bud_bytes, c->max_bud_bytes);
267 ubifs_request_bg_commit(c);
268 }
269
270 bud->lnum = lnum;
271 bud->start = offs;
272 bud->jhead = jhead;
273
274 ref->ch.node_type = UBIFS_REF_NODE;
275 ref->lnum = cpu_to_le32(bud->lnum);
276 ref->offs = cpu_to_le32(bud->start);
277 ref->jhead = cpu_to_le32(jhead);
278
279 if (c->lhead_offs > c->leb_size - c->ref_node_alsz) {
280 c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
281 c->lhead_offs = 0;
282 }
283
284 if (c->lhead_offs == 0) {
285 /* Must ensure next log LEB has been unmapped */
286 err = ubifs_leb_unmap(c, c->lhead_lnum);
287 if (err)
288 goto out_unlock;
289 }
290
291 if (bud->start == 0) {
292 /*
293 * Before writing the LEB reference which refers an empty LEB
294 * to the log, we have to make sure it is mapped, because
295 * otherwise we'd risk to refer an LEB with garbage in case of
296 * an unclean reboot, because the target LEB might have been
297 * unmapped, but not yet physically erased.
298 */
299 err = ubi_leb_map(c->ubi, bud->lnum, UBI_SHORTTERM);
300 if (err)
301 goto out_unlock;
302 }
303
304 dbg_log("write ref LEB %d:%d",
305 c->lhead_lnum, c->lhead_offs);
306 err = ubifs_write_node(c, ref, UBIFS_REF_NODE_SZ, c->lhead_lnum,
307 c->lhead_offs, UBI_SHORTTERM);
308 if (err)
309 goto out_unlock;
310
311 c->lhead_offs += c->ref_node_alsz;
312
313 ubifs_add_bud(c, bud);
314
315 mutex_unlock(&c->log_mutex);
316 kfree(ref);
317 return 0;
318
319out_unlock:
320 mutex_unlock(&c->log_mutex);
321 kfree(ref);
322 kfree(bud);
323 return err;
324}
325
326/**
327 * remove_buds - remove used buds.
328 * @c: UBIFS file-system description object
329 *
330 * This function removes use buds from the buds tree. It does not remove the
331 * buds which are pointed to by journal heads.
332 */
333static void remove_buds(struct ubifs_info *c)
334{
335 struct rb_node *p;
336
337 ubifs_assert(list_empty(&c->old_buds));
338 c->cmt_bud_bytes = 0;
339 spin_lock(&c->buds_lock);
340 p = rb_first(&c->buds);
341 while (p) {
342 struct rb_node *p1 = p;
343 struct ubifs_bud *bud;
344 struct ubifs_wbuf *wbuf;
345
346 p = rb_next(p);
347 bud = rb_entry(p1, struct ubifs_bud, rb);
348 wbuf = &c->jheads[bud->jhead].wbuf;
349
350 if (wbuf->lnum == bud->lnum) {
351 /*
352 * Do not remove buds which are pointed to by journal
353 * heads (non-closed buds).
354 */
355 c->cmt_bud_bytes += wbuf->offs - bud->start;
356 dbg_log("preserve %d:%d, jhead %d, bud bytes %d, "
357 "cmt_bud_bytes %lld", bud->lnum, bud->start,
358 bud->jhead, wbuf->offs - bud->start,
359 c->cmt_bud_bytes);
360 bud->start = wbuf->offs;
361 } else {
362 c->cmt_bud_bytes += c->leb_size - bud->start;
363 dbg_log("remove %d:%d, jhead %d, bud bytes %d, "
364 "cmt_bud_bytes %lld", bud->lnum, bud->start,
365 bud->jhead, c->leb_size - bud->start,
366 c->cmt_bud_bytes);
367 rb_erase(p1, &c->buds);
368 list_del(&bud->list);
369 /*
370 * If the commit does not finish, the recovery will need
371 * to replay the journal, in which case the old buds
372 * must be unchanged. Do not release them until post
373 * commit i.e. do not allow them to be garbage
374 * collected.
375 */
376 list_add(&bud->list, &c->old_buds);
377 }
378 }
379 spin_unlock(&c->buds_lock);
380}
381
382/**
383 * ubifs_log_start_commit - start commit.
384 * @c: UBIFS file-system description object
385 * @ltail_lnum: return new log tail LEB number
386 *
387 * The commit operation starts with writing "commit start" node to the log and
388 * reference nodes for all journal heads which will define new journal after
389 * the commit has been finished. The commit start and reference nodes are
390 * written in one go to the nearest empty log LEB (hence, when commit is
391 * finished UBIFS may safely unmap all the previous log LEBs). This function
392 * returns zero in case of success and a negative error code in case of
393 * failure.
394 */
395int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
396{
397 void *buf;
398 struct ubifs_cs_node *cs;
399 struct ubifs_ref_node *ref;
400 int err, i, max_len, len;
401
402 err = dbg_check_bud_bytes(c);
403 if (err)
404 return err;
405
406 max_len = UBIFS_CS_NODE_SZ + c->jhead_cnt * UBIFS_REF_NODE_SZ;
407 max_len = ALIGN(max_len, c->min_io_size);
408 buf = cs = kmalloc(max_len, GFP_NOFS);
409 if (!buf)
410 return -ENOMEM;
411
412 cs->ch.node_type = UBIFS_CS_NODE;
413 cs->cmt_no = cpu_to_le64(c->cmt_no + 1);
414 ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0);
415
416 /*
417 * Note, we do not lock 'c->log_mutex' because this is the commit start
418 * phase and we are exclusively using the log. And we do not lock
419 * write-buffer because nobody can write to the file-system at this
420 * phase.
421 */
422
423 len = UBIFS_CS_NODE_SZ;
424 for (i = 0; i < c->jhead_cnt; i++) {
425 int lnum = c->jheads[i].wbuf.lnum;
426 int offs = c->jheads[i].wbuf.offs;
427
428 if (lnum == -1 || offs == c->leb_size)
429 continue;
430
431 dbg_log("add ref to LEB %d:%d for jhead %d", lnum, offs, i);
432 ref = buf + len;
433 ref->ch.node_type = UBIFS_REF_NODE;
434 ref->lnum = cpu_to_le32(lnum);
435 ref->offs = cpu_to_le32(offs);
436 ref->jhead = cpu_to_le32(i);
437
438 ubifs_prepare_node(c, ref, UBIFS_REF_NODE_SZ, 0);
439 len += UBIFS_REF_NODE_SZ;
440 }
441
442 ubifs_pad(c, buf + len, ALIGN(len, c->min_io_size) - len);
443
444 /* Switch to the next log LEB */
445 if (c->lhead_offs) {
446 c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
447 c->lhead_offs = 0;
448 }
449
450 if (c->lhead_offs == 0) {
451 /* Must ensure next LEB has been unmapped */
452 err = ubifs_leb_unmap(c, c->lhead_lnum);
453 if (err)
454 goto out;
455 }
456
457 len = ALIGN(len, c->min_io_size);
458 dbg_log("writing commit start at LEB %d:0, len %d", c->lhead_lnum, len);
459 err = ubifs_leb_write(c, c->lhead_lnum, cs, 0, len, UBI_SHORTTERM);
460 if (err)
461 goto out;
462
463 *ltail_lnum = c->lhead_lnum;
464
465 c->lhead_offs += len;
466 if (c->lhead_offs == c->leb_size) {
467 c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
468 c->lhead_offs = 0;
469 }
470
471 remove_buds(c);
472
473 /*
474 * We have started the commit and now users may use the rest of the log
475 * for new writes.
476 */
477 c->min_log_bytes = 0;
478
479out:
480 kfree(buf);
481 return err;
482}
483
484/**
485 * ubifs_log_end_commit - end commit.
486 * @c: UBIFS file-system description object
487 * @ltail_lnum: new log tail LEB number
488 *
489 * This function is called on when the commit operation was finished. It
490 * moves log tail to new position and unmaps LEBs which contain obsolete data.
491 * Returns zero in case of success and a negative error code in case of
492 * failure.
493 */
494int ubifs_log_end_commit(struct ubifs_info *c, int ltail_lnum)
495{
496 int err;
497
498 /*
499 * At this phase we have to lock 'c->log_mutex' because UBIFS allows FS
500 * writes during commit. Its only short "commit" start phase when
501 * writers are blocked.
502 */
503 mutex_lock(&c->log_mutex);
504
505 dbg_log("old tail was LEB %d:0, new tail is LEB %d:0",
506 c->ltail_lnum, ltail_lnum);
507
508 c->ltail_lnum = ltail_lnum;
509 /*
510 * The commit is finished and from now on it must be guaranteed that
511 * there is always enough space for the next commit.
512 */
513 c->min_log_bytes = c->leb_size;
514
515 spin_lock(&c->buds_lock);
516 c->bud_bytes -= c->cmt_bud_bytes;
517 spin_unlock(&c->buds_lock);
518
519 err = dbg_check_bud_bytes(c);
520
521 mutex_unlock(&c->log_mutex);
522 return err;
523}
524
525/**
526 * ubifs_log_post_commit - things to do after commit is completed.
527 * @c: UBIFS file-system description object
528 * @old_ltail_lnum: old log tail LEB number
529 *
530 * Release buds only after commit is completed, because they must be unchanged
531 * if recovery is needed.
532 *
533 * Unmap log LEBs only after commit is completed, because they may be needed for
534 * recovery.
535 *
536 * This function returns %0 on success and a negative error code on failure.
537 */
538int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum)
539{
540 int lnum, err = 0;
541
542 while (!list_empty(&c->old_buds)) {
543 struct ubifs_bud *bud;
544
545 bud = list_entry(c->old_buds.next, struct ubifs_bud, list);
546 err = ubifs_return_leb(c, bud->lnum);
547 if (err)
548 return err;
549 list_del(&bud->list);
550 kfree(bud);
551 }
552 mutex_lock(&c->log_mutex);
553 for (lnum = old_ltail_lnum; lnum != c->ltail_lnum;
554 lnum = next_log_lnum(c, lnum)) {
555 dbg_log("unmap log LEB %d", lnum);
556 err = ubifs_leb_unmap(c, lnum);
557 if (err)
558 goto out;
559 }
560out:
561 mutex_unlock(&c->log_mutex);
562 return err;
563}
564
565/**
566 * struct done_ref - references that have been done.
567 * @rb: rb-tree node
568 * @lnum: LEB number
569 */
570struct done_ref {
571 struct rb_node rb;
572 int lnum;
573};
574
575/**
576 * done_already - determine if a reference has been done already.
577 * @done_tree: rb-tree to store references that have been done
578 * @lnum: LEB number of reference
579 *
580 * This function returns %1 if the reference has been done, %0 if not, otherwise
581 * a negative error code is returned.
582 */
583static int done_already(struct rb_root *done_tree, int lnum)
584{
585 struct rb_node **p = &done_tree->rb_node, *parent = NULL;
586 struct done_ref *dr;
587
588 while (*p) {
589 parent = *p;
590 dr = rb_entry(parent, struct done_ref, rb);
591 if (lnum < dr->lnum)
592 p = &(*p)->rb_left;
593 else if (lnum > dr->lnum)
594 p = &(*p)->rb_right;
595 else
596 return 1;
597 }
598
599 dr = kzalloc(sizeof(struct done_ref), GFP_NOFS);
600 if (!dr)
601 return -ENOMEM;
602
603 dr->lnum = lnum;
604
605 rb_link_node(&dr->rb, parent, p);
606 rb_insert_color(&dr->rb, done_tree);
607
608 return 0;
609}
610
611/**
612 * destroy_done_tree - destroy the done tree.
613 * @done_tree: done tree to destroy
614 */
615static void destroy_done_tree(struct rb_root *done_tree)
616{
617 struct rb_node *this = done_tree->rb_node;
618 struct done_ref *dr;
619
620 while (this) {
621 if (this->rb_left) {
622 this = this->rb_left;
623 continue;
624 } else if (this->rb_right) {
625 this = this->rb_right;
626 continue;
627 }
628 dr = rb_entry(this, struct done_ref, rb);
629 this = rb_parent(this);
630 if (this) {
631 if (this->rb_left == &dr->rb)
632 this->rb_left = NULL;
633 else
634 this->rb_right = NULL;
635 }
636 kfree(dr);
637 }
638}
639
640/**
641 * add_node - add a node to the consolidated log.
642 * @c: UBIFS file-system description object
643 * @buf: buffer to which to add
644 * @lnum: LEB number to which to write is passed and returned here
645 * @offs: offset to where to write is passed and returned here
646 * @node: node to add
647 *
648 * This function returns %0 on success and a negative error code on failure.
649 */
650static int add_node(struct ubifs_info *c, void *buf, int *lnum, int *offs,
651 void *node)
652{
653 struct ubifs_ch *ch = node;
654 int len = le32_to_cpu(ch->len), remains = c->leb_size - *offs;
655
656 if (len > remains) {
657 int sz = ALIGN(*offs, c->min_io_size), err;
658
659 ubifs_pad(c, buf + *offs, sz - *offs);
660 err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM);
661 if (err)
662 return err;
663 *lnum = next_log_lnum(c, *lnum);
664 *offs = 0;
665 }
666 memcpy(buf + *offs, node, len);
667 *offs += ALIGN(len, 8);
668 return 0;
669}
670
671/**
672 * ubifs_consolidate_log - consolidate the log.
673 * @c: UBIFS file-system description object
674 *
675 * Repeated failed commits could cause the log to be full, but at least 1 LEB is
676 * needed for commit. This function rewrites the reference nodes in the log
677 * omitting duplicates, and failed CS nodes, and leaving no gaps.
678 *
679 * This function returns %0 on success and a negative error code on failure.
680 */
681int ubifs_consolidate_log(struct ubifs_info *c)
682{
683 struct ubifs_scan_leb *sleb;
684 struct ubifs_scan_node *snod;
685 struct rb_root done_tree = RB_ROOT;
686 int lnum, err, first = 1, write_lnum, offs = 0;
687 void *buf;
688
689 dbg_rcvry("log tail LEB %d, log head LEB %d", c->ltail_lnum,
690 c->lhead_lnum);
691 buf = vmalloc(c->leb_size);
692 if (!buf)
693 return -ENOMEM;
694 lnum = c->ltail_lnum;
695 write_lnum = lnum;
696 while (1) {
697 sleb = ubifs_scan(c, lnum, 0, c->sbuf);
698 if (IS_ERR(sleb)) {
699 err = PTR_ERR(sleb);
700 goto out_free;
701 }
702 list_for_each_entry(snod, &sleb->nodes, list) {
703 switch (snod->type) {
704 case UBIFS_REF_NODE: {
705 struct ubifs_ref_node *ref = snod->node;
706 int ref_lnum = le32_to_cpu(ref->lnum);
707
708 err = done_already(&done_tree, ref_lnum);
709 if (err < 0)
710 goto out_scan;
711 if (err != 1) {
712 err = add_node(c, buf, &write_lnum,
713 &offs, snod->node);
714 if (err)
715 goto out_scan;
716 }
717 break;
718 }
719 case UBIFS_CS_NODE:
720 if (!first)
721 break;
722 err = add_node(c, buf, &write_lnum, &offs,
723 snod->node);
724 if (err)
725 goto out_scan;
726 first = 0;
727 break;
728 }
729 }
730 ubifs_scan_destroy(sleb);
731 if (lnum == c->lhead_lnum)
732 break;
733 lnum = next_log_lnum(c, lnum);
734 }
735 if (offs) {
736 int sz = ALIGN(offs, c->min_io_size);
737
738 ubifs_pad(c, buf + offs, sz - offs);
739 err = ubifs_leb_change(c, write_lnum, buf, sz, UBI_SHORTTERM);
740 if (err)
741 goto out_free;
742 offs = ALIGN(offs, c->min_io_size);
743 }
744 destroy_done_tree(&done_tree);
745 vfree(buf);
746 if (write_lnum == c->lhead_lnum) {
747 ubifs_err("log is too full");
748 return -EINVAL;
749 }
750 /* Unmap remaining LEBs */
751 lnum = write_lnum;
752 do {
753 lnum = next_log_lnum(c, lnum);
754 err = ubifs_leb_unmap(c, lnum);
755 if (err)
756 return err;
757 } while (lnum != c->lhead_lnum);
758 c->lhead_lnum = write_lnum;
759 c->lhead_offs = offs;
760 dbg_rcvry("new log head at %d:%d", c->lhead_lnum, c->lhead_offs);
761 return 0;
762
763out_scan:
764 ubifs_scan_destroy(sleb);
765out_free:
766 destroy_done_tree(&done_tree);
767 vfree(buf);
768 return err;
769}
770
771#ifdef CONFIG_UBIFS_FS_DEBUG
772
773/**
774 * dbg_check_bud_bytes - make sure bud bytes calculation are all right.
775 * @c: UBIFS file-system description object
776 *
777 * This function makes sure the amount of flash space used by closed buds
778 * ('c->bud_bytes' is correct). Returns zero in case of success and %-EINVAL in
779 * case of failure.
780 */
781static int dbg_check_bud_bytes(struct ubifs_info *c)
782{
783 int i, err = 0;
784 struct ubifs_bud *bud;
785 long long bud_bytes = 0;
786
787 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
788 return 0;
789
790 spin_lock(&c->buds_lock);
791 for (i = 0; i < c->jhead_cnt; i++)
792 list_for_each_entry(bud, &c->jheads[i].buds_list, list)
793 bud_bytes += c->leb_size - bud->start;
794
795 if (c->bud_bytes != bud_bytes) {
796 ubifs_err("bad bud_bytes %lld, calculated %lld",
797 c->bud_bytes, bud_bytes);
798 err = -EINVAL;
799 }
800 spin_unlock(&c->buds_lock);
801
802 return err;
803}
804
805#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
new file mode 100644
index 000000000000..2ba93da71b65
--- /dev/null
+++ b/fs/ubifs/lprops.c
@@ -0,0 +1,1357 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Adrian Hunter
20 * Artem Bityutskiy (Битюцкий Артём)
21 */
22
23/*
24 * This file implements the functions that access LEB properties and their
25 * categories. LEBs are categorized based on the needs of UBIFS, and the
26 * categories are stored as either heaps or lists to provide a fast way of
27 * finding a LEB in a particular category. For example, UBIFS may need to find
28 * an empty LEB for the journal, or a very dirty LEB for garbage collection.
29 */
30
31#include "ubifs.h"
32
33/**
34 * get_heap_comp_val - get the LEB properties value for heap comparisons.
35 * @lprops: LEB properties
36 * @cat: LEB category
37 */
38static int get_heap_comp_val(struct ubifs_lprops *lprops, int cat)
39{
40 switch (cat) {
41 case LPROPS_FREE:
42 return lprops->free;
43 case LPROPS_DIRTY_IDX:
44 return lprops->free + lprops->dirty;
45 default:
46 return lprops->dirty;
47 }
48}
49
50/**
51 * move_up_lpt_heap - move a new heap entry up as far as possible.
52 * @c: UBIFS file-system description object
53 * @heap: LEB category heap
54 * @lprops: LEB properties to move
55 * @cat: LEB category
56 *
57 * New entries to a heap are added at the bottom and then moved up until the
58 * parent's value is greater. In the case of LPT's category heaps, the value
59 * is either the amount of free space or the amount of dirty space, depending
60 * on the category.
61 */
62static void move_up_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap,
63 struct ubifs_lprops *lprops, int cat)
64{
65 int val1, val2, hpos;
66
67 hpos = lprops->hpos;
68 if (!hpos)
69 return; /* Already top of the heap */
70 val1 = get_heap_comp_val(lprops, cat);
71 /* Compare to parent and, if greater, move up the heap */
72 do {
73 int ppos = (hpos - 1) / 2;
74
75 val2 = get_heap_comp_val(heap->arr[ppos], cat);
76 if (val2 >= val1)
77 return;
78 /* Greater than parent so move up */
79 heap->arr[ppos]->hpos = hpos;
80 heap->arr[hpos] = heap->arr[ppos];
81 heap->arr[ppos] = lprops;
82 lprops->hpos = ppos;
83 hpos = ppos;
84 } while (hpos);
85}
86
87/**
88 * adjust_lpt_heap - move a changed heap entry up or down the heap.
89 * @c: UBIFS file-system description object
90 * @heap: LEB category heap
91 * @lprops: LEB properties to move
92 * @hpos: heap position of @lprops
93 * @cat: LEB category
94 *
95 * Changed entries in a heap are moved up or down until the parent's value is
96 * greater. In the case of LPT's category heaps, the value is either the amount
97 * of free space or the amount of dirty space, depending on the category.
98 */
99static void adjust_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap,
100 struct ubifs_lprops *lprops, int hpos, int cat)
101{
102 int val1, val2, val3, cpos;
103
104 val1 = get_heap_comp_val(lprops, cat);
105 /* Compare to parent and, if greater than parent, move up the heap */
106 if (hpos) {
107 int ppos = (hpos - 1) / 2;
108
109 val2 = get_heap_comp_val(heap->arr[ppos], cat);
110 if (val1 > val2) {
111 /* Greater than parent so move up */
112 while (1) {
113 heap->arr[ppos]->hpos = hpos;
114 heap->arr[hpos] = heap->arr[ppos];
115 heap->arr[ppos] = lprops;
116 lprops->hpos = ppos;
117 hpos = ppos;
118 if (!hpos)
119 return;
120 ppos = (hpos - 1) / 2;
121 val2 = get_heap_comp_val(heap->arr[ppos], cat);
122 if (val1 <= val2)
123 return;
124 /* Still greater than parent so keep going */
125 }
126 }
127 }
128 /* Not greater than parent, so compare to children */
129 while (1) {
130 /* Compare to left child */
131 cpos = hpos * 2 + 1;
132 if (cpos >= heap->cnt)
133 return;
134 val2 = get_heap_comp_val(heap->arr[cpos], cat);
135 if (val1 < val2) {
136 /* Less than left child, so promote biggest child */
137 if (cpos + 1 < heap->cnt) {
138 val3 = get_heap_comp_val(heap->arr[cpos + 1],
139 cat);
140 if (val3 > val2)
141 cpos += 1; /* Right child is bigger */
142 }
143 heap->arr[cpos]->hpos = hpos;
144 heap->arr[hpos] = heap->arr[cpos];
145 heap->arr[cpos] = lprops;
146 lprops->hpos = cpos;
147 hpos = cpos;
148 continue;
149 }
150 /* Compare to right child */
151 cpos += 1;
152 if (cpos >= heap->cnt)
153 return;
154 val3 = get_heap_comp_val(heap->arr[cpos], cat);
155 if (val1 < val3) {
156 /* Less than right child, so promote right child */
157 heap->arr[cpos]->hpos = hpos;
158 heap->arr[hpos] = heap->arr[cpos];
159 heap->arr[cpos] = lprops;
160 lprops->hpos = cpos;
161 hpos = cpos;
162 continue;
163 }
164 return;
165 }
166}
167
168/**
169 * add_to_lpt_heap - add LEB properties to a LEB category heap.
170 * @c: UBIFS file-system description object
171 * @lprops: LEB properties to add
172 * @cat: LEB category
173 *
174 * This function returns %1 if @lprops is added to the heap for LEB category
175 * @cat, otherwise %0 is returned because the heap is full.
176 */
177static int add_to_lpt_heap(struct ubifs_info *c, struct ubifs_lprops *lprops,
178 int cat)
179{
180 struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1];
181
182 if (heap->cnt >= heap->max_cnt) {
183 const int b = LPT_HEAP_SZ / 2 - 1;
184 int cpos, val1, val2;
185
186 /* Compare to some other LEB on the bottom of heap */
187 /* Pick a position kind of randomly */
188 cpos = (((size_t)lprops >> 4) & b) + b;
189 ubifs_assert(cpos >= b);
190 ubifs_assert(cpos < LPT_HEAP_SZ);
191 ubifs_assert(cpos < heap->cnt);
192
193 val1 = get_heap_comp_val(lprops, cat);
194 val2 = get_heap_comp_val(heap->arr[cpos], cat);
195 if (val1 > val2) {
196 struct ubifs_lprops *lp;
197
198 lp = heap->arr[cpos];
199 lp->flags &= ~LPROPS_CAT_MASK;
200 lp->flags |= LPROPS_UNCAT;
201 list_add(&lp->list, &c->uncat_list);
202 lprops->hpos = cpos;
203 heap->arr[cpos] = lprops;
204 move_up_lpt_heap(c, heap, lprops, cat);
205 dbg_check_heap(c, heap, cat, lprops->hpos);
206 return 1; /* Added to heap */
207 }
208 dbg_check_heap(c, heap, cat, -1);
209 return 0; /* Not added to heap */
210 } else {
211 lprops->hpos = heap->cnt++;
212 heap->arr[lprops->hpos] = lprops;
213 move_up_lpt_heap(c, heap, lprops, cat);
214 dbg_check_heap(c, heap, cat, lprops->hpos);
215 return 1; /* Added to heap */
216 }
217}
218
219/**
220 * remove_from_lpt_heap - remove LEB properties from a LEB category heap.
221 * @c: UBIFS file-system description object
222 * @lprops: LEB properties to remove
223 * @cat: LEB category
224 */
225static void remove_from_lpt_heap(struct ubifs_info *c,
226 struct ubifs_lprops *lprops, int cat)
227{
228 struct ubifs_lpt_heap *heap;
229 int hpos = lprops->hpos;
230
231 heap = &c->lpt_heap[cat - 1];
232 ubifs_assert(hpos >= 0 && hpos < heap->cnt);
233 ubifs_assert(heap->arr[hpos] == lprops);
234 heap->cnt -= 1;
235 if (hpos < heap->cnt) {
236 heap->arr[hpos] = heap->arr[heap->cnt];
237 heap->arr[hpos]->hpos = hpos;
238 adjust_lpt_heap(c, heap, heap->arr[hpos], hpos, cat);
239 }
240 dbg_check_heap(c, heap, cat, -1);
241}
242
243/**
244 * lpt_heap_replace - replace lprops in a category heap.
245 * @c: UBIFS file-system description object
246 * @old_lprops: LEB properties to replace
247 * @new_lprops: LEB properties with which to replace
248 * @cat: LEB category
249 *
250 * During commit it is sometimes necessary to copy a pnode (see dirty_cow_pnode)
251 * and the lprops that the pnode contains. When that happens, references in
252 * the category heaps to those lprops must be updated to point to the new
253 * lprops. This function does that.
254 */
255static void lpt_heap_replace(struct ubifs_info *c,
256 struct ubifs_lprops *old_lprops,
257 struct ubifs_lprops *new_lprops, int cat)
258{
259 struct ubifs_lpt_heap *heap;
260 int hpos = new_lprops->hpos;
261
262 heap = &c->lpt_heap[cat - 1];
263 heap->arr[hpos] = new_lprops;
264}
265
266/**
267 * ubifs_add_to_cat - add LEB properties to a category list or heap.
268 * @c: UBIFS file-system description object
269 * @lprops: LEB properties to add
270 * @cat: LEB category to which to add
271 *
272 * LEB properties are categorized to enable fast find operations.
273 */
274void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
275 int cat)
276{
277 switch (cat) {
278 case LPROPS_DIRTY:
279 case LPROPS_DIRTY_IDX:
280 case LPROPS_FREE:
281 if (add_to_lpt_heap(c, lprops, cat))
282 break;
283 /* No more room on heap so make it uncategorized */
284 cat = LPROPS_UNCAT;
285 /* Fall through */
286 case LPROPS_UNCAT:
287 list_add(&lprops->list, &c->uncat_list);
288 break;
289 case LPROPS_EMPTY:
290 list_add(&lprops->list, &c->empty_list);
291 break;
292 case LPROPS_FREEABLE:
293 list_add(&lprops->list, &c->freeable_list);
294 c->freeable_cnt += 1;
295 break;
296 case LPROPS_FRDI_IDX:
297 list_add(&lprops->list, &c->frdi_idx_list);
298 break;
299 default:
300 ubifs_assert(0);
301 }
302 lprops->flags &= ~LPROPS_CAT_MASK;
303 lprops->flags |= cat;
304}
305
306/**
307 * ubifs_remove_from_cat - remove LEB properties from a category list or heap.
308 * @c: UBIFS file-system description object
309 * @lprops: LEB properties to remove
310 * @cat: LEB category from which to remove
311 *
312 * LEB properties are categorized to enable fast find operations.
313 */
314static void ubifs_remove_from_cat(struct ubifs_info *c,
315 struct ubifs_lprops *lprops, int cat)
316{
317 switch (cat) {
318 case LPROPS_DIRTY:
319 case LPROPS_DIRTY_IDX:
320 case LPROPS_FREE:
321 remove_from_lpt_heap(c, lprops, cat);
322 break;
323 case LPROPS_FREEABLE:
324 c->freeable_cnt -= 1;
325 ubifs_assert(c->freeable_cnt >= 0);
326 /* Fall through */
327 case LPROPS_UNCAT:
328 case LPROPS_EMPTY:
329 case LPROPS_FRDI_IDX:
330 ubifs_assert(!list_empty(&lprops->list));
331 list_del(&lprops->list);
332 break;
333 default:
334 ubifs_assert(0);
335 }
336}
337
338/**
339 * ubifs_replace_cat - replace lprops in a category list or heap.
340 * @c: UBIFS file-system description object
341 * @old_lprops: LEB properties to replace
342 * @new_lprops: LEB properties with which to replace
343 *
344 * During commit it is sometimes necessary to copy a pnode (see dirty_cow_pnode)
345 * and the lprops that the pnode contains. When that happens, references in
346 * category lists and heaps must be replaced. This function does that.
347 */
348void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops,
349 struct ubifs_lprops *new_lprops)
350{
351 int cat;
352
353 cat = new_lprops->flags & LPROPS_CAT_MASK;
354 switch (cat) {
355 case LPROPS_DIRTY:
356 case LPROPS_DIRTY_IDX:
357 case LPROPS_FREE:
358 lpt_heap_replace(c, old_lprops, new_lprops, cat);
359 break;
360 case LPROPS_UNCAT:
361 case LPROPS_EMPTY:
362 case LPROPS_FREEABLE:
363 case LPROPS_FRDI_IDX:
364 list_replace(&old_lprops->list, &new_lprops->list);
365 break;
366 default:
367 ubifs_assert(0);
368 }
369}
370
371/**
372 * ubifs_ensure_cat - ensure LEB properties are categorized.
373 * @c: UBIFS file-system description object
374 * @lprops: LEB properties
375 *
376 * A LEB may have fallen off of the bottom of a heap, and ended up as
377 * uncategorized even though it has enough space for us now. If that is the case
378 * this function will put the LEB back onto a heap.
379 */
380void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops)
381{
382 int cat = lprops->flags & LPROPS_CAT_MASK;
383
384 if (cat != LPROPS_UNCAT)
385 return;
386 cat = ubifs_categorize_lprops(c, lprops);
387 if (cat == LPROPS_UNCAT)
388 return;
389 ubifs_remove_from_cat(c, lprops, LPROPS_UNCAT);
390 ubifs_add_to_cat(c, lprops, cat);
391}
392
393/**
394 * ubifs_categorize_lprops - categorize LEB properties.
395 * @c: UBIFS file-system description object
396 * @lprops: LEB properties to categorize
397 *
398 * LEB properties are categorized to enable fast find operations. This function
399 * returns the LEB category to which the LEB properties belong. Note however
400 * that if the LEB category is stored as a heap and the heap is full, the
401 * LEB properties may have their category changed to %LPROPS_UNCAT.
402 */
403int ubifs_categorize_lprops(const struct ubifs_info *c,
404 const struct ubifs_lprops *lprops)
405{
406 if (lprops->flags & LPROPS_TAKEN)
407 return LPROPS_UNCAT;
408
409 if (lprops->free == c->leb_size) {
410 ubifs_assert(!(lprops->flags & LPROPS_INDEX));
411 return LPROPS_EMPTY;
412 }
413
414 if (lprops->free + lprops->dirty == c->leb_size) {
415 if (lprops->flags & LPROPS_INDEX)
416 return LPROPS_FRDI_IDX;
417 else
418 return LPROPS_FREEABLE;
419 }
420
421 if (lprops->flags & LPROPS_INDEX) {
422 if (lprops->dirty + lprops->free >= c->min_idx_node_sz)
423 return LPROPS_DIRTY_IDX;
424 } else {
425 if (lprops->dirty >= c->dead_wm &&
426 lprops->dirty > lprops->free)
427 return LPROPS_DIRTY;
428 if (lprops->free > 0)
429 return LPROPS_FREE;
430 }
431
432 return LPROPS_UNCAT;
433}
434
435/**
436 * change_category - change LEB properties category.
437 * @c: UBIFS file-system description object
438 * @lprops: LEB properties to recategorize
439 *
440 * LEB properties are categorized to enable fast find operations. When the LEB
441 * properties change they must be recategorized.
442 */
443static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops)
444{
445 int old_cat = lprops->flags & LPROPS_CAT_MASK;
446 int new_cat = ubifs_categorize_lprops(c, lprops);
447
448 if (old_cat == new_cat) {
449 struct ubifs_lpt_heap *heap = &c->lpt_heap[new_cat - 1];
450
451 /* lprops on a heap now must be moved up or down */
452 if (new_cat < 1 || new_cat > LPROPS_HEAP_CNT)
453 return; /* Not on a heap */
454 heap = &c->lpt_heap[new_cat - 1];
455 adjust_lpt_heap(c, heap, lprops, lprops->hpos, new_cat);
456 } else {
457 ubifs_remove_from_cat(c, lprops, old_cat);
458 ubifs_add_to_cat(c, lprops, new_cat);
459 }
460}
461
462/**
463 * ubifs_get_lprops - get reference to LEB properties.
464 * @c: the UBIFS file-system description object
465 *
466 * This function locks lprops. Lprops have to be unlocked by
467 * 'ubifs_release_lprops()'.
468 */
469void ubifs_get_lprops(struct ubifs_info *c)
470{
471 mutex_lock(&c->lp_mutex);
472}
473
474/**
475 * calc_dark - calculate LEB dark space size.
476 * @c: the UBIFS file-system description object
477 * @spc: amount of free and dirty space in the LEB
478 *
479 * This function calculates amount of dark space in an LEB which has @spc bytes
480 * of free and dirty space. Returns the calculations result.
481 *
482 * Dark space is the space which is not always usable - it depends on which
483 * nodes are written in which order. E.g., if an LEB has only 512 free bytes,
484 * it is dark space, because it cannot fit a large data node. So UBIFS cannot
485 * count on this LEB and treat these 512 bytes as usable because it is not true
486 * if, for example, only big chunks of uncompressible data will be written to
487 * the FS.
488 */
489static int calc_dark(struct ubifs_info *c, int spc)
490{
491 ubifs_assert(!(spc & 7));
492
493 if (spc < c->dark_wm)
494 return spc;
495
496 /*
497 * If we have slightly more space then the dark space watermark, we can
498 * anyway safely assume it we'll be able to write a node of the
499 * smallest size there.
500 */
501 if (spc - c->dark_wm < MIN_WRITE_SZ)
502 return spc - MIN_WRITE_SZ;
503
504 return c->dark_wm;
505}
506
507/**
508 * is_lprops_dirty - determine if LEB properties are dirty.
509 * @c: the UBIFS file-system description object
510 * @lprops: LEB properties to test
511 */
512static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops)
513{
514 struct ubifs_pnode *pnode;
515 int pos;
516
517 pos = (lprops->lnum - c->main_first) & (UBIFS_LPT_FANOUT - 1);
518 pnode = (struct ubifs_pnode *)container_of(lprops - pos,
519 struct ubifs_pnode,
520 lprops[0]);
521 return !test_bit(COW_ZNODE, &pnode->flags) &&
522 test_bit(DIRTY_CNODE, &pnode->flags);
523}
524
525/**
526 * ubifs_change_lp - change LEB properties.
527 * @c: the UBIFS file-system description object
528 * @lp: LEB properties to change
529 * @free: new free space amount
530 * @dirty: new dirty space amount
531 * @flags: new flags
532 * @idx_gc_cnt: change to the count of idx_gc list
533 *
534 * This function changes LEB properties. This function does not change a LEB
535 * property (@free, @dirty or @flag) if the value passed is %LPROPS_NC.
536 *
537 * This function returns a pointer to the updated LEB properties on success
538 * and a negative error code on failure. N.B. the LEB properties may have had to
539 * be copied (due to COW) and consequently the pointer returned may not be the
540 * same as the pointer passed.
541 */
542const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
543 const struct ubifs_lprops *lp,
544 int free, int dirty, int flags,
545 int idx_gc_cnt)
546{
547 /*
548 * This is the only function that is allowed to change lprops, so we
549 * discard the const qualifier.
550 */
551 struct ubifs_lprops *lprops = (struct ubifs_lprops *)lp;
552
553 dbg_lp("LEB %d, free %d, dirty %d, flags %d",
554 lprops->lnum, free, dirty, flags);
555
556 ubifs_assert(mutex_is_locked(&c->lp_mutex));
557 ubifs_assert(c->lst.empty_lebs >= 0 &&
558 c->lst.empty_lebs <= c->main_lebs);
559 ubifs_assert(c->freeable_cnt >= 0);
560 ubifs_assert(c->freeable_cnt <= c->main_lebs);
561 ubifs_assert(c->lst.taken_empty_lebs >= 0);
562 ubifs_assert(c->lst.taken_empty_lebs <= c->lst.empty_lebs);
563 ubifs_assert(!(c->lst.total_free & 7) && !(c->lst.total_dirty & 7));
564 ubifs_assert(!(c->lst.total_dead & 7) && !(c->lst.total_dark & 7));
565 ubifs_assert(!(c->lst.total_used & 7));
566 ubifs_assert(free == LPROPS_NC || free >= 0);
567 ubifs_assert(dirty == LPROPS_NC || dirty >= 0);
568
569 if (!is_lprops_dirty(c, lprops)) {
570 lprops = ubifs_lpt_lookup_dirty(c, lprops->lnum);
571 if (IS_ERR(lprops))
572 return lprops;
573 } else
574 ubifs_assert(lprops == ubifs_lpt_lookup_dirty(c, lprops->lnum));
575
576 ubifs_assert(!(lprops->free & 7) && !(lprops->dirty & 7));
577
578 spin_lock(&c->space_lock);
579
580 if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size)
581 c->lst.taken_empty_lebs -= 1;
582
583 if (!(lprops->flags & LPROPS_INDEX)) {
584 int old_spc;
585
586 old_spc = lprops->free + lprops->dirty;
587 if (old_spc < c->dead_wm)
588 c->lst.total_dead -= old_spc;
589 else
590 c->lst.total_dark -= calc_dark(c, old_spc);
591
592 c->lst.total_used -= c->leb_size - old_spc;
593 }
594
595 if (free != LPROPS_NC) {
596 free = ALIGN(free, 8);
597 c->lst.total_free += free - lprops->free;
598
599 /* Increase or decrease empty LEBs counter if needed */
600 if (free == c->leb_size) {
601 if (lprops->free != c->leb_size)
602 c->lst.empty_lebs += 1;
603 } else if (lprops->free == c->leb_size)
604 c->lst.empty_lebs -= 1;
605 lprops->free = free;
606 }
607
608 if (dirty != LPROPS_NC) {
609 dirty = ALIGN(dirty, 8);
610 c->lst.total_dirty += dirty - lprops->dirty;
611 lprops->dirty = dirty;
612 }
613
614 if (flags != LPROPS_NC) {
615 /* Take care about indexing LEBs counter if needed */
616 if ((lprops->flags & LPROPS_INDEX)) {
617 if (!(flags & LPROPS_INDEX))
618 c->lst.idx_lebs -= 1;
619 } else if (flags & LPROPS_INDEX)
620 c->lst.idx_lebs += 1;
621 lprops->flags = flags;
622 }
623
624 if (!(lprops->flags & LPROPS_INDEX)) {
625 int new_spc;
626
627 new_spc = lprops->free + lprops->dirty;
628 if (new_spc < c->dead_wm)
629 c->lst.total_dead += new_spc;
630 else
631 c->lst.total_dark += calc_dark(c, new_spc);
632
633 c->lst.total_used += c->leb_size - new_spc;
634 }
635
636 if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size)
637 c->lst.taken_empty_lebs += 1;
638
639 change_category(c, lprops);
640
641 c->idx_gc_cnt += idx_gc_cnt;
642
643 spin_unlock(&c->space_lock);
644
645 return lprops;
646}
647
648/**
649 * ubifs_release_lprops - release lprops lock.
650 * @c: the UBIFS file-system description object
651 *
652 * This function has to be called after each 'ubifs_get_lprops()' call to
653 * unlock lprops.
654 */
655void ubifs_release_lprops(struct ubifs_info *c)
656{
657 ubifs_assert(mutex_is_locked(&c->lp_mutex));
658 ubifs_assert(c->lst.empty_lebs >= 0 &&
659 c->lst.empty_lebs <= c->main_lebs);
660
661 mutex_unlock(&c->lp_mutex);
662}
663
664/**
665 * ubifs_get_lp_stats - get lprops statistics.
666 * @c: UBIFS file-system description object
667 * @st: return statistics
668 */
669void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *st)
670{
671 spin_lock(&c->space_lock);
672 memcpy(st, &c->lst, sizeof(struct ubifs_lp_stats));
673 spin_unlock(&c->space_lock);
674}
675
676/**
677 * ubifs_change_one_lp - change LEB properties.
678 * @c: the UBIFS file-system description object
679 * @lnum: LEB to change properties for
680 * @free: amount of free space
681 * @dirty: amount of dirty space
682 * @flags_set: flags to set
683 * @flags_clean: flags to clean
684 * @idx_gc_cnt: change to the count of idx_gc list
685 *
686 * This function changes properties of LEB @lnum. It is a helper wrapper over
687 * 'ubifs_change_lp()' which hides lprops get/release. The arguments are the
688 * same as in case of 'ubifs_change_lp()'. Returns zero in case of success and
689 * a negative error code in case of failure.
690 */
691int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
692 int flags_set, int flags_clean, int idx_gc_cnt)
693{
694 int err = 0, flags;
695 const struct ubifs_lprops *lp;
696
697 ubifs_get_lprops(c);
698
699 lp = ubifs_lpt_lookup_dirty(c, lnum);
700 if (IS_ERR(lp)) {
701 err = PTR_ERR(lp);
702 goto out;
703 }
704
705 flags = (lp->flags | flags_set) & ~flags_clean;
706 lp = ubifs_change_lp(c, lp, free, dirty, flags, idx_gc_cnt);
707 if (IS_ERR(lp))
708 err = PTR_ERR(lp);
709
710out:
711 ubifs_release_lprops(c);
712 return err;
713}
714
715/**
716 * ubifs_update_one_lp - update LEB properties.
717 * @c: the UBIFS file-system description object
718 * @lnum: LEB to change properties for
719 * @free: amount of free space
720 * @dirty: amount of dirty space to add
721 * @flags_set: flags to set
722 * @flags_clean: flags to clean
723 *
724 * This function is the same as 'ubifs_change_one_lp()' but @dirty is added to
725 * current dirty space, not substitutes it.
726 */
727int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
728 int flags_set, int flags_clean)
729{
730 int err = 0, flags;
731 const struct ubifs_lprops *lp;
732
733 ubifs_get_lprops(c);
734
735 lp = ubifs_lpt_lookup_dirty(c, lnum);
736 if (IS_ERR(lp)) {
737 err = PTR_ERR(lp);
738 goto out;
739 }
740
741 flags = (lp->flags | flags_set) & ~flags_clean;
742 lp = ubifs_change_lp(c, lp, free, lp->dirty + dirty, flags, 0);
743 if (IS_ERR(lp))
744 err = PTR_ERR(lp);
745
746out:
747 ubifs_release_lprops(c);
748 return err;
749}
750
751/**
752 * ubifs_read_one_lp - read LEB properties.
753 * @c: the UBIFS file-system description object
754 * @lnum: LEB to read properties for
755 * @lp: where to store read properties
756 *
757 * This helper function reads properties of a LEB @lnum and stores them in @lp.
758 * Returns zero in case of success and a negative error code in case of
759 * failure.
760 */
761int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp)
762{
763 int err = 0;
764 const struct ubifs_lprops *lpp;
765
766 ubifs_get_lprops(c);
767
768 lpp = ubifs_lpt_lookup(c, lnum);
769 if (IS_ERR(lpp)) {
770 err = PTR_ERR(lpp);
771 goto out;
772 }
773
774 memcpy(lp, lpp, sizeof(struct ubifs_lprops));
775
776out:
777 ubifs_release_lprops(c);
778 return err;
779}
780
781/**
782 * ubifs_fast_find_free - try to find a LEB with free space quickly.
783 * @c: the UBIFS file-system description object
784 *
785 * This function returns LEB properties for a LEB with free space or %NULL if
786 * the function is unable to find a LEB quickly.
787 */
788const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c)
789{
790 struct ubifs_lprops *lprops;
791 struct ubifs_lpt_heap *heap;
792
793 ubifs_assert(mutex_is_locked(&c->lp_mutex));
794
795 heap = &c->lpt_heap[LPROPS_FREE - 1];
796 if (heap->cnt == 0)
797 return NULL;
798
799 lprops = heap->arr[0];
800 ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
801 ubifs_assert(!(lprops->flags & LPROPS_INDEX));
802 return lprops;
803}
804
805/**
806 * ubifs_fast_find_empty - try to find an empty LEB quickly.
807 * @c: the UBIFS file-system description object
808 *
809 * This function returns LEB properties for an empty LEB or %NULL if the
810 * function is unable to find an empty LEB quickly.
811 */
812const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c)
813{
814 struct ubifs_lprops *lprops;
815
816 ubifs_assert(mutex_is_locked(&c->lp_mutex));
817
818 if (list_empty(&c->empty_list))
819 return NULL;
820
821 lprops = list_entry(c->empty_list.next, struct ubifs_lprops, list);
822 ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
823 ubifs_assert(!(lprops->flags & LPROPS_INDEX));
824 ubifs_assert(lprops->free == c->leb_size);
825 return lprops;
826}
827
828/**
829 * ubifs_fast_find_freeable - try to find a freeable LEB quickly.
830 * @c: the UBIFS file-system description object
831 *
832 * This function returns LEB properties for a freeable LEB or %NULL if the
833 * function is unable to find a freeable LEB quickly.
834 */
835const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c)
836{
837 struct ubifs_lprops *lprops;
838
839 ubifs_assert(mutex_is_locked(&c->lp_mutex));
840
841 if (list_empty(&c->freeable_list))
842 return NULL;
843
844 lprops = list_entry(c->freeable_list.next, struct ubifs_lprops, list);
845 ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
846 ubifs_assert(!(lprops->flags & LPROPS_INDEX));
847 ubifs_assert(lprops->free + lprops->dirty == c->leb_size);
848 ubifs_assert(c->freeable_cnt > 0);
849 return lprops;
850}
851
852/**
853 * ubifs_fast_find_frdi_idx - try to find a freeable index LEB quickly.
854 * @c: the UBIFS file-system description object
855 *
856 * This function returns LEB properties for a freeable index LEB or %NULL if the
857 * function is unable to find a freeable index LEB quickly.
858 */
859const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c)
860{
861 struct ubifs_lprops *lprops;
862
863 ubifs_assert(mutex_is_locked(&c->lp_mutex));
864
865 if (list_empty(&c->frdi_idx_list))
866 return NULL;
867
868 lprops = list_entry(c->frdi_idx_list.next, struct ubifs_lprops, list);
869 ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
870 ubifs_assert((lprops->flags & LPROPS_INDEX));
871 ubifs_assert(lprops->free + lprops->dirty == c->leb_size);
872 return lprops;
873}
874
875#ifdef CONFIG_UBIFS_FS_DEBUG
876
877/**
878 * dbg_check_cats - check category heaps and lists.
879 * @c: UBIFS file-system description object
880 *
881 * This function returns %0 on success and a negative error code on failure.
882 */
883int dbg_check_cats(struct ubifs_info *c)
884{
885 struct ubifs_lprops *lprops;
886 struct list_head *pos;
887 int i, cat;
888
889 if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS)))
890 return 0;
891
892 list_for_each_entry(lprops, &c->empty_list, list) {
893 if (lprops->free != c->leb_size) {
894 ubifs_err("non-empty LEB %d on empty list "
895 "(free %d dirty %d flags %d)", lprops->lnum,
896 lprops->free, lprops->dirty, lprops->flags);
897 return -EINVAL;
898 }
899 if (lprops->flags & LPROPS_TAKEN) {
900 ubifs_err("taken LEB %d on empty list "
901 "(free %d dirty %d flags %d)", lprops->lnum,
902 lprops->free, lprops->dirty, lprops->flags);
903 return -EINVAL;
904 }
905 }
906
907 i = 0;
908 list_for_each_entry(lprops, &c->freeable_list, list) {
909 if (lprops->free + lprops->dirty != c->leb_size) {
910 ubifs_err("non-freeable LEB %d on freeable list "
911 "(free %d dirty %d flags %d)", lprops->lnum,
912 lprops->free, lprops->dirty, lprops->flags);
913 return -EINVAL;
914 }
915 if (lprops->flags & LPROPS_TAKEN) {
916 ubifs_err("taken LEB %d on freeable list "
917 "(free %d dirty %d flags %d)", lprops->lnum,
918 lprops->free, lprops->dirty, lprops->flags);
919 return -EINVAL;
920 }
921 i += 1;
922 }
923 if (i != c->freeable_cnt) {
924 ubifs_err("freeable list count %d expected %d", i,
925 c->freeable_cnt);
926 return -EINVAL;
927 }
928
929 i = 0;
930 list_for_each(pos, &c->idx_gc)
931 i += 1;
932 if (i != c->idx_gc_cnt) {
933 ubifs_err("idx_gc list count %d expected %d", i,
934 c->idx_gc_cnt);
935 return -EINVAL;
936 }
937
938 list_for_each_entry(lprops, &c->frdi_idx_list, list) {
939 if (lprops->free + lprops->dirty != c->leb_size) {
940 ubifs_err("non-freeable LEB %d on frdi_idx list "
941 "(free %d dirty %d flags %d)", lprops->lnum,
942 lprops->free, lprops->dirty, lprops->flags);
943 return -EINVAL;
944 }
945 if (lprops->flags & LPROPS_TAKEN) {
946 ubifs_err("taken LEB %d on frdi_idx list "
947 "(free %d dirty %d flags %d)", lprops->lnum,
948 lprops->free, lprops->dirty, lprops->flags);
949 return -EINVAL;
950 }
951 if (!(lprops->flags & LPROPS_INDEX)) {
952 ubifs_err("non-index LEB %d on frdi_idx list "
953 "(free %d dirty %d flags %d)", lprops->lnum,
954 lprops->free, lprops->dirty, lprops->flags);
955 return -EINVAL;
956 }
957 }
958
959 for (cat = 1; cat <= LPROPS_HEAP_CNT; cat++) {
960 struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1];
961
962 for (i = 0; i < heap->cnt; i++) {
963 lprops = heap->arr[i];
964 if (!lprops) {
965 ubifs_err("null ptr in LPT heap cat %d", cat);
966 return -EINVAL;
967 }
968 if (lprops->hpos != i) {
969 ubifs_err("bad ptr in LPT heap cat %d", cat);
970 return -EINVAL;
971 }
972 if (lprops->flags & LPROPS_TAKEN) {
973 ubifs_err("taken LEB in LPT heap cat %d", cat);
974 return -EINVAL;
975 }
976 }
977 }
978
979 return 0;
980}
981
982void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
983 int add_pos)
984{
985 int i = 0, j, err = 0;
986
987 if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS)))
988 return;
989
990 for (i = 0; i < heap->cnt; i++) {
991 struct ubifs_lprops *lprops = heap->arr[i];
992 struct ubifs_lprops *lp;
993
994 if (i != add_pos)
995 if ((lprops->flags & LPROPS_CAT_MASK) != cat) {
996 err = 1;
997 goto out;
998 }
999 if (lprops->hpos != i) {
1000 err = 2;
1001 goto out;
1002 }
1003 lp = ubifs_lpt_lookup(c, lprops->lnum);
1004 if (IS_ERR(lp)) {
1005 err = 3;
1006 goto out;
1007 }
1008 if (lprops != lp) {
1009 dbg_msg("lprops %zx lp %zx lprops->lnum %d lp->lnum %d",
1010 (size_t)lprops, (size_t)lp, lprops->lnum,
1011 lp->lnum);
1012 err = 4;
1013 goto out;
1014 }
1015 for (j = 0; j < i; j++) {
1016 lp = heap->arr[j];
1017 if (lp == lprops) {
1018 err = 5;
1019 goto out;
1020 }
1021 if (lp->lnum == lprops->lnum) {
1022 err = 6;
1023 goto out;
1024 }
1025 }
1026 }
1027out:
1028 if (err) {
1029 dbg_msg("failed cat %d hpos %d err %d", cat, i, err);
1030 dbg_dump_stack();
1031 dbg_dump_heap(c, heap, cat);
1032 }
1033}
1034
1035/**
1036 * struct scan_check_data - data provided to scan callback function.
1037 * @lst: LEB properties statistics
1038 * @err: error code
1039 */
1040struct scan_check_data {
1041 struct ubifs_lp_stats lst;
1042 int err;
1043};
1044
1045/**
1046 * scan_check_cb - scan callback.
1047 * @c: the UBIFS file-system description object
1048 * @lp: LEB properties to scan
1049 * @in_tree: whether the LEB properties are in main memory
1050 * @data: information passed to and from the caller of the scan
1051 *
1052 * This function returns a code that indicates whether the scan should continue
1053 * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
1054 * in main memory (%LPT_SCAN_ADD), or whether the scan should stop
1055 * (%LPT_SCAN_STOP).
1056 */
1057static int scan_check_cb(struct ubifs_info *c,
1058 const struct ubifs_lprops *lp, int in_tree,
1059 struct scan_check_data *data)
1060{
1061 struct ubifs_scan_leb *sleb;
1062 struct ubifs_scan_node *snod;
1063 struct ubifs_lp_stats *lst = &data->lst;
1064 int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty;
1065
1066 cat = lp->flags & LPROPS_CAT_MASK;
1067 if (cat != LPROPS_UNCAT) {
1068 cat = ubifs_categorize_lprops(c, lp);
1069 if (cat != (lp->flags & LPROPS_CAT_MASK)) {
1070 ubifs_err("bad LEB category %d expected %d",
1071 (lp->flags & LPROPS_CAT_MASK), cat);
1072 goto out;
1073 }
1074 }
1075
1076 /* Check lp is on its category list (if it has one) */
1077 if (in_tree) {
1078 struct list_head *list = NULL;
1079
1080 switch (cat) {
1081 case LPROPS_EMPTY:
1082 list = &c->empty_list;
1083 break;
1084 case LPROPS_FREEABLE:
1085 list = &c->freeable_list;
1086 break;
1087 case LPROPS_FRDI_IDX:
1088 list = &c->frdi_idx_list;
1089 break;
1090 case LPROPS_UNCAT:
1091 list = &c->uncat_list;
1092 break;
1093 }
1094 if (list) {
1095 struct ubifs_lprops *lprops;
1096 int found = 0;
1097
1098 list_for_each_entry(lprops, list, list) {
1099 if (lprops == lp) {
1100 found = 1;
1101 break;
1102 }
1103 }
1104 if (!found) {
1105 ubifs_err("bad LPT list (category %d)", cat);
1106 goto out;
1107 }
1108 }
1109 }
1110
1111 /* Check lp is on its category heap (if it has one) */
1112 if (in_tree && cat > 0 && cat <= LPROPS_HEAP_CNT) {
1113 struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1];
1114
1115 if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) ||
1116 lp != heap->arr[lp->hpos]) {
1117 ubifs_err("bad LPT heap (category %d)", cat);
1118 goto out;
1119 }
1120 }
1121
1122 sleb = ubifs_scan(c, lnum, 0, c->dbg_buf);
1123 if (IS_ERR(sleb)) {
1124 /*
1125 * After an unclean unmount, empty and freeable LEBs
1126 * may contain garbage.
1127 */
1128 if (lp->free == c->leb_size) {
1129 ubifs_err("scan errors were in empty LEB "
1130 "- continuing checking");
1131 lst->empty_lebs += 1;
1132 lst->total_free += c->leb_size;
1133 lst->total_dark += calc_dark(c, c->leb_size);
1134 return LPT_SCAN_CONTINUE;
1135 }
1136
1137 if (lp->free + lp->dirty == c->leb_size &&
1138 !(lp->flags & LPROPS_INDEX)) {
1139 ubifs_err("scan errors were in freeable LEB "
1140 "- continuing checking");
1141 lst->total_free += lp->free;
1142 lst->total_dirty += lp->dirty;
1143 lst->total_dark += calc_dark(c, c->leb_size);
1144 return LPT_SCAN_CONTINUE;
1145 }
1146 data->err = PTR_ERR(sleb);
1147 return LPT_SCAN_STOP;
1148 }
1149
1150 is_idx = -1;
1151 list_for_each_entry(snod, &sleb->nodes, list) {
1152 int found, level = 0;
1153
1154 cond_resched();
1155
1156 if (is_idx == -1)
1157 is_idx = (snod->type == UBIFS_IDX_NODE) ? 1 : 0;
1158
1159 if (is_idx && snod->type != UBIFS_IDX_NODE) {
1160 ubifs_err("indexing node in data LEB %d:%d",
1161 lnum, snod->offs);
1162 goto out_destroy;
1163 }
1164
1165 if (snod->type == UBIFS_IDX_NODE) {
1166 struct ubifs_idx_node *idx = snod->node;
1167
1168 key_read(c, ubifs_idx_key(c, idx), &snod->key);
1169 level = le16_to_cpu(idx->level);
1170 }
1171
1172 found = ubifs_tnc_has_node(c, &snod->key, level, lnum,
1173 snod->offs, is_idx);
1174 if (found) {
1175 if (found < 0)
1176 goto out_destroy;
1177 used += ALIGN(snod->len, 8);
1178 }
1179 }
1180
1181 free = c->leb_size - sleb->endpt;
1182 dirty = sleb->endpt - used;
1183
1184 if (free > c->leb_size || free < 0 || dirty > c->leb_size ||
1185 dirty < 0) {
1186 ubifs_err("bad calculated accounting for LEB %d: "
1187 "free %d, dirty %d", lnum, free, dirty);
1188 goto out_destroy;
1189 }
1190
1191 if (lp->free + lp->dirty == c->leb_size &&
1192 free + dirty == c->leb_size)
1193 if ((is_idx && !(lp->flags & LPROPS_INDEX)) ||
1194 (!is_idx && free == c->leb_size) ||
1195 lp->free == c->leb_size) {
1196 /*
1197 * Empty or freeable LEBs could contain index
1198 * nodes from an uncompleted commit due to an
1199 * unclean unmount. Or they could be empty for
1200 * the same reason. Or it may simply not have been
1201 * unmapped.
1202 */
1203 free = lp->free;
1204 dirty = lp->dirty;
1205 is_idx = 0;
1206 }
1207
1208 if (is_idx && lp->free + lp->dirty == free + dirty &&
1209 lnum != c->ihead_lnum) {
1210 /*
1211 * After an unclean unmount, an index LEB could have a different
1212 * amount of free space than the value recorded by lprops. That
1213 * is because the in-the-gaps method may use free space or
1214 * create free space (as a side-effect of using ubi_leb_change
1215 * and not writing the whole LEB). The incorrect free space
1216 * value is not a problem because the index is only ever
1217 * allocated empty LEBs, so there will never be an attempt to
1218 * write to the free space at the end of an index LEB - except
1219 * by the in-the-gaps method for which it is not a problem.
1220 */
1221 free = lp->free;
1222 dirty = lp->dirty;
1223 }
1224
1225 if (lp->free != free || lp->dirty != dirty)
1226 goto out_print;
1227
1228 if (is_idx && !(lp->flags & LPROPS_INDEX)) {
1229 if (free == c->leb_size)
1230 /* Free but not unmapped LEB, it's fine */
1231 is_idx = 0;
1232 else {
1233 ubifs_err("indexing node without indexing "
1234 "flag");
1235 goto out_print;
1236 }
1237 }
1238
1239 if (!is_idx && (lp->flags & LPROPS_INDEX)) {
1240 ubifs_err("data node with indexing flag");
1241 goto out_print;
1242 }
1243
1244 if (free == c->leb_size)
1245 lst->empty_lebs += 1;
1246
1247 if (is_idx)
1248 lst->idx_lebs += 1;
1249
1250 if (!(lp->flags & LPROPS_INDEX))
1251 lst->total_used += c->leb_size - free - dirty;
1252 lst->total_free += free;
1253 lst->total_dirty += dirty;
1254
1255 if (!(lp->flags & LPROPS_INDEX)) {
1256 int spc = free + dirty;
1257
1258 if (spc < c->dead_wm)
1259 lst->total_dead += spc;
1260 else
1261 lst->total_dark += calc_dark(c, spc);
1262 }
1263
1264 ubifs_scan_destroy(sleb);
1265
1266 return LPT_SCAN_CONTINUE;
1267
1268out_print:
1269 ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, "
1270 "should be free %d, dirty %d",
1271 lnum, lp->free, lp->dirty, lp->flags, free, dirty);
1272 dbg_dump_leb(c, lnum);
1273out_destroy:
1274 ubifs_scan_destroy(sleb);
1275out:
1276 data->err = -EINVAL;
1277 return LPT_SCAN_STOP;
1278}
1279
1280/**
1281 * dbg_check_lprops - check all LEB properties.
1282 * @c: UBIFS file-system description object
1283 *
1284 * This function checks all LEB properties and makes sure they are all correct.
1285 * It returns zero if everything is fine, %-EINVAL if there is an inconsistency
1286 * and other negative error codes in case of other errors. This function is
1287 * called while the file system is locked (because of commit start), so no
1288 * additional locking is required. Note that locking the LPT mutex would cause
1289 * a circular lock dependency with the TNC mutex.
1290 */
1291int dbg_check_lprops(struct ubifs_info *c)
1292{
1293 int i, err;
1294 struct scan_check_data data;
1295 struct ubifs_lp_stats *lst = &data.lst;
1296
1297 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
1298 return 0;
1299
1300 /*
1301 * As we are going to scan the media, the write buffers have to be
1302 * synchronized.
1303 */
1304 for (i = 0; i < c->jhead_cnt; i++) {
1305 err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
1306 if (err)
1307 return err;
1308 }
1309
1310 memset(lst, 0, sizeof(struct ubifs_lp_stats));
1311
1312 data.err = 0;
1313 err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1,
1314 (ubifs_lpt_scan_callback)scan_check_cb,
1315 &data);
1316 if (err && err != -ENOSPC)
1317 goto out;
1318 if (data.err) {
1319 err = data.err;
1320 goto out;
1321 }
1322
1323 if (lst->empty_lebs != c->lst.empty_lebs ||
1324 lst->idx_lebs != c->lst.idx_lebs ||
1325 lst->total_free != c->lst.total_free ||
1326 lst->total_dirty != c->lst.total_dirty ||
1327 lst->total_used != c->lst.total_used) {
1328 ubifs_err("bad overall accounting");
1329 ubifs_err("calculated: empty_lebs %d, idx_lebs %d, "
1330 "total_free %lld, total_dirty %lld, total_used %lld",
1331 lst->empty_lebs, lst->idx_lebs, lst->total_free,
1332 lst->total_dirty, lst->total_used);
1333 ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, "
1334 "total_free %lld, total_dirty %lld, total_used %lld",
1335 c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free,
1336 c->lst.total_dirty, c->lst.total_used);
1337 err = -EINVAL;
1338 goto out;
1339 }
1340
1341 if (lst->total_dead != c->lst.total_dead ||
1342 lst->total_dark != c->lst.total_dark) {
1343 ubifs_err("bad dead/dark space accounting");
1344 ubifs_err("calculated: total_dead %lld, total_dark %lld",
1345 lst->total_dead, lst->total_dark);
1346 ubifs_err("read from lprops: total_dead %lld, total_dark %lld",
1347 c->lst.total_dead, c->lst.total_dark);
1348 err = -EINVAL;
1349 goto out;
1350 }
1351
1352 err = dbg_check_cats(c);
1353out:
1354 return err;
1355}
1356
1357#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
new file mode 100644
index 000000000000..9ff2463177e5
--- /dev/null
+++ b/fs/ubifs/lpt.c
@@ -0,0 +1,2243 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Adrian Hunter
20 * Artem Bityutskiy (Битюцкий Артём)
21 */
22
23/*
24 * This file implements the LEB properties tree (LPT) area. The LPT area
25 * contains the LEB properties tree, a table of LPT area eraseblocks (ltab), and
26 * (for the "big" model) a table of saved LEB numbers (lsave). The LPT area sits
27 * between the log and the orphan area.
28 *
29 * The LPT area is like a miniature self-contained file system. It is required
30 * that it never runs out of space, is fast to access and update, and scales
31 * logarithmically. The LEB properties tree is implemented as a wandering tree
32 * much like the TNC, and the LPT area has its own garbage collection.
33 *
34 * The LPT has two slightly different forms called the "small model" and the
35 * "big model". The small model is used when the entire LEB properties table
36 * can be written into a single eraseblock. In that case, garbage collection
37 * consists of just writing the whole table, which therefore makes all other
38 * eraseblocks reusable. In the case of the big model, dirty eraseblocks are
39 * selected for garbage collection, which consists of marking the nodes in
40 * that LEB as dirty, and then only the dirty nodes are written out. Also, in
41 * the case of the big model, a table of LEB numbers is saved so that the entire
42 * LPT does not need to be scanned looking for empty eraseblocks when UBIFS is first
43 * mounted.
44 */
45
46#include <linux/crc16.h>
47#include "ubifs.h"
48
49/**
50 * do_calc_lpt_geom - calculate sizes for the LPT area.
51 * @c: the UBIFS file-system description object
52 *
53 * Calculate the sizes of LPT bit fields, nodes, and tree, based on the
54 * properties of the flash and whether LPT is "big" (c->big_lpt).
55 */
56static void do_calc_lpt_geom(struct ubifs_info *c)
57{
58 int i, n, bits, per_leb_wastage, max_pnode_cnt;
59 long long sz, tot_wastage;
60
61 n = c->main_lebs + c->max_leb_cnt - c->leb_cnt;
62 max_pnode_cnt = DIV_ROUND_UP(n, UBIFS_LPT_FANOUT);
63
64 c->lpt_hght = 1;
65 n = UBIFS_LPT_FANOUT;
66 while (n < max_pnode_cnt) {
67 c->lpt_hght += 1;
68 n <<= UBIFS_LPT_FANOUT_SHIFT;
69 }
70
71 c->pnode_cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT);
72
73 n = DIV_ROUND_UP(c->pnode_cnt, UBIFS_LPT_FANOUT);
74 c->nnode_cnt = n;
75 for (i = 1; i < c->lpt_hght; i++) {
76 n = DIV_ROUND_UP(n, UBIFS_LPT_FANOUT);
77 c->nnode_cnt += n;
78 }
79
80 c->space_bits = fls(c->leb_size) - 3;
81 c->lpt_lnum_bits = fls(c->lpt_lebs);
82 c->lpt_offs_bits = fls(c->leb_size - 1);
83 c->lpt_spc_bits = fls(c->leb_size);
84
85 n = DIV_ROUND_UP(c->max_leb_cnt, UBIFS_LPT_FANOUT);
86 c->pcnt_bits = fls(n - 1);
87
88 c->lnum_bits = fls(c->max_leb_cnt - 1);
89
90 bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS +
91 (c->big_lpt ? c->pcnt_bits : 0) +
92 (c->space_bits * 2 + 1) * UBIFS_LPT_FANOUT;
93 c->pnode_sz = (bits + 7) / 8;
94
95 bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS +
96 (c->big_lpt ? c->pcnt_bits : 0) +
97 (c->lpt_lnum_bits + c->lpt_offs_bits) * UBIFS_LPT_FANOUT;
98 c->nnode_sz = (bits + 7) / 8;
99
100 bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS +
101 c->lpt_lebs * c->lpt_spc_bits * 2;
102 c->ltab_sz = (bits + 7) / 8;
103
104 bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS +
105 c->lnum_bits * c->lsave_cnt;
106 c->lsave_sz = (bits + 7) / 8;
107
108 /* Calculate the minimum LPT size */
109 c->lpt_sz = (long long)c->pnode_cnt * c->pnode_sz;
110 c->lpt_sz += (long long)c->nnode_cnt * c->nnode_sz;
111 c->lpt_sz += c->ltab_sz;
112 c->lpt_sz += c->lsave_sz;
113
114 /* Add wastage */
115 sz = c->lpt_sz;
116 per_leb_wastage = max_t(int, c->pnode_sz, c->nnode_sz);
117 sz += per_leb_wastage;
118 tot_wastage = per_leb_wastage;
119 while (sz > c->leb_size) {
120 sz += per_leb_wastage;
121 sz -= c->leb_size;
122 tot_wastage += per_leb_wastage;
123 }
124 tot_wastage += ALIGN(sz, c->min_io_size) - sz;
125 c->lpt_sz += tot_wastage;
126}
127
128/**
129 * ubifs_calc_lpt_geom - calculate and check sizes for the LPT area.
130 * @c: the UBIFS file-system description object
131 *
132 * This function returns %0 on success and a negative error code on failure.
133 */
134int ubifs_calc_lpt_geom(struct ubifs_info *c)
135{
136 int lebs_needed;
137 uint64_t sz;
138
139 do_calc_lpt_geom(c);
140
141 /* Verify that lpt_lebs is big enough */
142 sz = c->lpt_sz * 2; /* Must have at least 2 times the size */
143 sz += c->leb_size - 1;
144 do_div(sz, c->leb_size);
145 lebs_needed = sz;
146 if (lebs_needed > c->lpt_lebs) {
147 ubifs_err("too few LPT LEBs");
148 return -EINVAL;
149 }
150
151 /* Verify that ltab fits in a single LEB (since ltab is a single node */
152 if (c->ltab_sz > c->leb_size) {
153 ubifs_err("LPT ltab too big");
154 return -EINVAL;
155 }
156
157 c->check_lpt_free = c->big_lpt;
158
159 return 0;
160}
161
162/**
163 * calc_dflt_lpt_geom - calculate default LPT geometry.
164 * @c: the UBIFS file-system description object
165 * @main_lebs: number of main area LEBs is passed and returned here
166 * @big_lpt: whether the LPT area is "big" is returned here
167 *
168 * The size of the LPT area depends on parameters that themselves are dependent
169 * on the size of the LPT area. This function, successively recalculates the LPT
170 * area geometry until the parameters and resultant geometry are consistent.
171 *
172 * This function returns %0 on success and a negative error code on failure.
173 */
174static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs,
175 int *big_lpt)
176{
177 int i, lebs_needed;
178 uint64_t sz;
179
180 /* Start by assuming the minimum number of LPT LEBs */
181 c->lpt_lebs = UBIFS_MIN_LPT_LEBS;
182 c->main_lebs = *main_lebs - c->lpt_lebs;
183 if (c->main_lebs <= 0)
184 return -EINVAL;
185
186 /* And assume we will use the small LPT model */
187 c->big_lpt = 0;
188
189 /*
190 * Calculate the geometry based on assumptions above and then see if it
191 * makes sense
192 */
193 do_calc_lpt_geom(c);
194
195 /* Small LPT model must have lpt_sz < leb_size */
196 if (c->lpt_sz > c->leb_size) {
197 /* Nope, so try again using big LPT model */
198 c->big_lpt = 1;
199 do_calc_lpt_geom(c);
200 }
201
202 /* Now check there are enough LPT LEBs */
203 for (i = 0; i < 64 ; i++) {
204 sz = c->lpt_sz * 4; /* Allow 4 times the size */
205 sz += c->leb_size - 1;
206 do_div(sz, c->leb_size);
207 lebs_needed = sz;
208 if (lebs_needed > c->lpt_lebs) {
209 /* Not enough LPT LEBs so try again with more */
210 c->lpt_lebs = lebs_needed;
211 c->main_lebs = *main_lebs - c->lpt_lebs;
212 if (c->main_lebs <= 0)
213 return -EINVAL;
214 do_calc_lpt_geom(c);
215 continue;
216 }
217 if (c->ltab_sz > c->leb_size) {
218 ubifs_err("LPT ltab too big");
219 return -EINVAL;
220 }
221 *main_lebs = c->main_lebs;
222 *big_lpt = c->big_lpt;
223 return 0;
224 }
225 return -EINVAL;
226}
227
/**
 * pack_bits - pack bit fields end-to-end.
 * @addr: address at which to pack (passed and next address returned)
 * @pos: bit position at which to pack (passed and next position returned)
 * @val: value to pack
 * @nrbits: number of bits of value to pack (1-32)
 *
 * The value is stored least significant bits first, starting at bit @pos of
 * the byte at @addr. When @pos is non-zero the first byte is OR-ed into,
 * every further byte is overwritten.
 */
static void pack_bits(uint8_t **addr, int *pos, uint32_t val, int nrbits)
{
	uint8_t *out = *addr;
	int bit = *pos;
	int shift, done;

	ubifs_assert(nrbits > 0);
	ubifs_assert(nrbits <= 32);
	ubifs_assert(*pos >= 0);
	ubifs_assert(*pos < 8);
	ubifs_assert((val >> nrbits) == 0 || nrbits == 32);

	if (bit) {
		/* Share the current byte with the previously packed field */
		*out |= ((uint8_t)val) << bit;
		nrbits += bit;
		shift = 8 - bit;
	} else {
		*out = (uint8_t)val;
		shift = 8;
	}

	/* Emit the remaining bytes, one per 8 bits still to be stored */
	for (done = 8; done < nrbits; done += 8) {
		val >>= shift;
		*++out = (uint8_t)val;
		shift = 8;
	}

	/* A completely filled byte moves the cursor to the next one */
	bit = nrbits & 7;
	if (bit == 0)
		out++;
	*addr = out;
	*pos = bit;
}
276
/**
 * ubifs_unpack_bits - unpack bit fields.
 * @addr: address at which to unpack (passed and next address returned)
 * @pos: bit position at which to unpack (passed and next position returned)
 * @nrbits: number of bits of value to unpack (1-32)
 *
 * Only the bytes that the bit field actually spans are read, so that a field
 * located at the very end of a buffer does not cause reads beyond the buffer.
 *
 * This function returns the value unpacked.
 */
uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits)
{
	const int width = nrbits;
	const int k = 32 - nrbits;
	uint8_t *p = *addr;
	int b = *pos;
	/* Number of bytes touched by the field: 1..5 (5 only when b != 0) */
	const int bytes = (nrbits + b + 7) >> 3;
	uint32_t val = 0;
	int i;

	ubifs_assert(nrbits > 0);
	ubifs_assert(nrbits <= 32);
	ubifs_assert(*pos >= 0);
	ubifs_assert(*pos < 8);
	if (b) {
		/* Gather the bytes following the partially consumed one */
		for (i = 1; i < bytes; i++)
			val |= (uint32_t)p[i] << (8 * (i - 1));
		val <<= (8 - b);
		/* Then add the unconsumed upper bits of the first byte */
		val |= *p >> b;
		nrbits += b;
	} else {
		for (i = 0; i < bytes; i++)
			val |= (uint32_t)p[i] << (8 * i);
	}
	/* Discard everything above the requested field width */
	val <<= k;
	val >>= k;
	b = nrbits & 7;
	p += nrbits / 8;
	*addr = p;
	*pos = b;
	/*
	 * Check against the original width: 'nrbits' may now be 32 or more,
	 * and shifting a 32-bit value by that amount is undefined.
	 */
	ubifs_assert(width == 32 || (val >> width) == 0);
	return val;
}
314
315/**
316 * ubifs_pack_pnode - pack all the bit fields of a pnode.
317 * @c: UBIFS file-system description object
318 * @buf: buffer into which to pack
319 * @pnode: pnode to pack
320 */
321void ubifs_pack_pnode(struct ubifs_info *c, void *buf,
322 struct ubifs_pnode *pnode)
323{
324 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
325 int i, pos = 0;
326 uint16_t crc;
327
328 pack_bits(&addr, &pos, UBIFS_LPT_PNODE, UBIFS_LPT_TYPE_BITS);
329 if (c->big_lpt)
330 pack_bits(&addr, &pos, pnode->num, c->pcnt_bits);
331 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
332 pack_bits(&addr, &pos, pnode->lprops[i].free >> 3,
333 c->space_bits);
334 pack_bits(&addr, &pos, pnode->lprops[i].dirty >> 3,
335 c->space_bits);
336 if (pnode->lprops[i].flags & LPROPS_INDEX)
337 pack_bits(&addr, &pos, 1, 1);
338 else
339 pack_bits(&addr, &pos, 0, 1);
340 }
341 crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
342 c->pnode_sz - UBIFS_LPT_CRC_BYTES);
343 addr = buf;
344 pos = 0;
345 pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS);
346}
347
348/**
349 * ubifs_pack_nnode - pack all the bit fields of a nnode.
350 * @c: UBIFS file-system description object
351 * @buf: buffer into which to pack
352 * @nnode: nnode to pack
353 */
354void ubifs_pack_nnode(struct ubifs_info *c, void *buf,
355 struct ubifs_nnode *nnode)
356{
357 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
358 int i, pos = 0;
359 uint16_t crc;
360
361 pack_bits(&addr, &pos, UBIFS_LPT_NNODE, UBIFS_LPT_TYPE_BITS);
362 if (c->big_lpt)
363 pack_bits(&addr, &pos, nnode->num, c->pcnt_bits);
364 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
365 int lnum = nnode->nbranch[i].lnum;
366
367 if (lnum == 0)
368 lnum = c->lpt_last + 1;
369 pack_bits(&addr, &pos, lnum - c->lpt_first, c->lpt_lnum_bits);
370 pack_bits(&addr, &pos, nnode->nbranch[i].offs,
371 c->lpt_offs_bits);
372 }
373 crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
374 c->nnode_sz - UBIFS_LPT_CRC_BYTES);
375 addr = buf;
376 pos = 0;
377 pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS);
378}
379
380/**
381 * ubifs_pack_ltab - pack the LPT's own lprops table.
382 * @c: UBIFS file-system description object
383 * @buf: buffer into which to pack
384 * @ltab: LPT's own lprops table to pack
385 */
386void ubifs_pack_ltab(struct ubifs_info *c, void *buf,
387 struct ubifs_lpt_lprops *ltab)
388{
389 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
390 int i, pos = 0;
391 uint16_t crc;
392
393 pack_bits(&addr, &pos, UBIFS_LPT_LTAB, UBIFS_LPT_TYPE_BITS);
394 for (i = 0; i < c->lpt_lebs; i++) {
395 pack_bits(&addr, &pos, ltab[i].free, c->lpt_spc_bits);
396 pack_bits(&addr, &pos, ltab[i].dirty, c->lpt_spc_bits);
397 }
398 crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
399 c->ltab_sz - UBIFS_LPT_CRC_BYTES);
400 addr = buf;
401 pos = 0;
402 pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS);
403}
404
405/**
406 * ubifs_pack_lsave - pack the LPT's save table.
407 * @c: UBIFS file-system description object
408 * @buf: buffer into which to pack
409 * @lsave: LPT's save table to pack
410 */
411void ubifs_pack_lsave(struct ubifs_info *c, void *buf, int *lsave)
412{
413 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
414 int i, pos = 0;
415 uint16_t crc;
416
417 pack_bits(&addr, &pos, UBIFS_LPT_LSAVE, UBIFS_LPT_TYPE_BITS);
418 for (i = 0; i < c->lsave_cnt; i++)
419 pack_bits(&addr, &pos, lsave[i], c->lnum_bits);
420 crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
421 c->lsave_sz - UBIFS_LPT_CRC_BYTES);
422 addr = buf;
423 pos = 0;
424 pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS);
425}
426
427/**
428 * ubifs_add_lpt_dirt - add dirty space to LPT LEB properties.
429 * @c: UBIFS file-system description object
430 * @lnum: LEB number to which to add dirty space
431 * @dirty: amount of dirty space to add
432 */
433void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty)
434{
435 if (!dirty || !lnum)
436 return;
437 dbg_lp("LEB %d add %d to %d",
438 lnum, dirty, c->ltab[lnum - c->lpt_first].dirty);
439 ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last);
440 c->ltab[lnum - c->lpt_first].dirty += dirty;
441}
442
443/**
444 * set_ltab - set LPT LEB properties.
445 * @c: UBIFS file-system description object
446 * @lnum: LEB number
447 * @free: amount of free space
448 * @dirty: amount of dirty space
449 */
450static void set_ltab(struct ubifs_info *c, int lnum, int free, int dirty)
451{
452 dbg_lp("LEB %d free %d dirty %d to %d %d",
453 lnum, c->ltab[lnum - c->lpt_first].free,
454 c->ltab[lnum - c->lpt_first].dirty, free, dirty);
455 ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last);
456 c->ltab[lnum - c->lpt_first].free = free;
457 c->ltab[lnum - c->lpt_first].dirty = dirty;
458}
459
460/**
461 * ubifs_add_nnode_dirt - add dirty space to LPT LEB properties.
462 * @c: UBIFS file-system description object
463 * @nnode: nnode for which to add dirt
464 */
465void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode)
466{
467 struct ubifs_nnode *np = nnode->parent;
468
469 if (np)
470 ubifs_add_lpt_dirt(c, np->nbranch[nnode->iip].lnum,
471 c->nnode_sz);
472 else {
473 ubifs_add_lpt_dirt(c, c->lpt_lnum, c->nnode_sz);
474 if (!(c->lpt_drty_flgs & LTAB_DIRTY)) {
475 c->lpt_drty_flgs |= LTAB_DIRTY;
476 ubifs_add_lpt_dirt(c, c->ltab_lnum, c->ltab_sz);
477 }
478 }
479}
480
481/**
482 * add_pnode_dirt - add dirty space to LPT LEB properties.
483 * @c: UBIFS file-system description object
484 * @pnode: pnode for which to add dirt
485 */
486static void add_pnode_dirt(struct ubifs_info *c, struct ubifs_pnode *pnode)
487{
488 ubifs_add_lpt_dirt(c, pnode->parent->nbranch[pnode->iip].lnum,
489 c->pnode_sz);
490}
491
492/**
493 * calc_nnode_num - calculate nnode number.
494 * @row: the row in the tree (root is zero)
495 * @col: the column in the row (leftmost is zero)
496 *
497 * The nnode number is a number that uniquely identifies a nnode and can be used
498 * easily to traverse the tree from the root to that nnode.
499 *
500 * This function calculates and returns the nnode number for the nnode at @row
501 * and @col.
502 */
503static int calc_nnode_num(int row, int col)
504{
505 int num, bits;
506
507 num = 1;
508 while (row--) {
509 bits = (col & (UBIFS_LPT_FANOUT - 1));
510 col >>= UBIFS_LPT_FANOUT_SHIFT;
511 num <<= UBIFS_LPT_FANOUT_SHIFT;
512 num |= bits;
513 }
514 return num;
515}
516
517/**
518 * calc_nnode_num_from_parent - calculate nnode number.
519 * @c: UBIFS file-system description object
520 * @parent: parent nnode
521 * @iip: index in parent
522 *
523 * The nnode number is a number that uniquely identifies a nnode and can be used
524 * easily to traverse the tree from the root to that nnode.
525 *
526 * This function calculates and returns the nnode number based on the parent's
527 * nnode number and the index in parent.
528 */
529static int calc_nnode_num_from_parent(struct ubifs_info *c,
530 struct ubifs_nnode *parent, int iip)
531{
532 int num, shft;
533
534 if (!parent)
535 return 1;
536 shft = (c->lpt_hght - parent->level) * UBIFS_LPT_FANOUT_SHIFT;
537 num = parent->num ^ (1 << shft);
538 num |= (UBIFS_LPT_FANOUT + iip) << shft;
539 return num;
540}
541
542/**
543 * calc_pnode_num_from_parent - calculate pnode number.
544 * @c: UBIFS file-system description object
545 * @parent: parent nnode
546 * @iip: index in parent
547 *
548 * The pnode number is a number that uniquely identifies a pnode and can be used
549 * easily to traverse the tree from the root to that pnode.
550 *
551 * This function calculates and returns the pnode number based on the parent's
552 * nnode number and the index in parent.
553 */
554static int calc_pnode_num_from_parent(struct ubifs_info *c,
555 struct ubifs_nnode *parent, int iip)
556{
557 int i, n = c->lpt_hght - 1, pnum = parent->num, num = 0;
558
559 for (i = 0; i < n; i++) {
560 num <<= UBIFS_LPT_FANOUT_SHIFT;
561 num |= pnum & (UBIFS_LPT_FANOUT - 1);
562 pnum >>= UBIFS_LPT_FANOUT_SHIFT;
563 }
564 num <<= UBIFS_LPT_FANOUT_SHIFT;
565 num |= iip;
566 return num;
567}
568
569/**
570 * ubifs_create_dflt_lpt - create default LPT.
571 * @c: UBIFS file-system description object
572 * @main_lebs: number of main area LEBs is passed and returned here
573 * @lpt_first: LEB number of first LPT LEB
574 * @lpt_lebs: number of LEBs for LPT is passed and returned here
575 * @big_lpt: use big LPT model is passed and returned here
576 *
577 * This function returns %0 on success and a negative error code on failure.
578 */
579int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
580 int *lpt_lebs, int *big_lpt)
581{
582 int lnum, err = 0, node_sz, iopos, i, j, cnt, len, alen, row;
583 int blnum, boffs, bsz, bcnt;
584 struct ubifs_pnode *pnode = NULL;
585 struct ubifs_nnode *nnode = NULL;
586 void *buf = NULL, *p;
587 struct ubifs_lpt_lprops *ltab = NULL;
588 int *lsave = NULL;
589
590 err = calc_dflt_lpt_geom(c, main_lebs, big_lpt);
591 if (err)
592 return err;
593 *lpt_lebs = c->lpt_lebs;
594
595 /* Needed by 'ubifs_pack_nnode()' and 'set_ltab()' */
596 c->lpt_first = lpt_first;
597 /* Needed by 'set_ltab()' */
598 c->lpt_last = lpt_first + c->lpt_lebs - 1;
599 /* Needed by 'ubifs_pack_lsave()' */
600 c->main_first = c->leb_cnt - *main_lebs;
601
602 lsave = kmalloc(sizeof(int) * c->lsave_cnt, GFP_KERNEL);
603 pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_KERNEL);
604 nnode = kzalloc(sizeof(struct ubifs_nnode), GFP_KERNEL);
605 buf = vmalloc(c->leb_size);
606 ltab = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
607 if (!pnode || !nnode || !buf || !ltab || !lsave) {
608 err = -ENOMEM;
609 goto out;
610 }
611
612 ubifs_assert(!c->ltab);
613 c->ltab = ltab; /* Needed by set_ltab */
614
615 /* Initialize LPT's own lprops */
616 for (i = 0; i < c->lpt_lebs; i++) {
617 ltab[i].free = c->leb_size;
618 ltab[i].dirty = 0;
619 ltab[i].tgc = 0;
620 ltab[i].cmt = 0;
621 }
622
623 lnum = lpt_first;
624 p = buf;
625 /* Number of leaf nodes (pnodes) */
626 cnt = c->pnode_cnt;
627
628 /*
629 * The first pnode contains the LEB properties for the LEBs that contain
630 * the root inode node and the root index node of the index tree.
631 */
632 node_sz = ALIGN(ubifs_idx_node_sz(c, 1), 8);
633 iopos = ALIGN(node_sz, c->min_io_size);
634 pnode->lprops[0].free = c->leb_size - iopos;
635 pnode->lprops[0].dirty = iopos - node_sz;
636 pnode->lprops[0].flags = LPROPS_INDEX;
637
638 node_sz = UBIFS_INO_NODE_SZ;
639 iopos = ALIGN(node_sz, c->min_io_size);
640 pnode->lprops[1].free = c->leb_size - iopos;
641 pnode->lprops[1].dirty = iopos - node_sz;
642
643 for (i = 2; i < UBIFS_LPT_FANOUT; i++)
644 pnode->lprops[i].free = c->leb_size;
645
646 /* Add first pnode */
647 ubifs_pack_pnode(c, p, pnode);
648 p += c->pnode_sz;
649 len = c->pnode_sz;
650 pnode->num += 1;
651
652 /* Reset pnode values for remaining pnodes */
653 pnode->lprops[0].free = c->leb_size;
654 pnode->lprops[0].dirty = 0;
655 pnode->lprops[0].flags = 0;
656
657 pnode->lprops[1].free = c->leb_size;
658 pnode->lprops[1].dirty = 0;
659
660 /*
661 * To calculate the internal node branches, we keep information about
662 * the level below.
663 */
664 blnum = lnum; /* LEB number of level below */
665 boffs = 0; /* Offset of level below */
666 bcnt = cnt; /* Number of nodes in level below */
667 bsz = c->pnode_sz; /* Size of nodes in level below */
668
669 /* Add all remaining pnodes */
670 for (i = 1; i < cnt; i++) {
671 if (len + c->pnode_sz > c->leb_size) {
672 alen = ALIGN(len, c->min_io_size);
673 set_ltab(c, lnum, c->leb_size - alen, alen - len);
674 memset(p, 0xff, alen - len);
675 err = ubi_leb_change(c->ubi, lnum++, buf, alen,
676 UBI_SHORTTERM);
677 if (err)
678 goto out;
679 p = buf;
680 len = 0;
681 }
682 ubifs_pack_pnode(c, p, pnode);
683 p += c->pnode_sz;
684 len += c->pnode_sz;
685 /*
686 * pnodes are simply numbered left to right starting at zero,
687 * which means the pnode number can be used easily to traverse
688 * down the tree to the corresponding pnode.
689 */
690 pnode->num += 1;
691 }
692
693 row = 0;
694 for (i = UBIFS_LPT_FANOUT; cnt > i; i <<= UBIFS_LPT_FANOUT_SHIFT)
695 row += 1;
696 /* Add all nnodes, one level at a time */
697 while (1) {
698 /* Number of internal nodes (nnodes) at next level */
699 cnt = DIV_ROUND_UP(cnt, UBIFS_LPT_FANOUT);
700 for (i = 0; i < cnt; i++) {
701 if (len + c->nnode_sz > c->leb_size) {
702 alen = ALIGN(len, c->min_io_size);
703 set_ltab(c, lnum, c->leb_size - alen,
704 alen - len);
705 memset(p, 0xff, alen - len);
706 err = ubi_leb_change(c->ubi, lnum++, buf, alen,
707 UBI_SHORTTERM);
708 if (err)
709 goto out;
710 p = buf;
711 len = 0;
712 }
713 /* Only 1 nnode at this level, so it is the root */
714 if (cnt == 1) {
715 c->lpt_lnum = lnum;
716 c->lpt_offs = len;
717 }
718 /* Set branches to the level below */
719 for (j = 0; j < UBIFS_LPT_FANOUT; j++) {
720 if (bcnt) {
721 if (boffs + bsz > c->leb_size) {
722 blnum += 1;
723 boffs = 0;
724 }
725 nnode->nbranch[j].lnum = blnum;
726 nnode->nbranch[j].offs = boffs;
727 boffs += bsz;
728 bcnt--;
729 } else {
730 nnode->nbranch[j].lnum = 0;
731 nnode->nbranch[j].offs = 0;
732 }
733 }
734 nnode->num = calc_nnode_num(row, i);
735 ubifs_pack_nnode(c, p, nnode);
736 p += c->nnode_sz;
737 len += c->nnode_sz;
738 }
739 /* Only 1 nnode at this level, so it is the root */
740 if (cnt == 1)
741 break;
742 /* Update the information about the level below */
743 bcnt = cnt;
744 bsz = c->nnode_sz;
745 row -= 1;
746 }
747
748 if (*big_lpt) {
749 /* Need to add LPT's save table */
750 if (len + c->lsave_sz > c->leb_size) {
751 alen = ALIGN(len, c->min_io_size);
752 set_ltab(c, lnum, c->leb_size - alen, alen - len);
753 memset(p, 0xff, alen - len);
754 err = ubi_leb_change(c->ubi, lnum++, buf, alen,
755 UBI_SHORTTERM);
756 if (err)
757 goto out;
758 p = buf;
759 len = 0;
760 }
761
762 c->lsave_lnum = lnum;
763 c->lsave_offs = len;
764
765 for (i = 0; i < c->lsave_cnt && i < *main_lebs; i++)
766 lsave[i] = c->main_first + i;
767 for (; i < c->lsave_cnt; i++)
768 lsave[i] = c->main_first;
769
770 ubifs_pack_lsave(c, p, lsave);
771 p += c->lsave_sz;
772 len += c->lsave_sz;
773 }
774
775 /* Need to add LPT's own LEB properties table */
776 if (len + c->ltab_sz > c->leb_size) {
777 alen = ALIGN(len, c->min_io_size);
778 set_ltab(c, lnum, c->leb_size - alen, alen - len);
779 memset(p, 0xff, alen - len);
780 err = ubi_leb_change(c->ubi, lnum++, buf, alen, UBI_SHORTTERM);
781 if (err)
782 goto out;
783 p = buf;
784 len = 0;
785 }
786
787 c->ltab_lnum = lnum;
788 c->ltab_offs = len;
789
790 /* Update ltab before packing it */
791 len += c->ltab_sz;
792 alen = ALIGN(len, c->min_io_size);
793 set_ltab(c, lnum, c->leb_size - alen, alen - len);
794
795 ubifs_pack_ltab(c, p, ltab);
796 p += c->ltab_sz;
797
798 /* Write remaining buffer */
799 memset(p, 0xff, alen - len);
800 err = ubi_leb_change(c->ubi, lnum, buf, alen, UBI_SHORTTERM);
801 if (err)
802 goto out;
803
804 c->nhead_lnum = lnum;
805 c->nhead_offs = ALIGN(len, c->min_io_size);
806
807 dbg_lp("space_bits %d", c->space_bits);
808 dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits);
809 dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits);
810 dbg_lp("lpt_spc_bits %d", c->lpt_spc_bits);
811 dbg_lp("pcnt_bits %d", c->pcnt_bits);
812 dbg_lp("lnum_bits %d", c->lnum_bits);
813 dbg_lp("pnode_sz %d", c->pnode_sz);
814 dbg_lp("nnode_sz %d", c->nnode_sz);
815 dbg_lp("ltab_sz %d", c->ltab_sz);
816 dbg_lp("lsave_sz %d", c->lsave_sz);
817 dbg_lp("lsave_cnt %d", c->lsave_cnt);
818 dbg_lp("lpt_hght %d", c->lpt_hght);
819 dbg_lp("big_lpt %d", c->big_lpt);
820 dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs);
821 dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs);
822 dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs);
823 if (c->big_lpt)
824 dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs);
825out:
826 c->ltab = NULL;
827 kfree(lsave);
828 vfree(ltab);
829 vfree(buf);
830 kfree(nnode);
831 kfree(pnode);
832 return err;
833}
834
835/**
836 * update_cats - add LEB properties of a pnode to LEB category lists and heaps.
837 * @c: UBIFS file-system description object
838 * @pnode: pnode
839 *
840 * When a pnode is loaded into memory, the LEB properties it contains are added,
841 * by this function, to the LEB category lists and heaps.
842 */
843static void update_cats(struct ubifs_info *c, struct ubifs_pnode *pnode)
844{
845 int i;
846
847 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
848 int cat = pnode->lprops[i].flags & LPROPS_CAT_MASK;
849 int lnum = pnode->lprops[i].lnum;
850
851 if (!lnum)
852 return;
853 ubifs_add_to_cat(c, &pnode->lprops[i], cat);
854 }
855}
856
857/**
858 * replace_cats - add LEB properties of a pnode to LEB category lists and heaps.
859 * @c: UBIFS file-system description object
860 * @old_pnode: pnode copied
861 * @new_pnode: pnode copy
862 *
863 * During commit it is sometimes necessary to copy a pnode
864 * (see dirty_cow_pnode). When that happens, references in
865 * category lists and heaps must be replaced. This function does that.
866 */
867static void replace_cats(struct ubifs_info *c, struct ubifs_pnode *old_pnode,
868 struct ubifs_pnode *new_pnode)
869{
870 int i;
871
872 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
873 if (!new_pnode->lprops[i].lnum)
874 return;
875 ubifs_replace_cat(c, &old_pnode->lprops[i],
876 &new_pnode->lprops[i]);
877 }
878}
879
880/**
881 * check_lpt_crc - check LPT node crc is correct.
882 * @c: UBIFS file-system description object
883 * @buf: buffer containing node
884 * @len: length of node
885 *
886 * This function returns %0 on success and a negative error code on failure.
887 */
888static int check_lpt_crc(void *buf, int len)
889{
890 int pos = 0;
891 uint8_t *addr = buf;
892 uint16_t crc, calc_crc;
893
894 crc = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_CRC_BITS);
895 calc_crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
896 len - UBIFS_LPT_CRC_BYTES);
897 if (crc != calc_crc) {
898 ubifs_err("invalid crc in LPT node: crc %hx calc %hx", crc,
899 calc_crc);
900 dbg_dump_stack();
901 return -EINVAL;
902 }
903 return 0;
904}
905
906/**
907 * check_lpt_type - check LPT node type is correct.
908 * @c: UBIFS file-system description object
909 * @addr: address of type bit field is passed and returned updated here
910 * @pos: position of type bit field is passed and returned updated here
911 * @type: expected type
912 *
913 * This function returns %0 on success and a negative error code on failure.
914 */
915static int check_lpt_type(uint8_t **addr, int *pos, int type)
916{
917 int node_type;
918
919 node_type = ubifs_unpack_bits(addr, pos, UBIFS_LPT_TYPE_BITS);
920 if (node_type != type) {
921 ubifs_err("invalid type (%d) in LPT node type %d", node_type,
922 type);
923 dbg_dump_stack();
924 return -EINVAL;
925 }
926 return 0;
927}
928
929/**
930 * unpack_pnode - unpack a pnode.
931 * @c: UBIFS file-system description object
932 * @buf: buffer containing packed pnode to unpack
933 * @pnode: pnode structure to fill
934 *
935 * This function returns %0 on success and a negative error code on failure.
936 */
937static int unpack_pnode(struct ubifs_info *c, void *buf,
938 struct ubifs_pnode *pnode)
939{
940 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
941 int i, pos = 0, err;
942
943 err = check_lpt_type(&addr, &pos, UBIFS_LPT_PNODE);
944 if (err)
945 return err;
946 if (c->big_lpt)
947 pnode->num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits);
948 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
949 struct ubifs_lprops * const lprops = &pnode->lprops[i];
950
951 lprops->free = ubifs_unpack_bits(&addr, &pos, c->space_bits);
952 lprops->free <<= 3;
953 lprops->dirty = ubifs_unpack_bits(&addr, &pos, c->space_bits);
954 lprops->dirty <<= 3;
955
956 if (ubifs_unpack_bits(&addr, &pos, 1))
957 lprops->flags = LPROPS_INDEX;
958 else
959 lprops->flags = 0;
960 lprops->flags |= ubifs_categorize_lprops(c, lprops);
961 }
962 err = check_lpt_crc(buf, c->pnode_sz);
963 return err;
964}
965
966/**
967 * unpack_nnode - unpack a nnode.
968 * @c: UBIFS file-system description object
969 * @buf: buffer containing packed nnode to unpack
970 * @nnode: nnode structure to fill
971 *
972 * This function returns %0 on success and a negative error code on failure.
973 */
974static int unpack_nnode(struct ubifs_info *c, void *buf,
975 struct ubifs_nnode *nnode)
976{
977 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
978 int i, pos = 0, err;
979
980 err = check_lpt_type(&addr, &pos, UBIFS_LPT_NNODE);
981 if (err)
982 return err;
983 if (c->big_lpt)
984 nnode->num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits);
985 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
986 int lnum;
987
988 lnum = ubifs_unpack_bits(&addr, &pos, c->lpt_lnum_bits) +
989 c->lpt_first;
990 if (lnum == c->lpt_last + 1)
991 lnum = 0;
992 nnode->nbranch[i].lnum = lnum;
993 nnode->nbranch[i].offs = ubifs_unpack_bits(&addr, &pos,
994 c->lpt_offs_bits);
995 }
996 err = check_lpt_crc(buf, c->nnode_sz);
997 return err;
998}
999
1000/**
1001 * unpack_ltab - unpack the LPT's own lprops table.
1002 * @c: UBIFS file-system description object
1003 * @buf: buffer from which to unpack
1004 *
1005 * This function returns %0 on success and a negative error code on failure.
1006 */
1007static int unpack_ltab(struct ubifs_info *c, void *buf)
1008{
1009 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1010 int i, pos = 0, err;
1011
1012 err = check_lpt_type(&addr, &pos, UBIFS_LPT_LTAB);
1013 if (err)
1014 return err;
1015 for (i = 0; i < c->lpt_lebs; i++) {
1016 int free = ubifs_unpack_bits(&addr, &pos, c->lpt_spc_bits);
1017 int dirty = ubifs_unpack_bits(&addr, &pos, c->lpt_spc_bits);
1018
1019 if (free < 0 || free > c->leb_size || dirty < 0 ||
1020 dirty > c->leb_size || free + dirty > c->leb_size)
1021 return -EINVAL;
1022
1023 c->ltab[i].free = free;
1024 c->ltab[i].dirty = dirty;
1025 c->ltab[i].tgc = 0;
1026 c->ltab[i].cmt = 0;
1027 }
1028 err = check_lpt_crc(buf, c->ltab_sz);
1029 return err;
1030}
1031
1032/**
1033 * unpack_lsave - unpack the LPT's save table.
1034 * @c: UBIFS file-system description object
1035 * @buf: buffer from which to unpack
1036 *
1037 * This function returns %0 on success and a negative error code on failure.
1038 */
1039static int unpack_lsave(struct ubifs_info *c, void *buf)
1040{
1041 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1042 int i, pos = 0, err;
1043
1044 err = check_lpt_type(&addr, &pos, UBIFS_LPT_LSAVE);
1045 if (err)
1046 return err;
1047 for (i = 0; i < c->lsave_cnt; i++) {
1048 int lnum = ubifs_unpack_bits(&addr, &pos, c->lnum_bits);
1049
1050 if (lnum < c->main_first || lnum >= c->leb_cnt)
1051 return -EINVAL;
1052 c->lsave[i] = lnum;
1053 }
1054 err = check_lpt_crc(buf, c->lsave_sz);
1055 return err;
1056}
1057
1058/**
1059 * validate_nnode - validate a nnode.
1060 * @c: UBIFS file-system description object
1061 * @nnode: nnode to validate
1062 * @parent: parent nnode (or NULL for the root nnode)
1063 * @iip: index in parent
1064 *
1065 * This function returns %0 on success and a negative error code on failure.
1066 */
1067static int validate_nnode(struct ubifs_info *c, struct ubifs_nnode *nnode,
1068 struct ubifs_nnode *parent, int iip)
1069{
1070 int i, lvl, max_offs;
1071
1072 if (c->big_lpt) {
1073 int num = calc_nnode_num_from_parent(c, parent, iip);
1074
1075 if (nnode->num != num)
1076 return -EINVAL;
1077 }
1078 lvl = parent ? parent->level - 1 : c->lpt_hght;
1079 if (lvl < 1)
1080 return -EINVAL;
1081 if (lvl == 1)
1082 max_offs = c->leb_size - c->pnode_sz;
1083 else
1084 max_offs = c->leb_size - c->nnode_sz;
1085 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1086 int lnum = nnode->nbranch[i].lnum;
1087 int offs = nnode->nbranch[i].offs;
1088
1089 if (lnum == 0) {
1090 if (offs != 0)
1091 return -EINVAL;
1092 continue;
1093 }
1094 if (lnum < c->lpt_first || lnum > c->lpt_last)
1095 return -EINVAL;
1096 if (offs < 0 || offs > max_offs)
1097 return -EINVAL;
1098 }
1099 return 0;
1100}
1101
1102/**
1103 * validate_pnode - validate a pnode.
1104 * @c: UBIFS file-system description object
1105 * @pnode: pnode to validate
1106 * @parent: parent nnode
1107 * @iip: index in parent
1108 *
1109 * This function returns %0 on success and a negative error code on failure.
1110 */
1111static int validate_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
1112 struct ubifs_nnode *parent, int iip)
1113{
1114 int i;
1115
1116 if (c->big_lpt) {
1117 int num = calc_pnode_num_from_parent(c, parent, iip);
1118
1119 if (pnode->num != num)
1120 return -EINVAL;
1121 }
1122 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1123 int free = pnode->lprops[i].free;
1124 int dirty = pnode->lprops[i].dirty;
1125
1126 if (free < 0 || free > c->leb_size || free % c->min_io_size ||
1127 (free & 7))
1128 return -EINVAL;
1129 if (dirty < 0 || dirty > c->leb_size || (dirty & 7))
1130 return -EINVAL;
1131 if (dirty + free > c->leb_size)
1132 return -EINVAL;
1133 }
1134 return 0;
1135}
1136
1137/**
1138 * set_pnode_lnum - set LEB numbers on a pnode.
1139 * @c: UBIFS file-system description object
1140 * @pnode: pnode to update
1141 *
1142 * This function calculates the LEB numbers for the LEB properties it contains
1143 * based on the pnode number.
1144 */
1145static void set_pnode_lnum(struct ubifs_info *c, struct ubifs_pnode *pnode)
1146{
1147 int i, lnum;
1148
1149 lnum = (pnode->num << UBIFS_LPT_FANOUT_SHIFT) + c->main_first;
1150 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1151 if (lnum >= c->leb_cnt)
1152 return;
1153 pnode->lprops[i].lnum = lnum++;
1154 }
1155}
1156
1157/**
1158 * ubifs_read_nnode - read a nnode from flash and link it to the tree in memory.
1159 * @c: UBIFS file-system description object
1160 * @parent: parent nnode (or NULL for the root)
1161 * @iip: index in parent
1162 *
1163 * This function returns %0 on success and a negative error code on failure.
1164 */
1165int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
1166{
1167 struct ubifs_nbranch *branch = NULL;
1168 struct ubifs_nnode *nnode = NULL;
1169 void *buf = c->lpt_nod_buf;
1170 int err, lnum, offs;
1171
1172 if (parent) {
1173 branch = &parent->nbranch[iip];
1174 lnum = branch->lnum;
1175 offs = branch->offs;
1176 } else {
1177 lnum = c->lpt_lnum;
1178 offs = c->lpt_offs;
1179 }
1180 nnode = kzalloc(sizeof(struct ubifs_nnode), GFP_NOFS);
1181 if (!nnode) {
1182 err = -ENOMEM;
1183 goto out;
1184 }
1185 if (lnum == 0) {
1186 /*
1187 * This nnode was not written which just means that the LEB
1188 * properties in the subtree below it describe empty LEBs. We
1189 * make the nnode as though we had read it, which in fact means
1190 * doing almost nothing.
1191 */
1192 if (c->big_lpt)
1193 nnode->num = calc_nnode_num_from_parent(c, parent, iip);
1194 } else {
1195 err = ubi_read(c->ubi, lnum, buf, offs, c->nnode_sz);
1196 if (err)
1197 goto out;
1198 err = unpack_nnode(c, buf, nnode);
1199 if (err)
1200 goto out;
1201 }
1202 err = validate_nnode(c, nnode, parent, iip);
1203 if (err)
1204 goto out;
1205 if (!c->big_lpt)
1206 nnode->num = calc_nnode_num_from_parent(c, parent, iip);
1207 if (parent) {
1208 branch->nnode = nnode;
1209 nnode->level = parent->level - 1;
1210 } else {
1211 c->nroot = nnode;
1212 nnode->level = c->lpt_hght;
1213 }
1214 nnode->parent = parent;
1215 nnode->iip = iip;
1216 return 0;
1217
1218out:
1219 ubifs_err("error %d reading nnode at %d:%d", err, lnum, offs);
1220 kfree(nnode);
1221 return err;
1222}
1223
1224/**
1225 * read_pnode - read a pnode from flash and link it to the tree in memory.
1226 * @c: UBIFS file-system description object
1227 * @parent: parent nnode
1228 * @iip: index in parent
1229 *
1230 * This function returns %0 on success and a negative error code on failure.
1231 */
1232static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
1233{
1234 struct ubifs_nbranch *branch;
1235 struct ubifs_pnode *pnode = NULL;
1236 void *buf = c->lpt_nod_buf;
1237 int err, lnum, offs;
1238
1239 branch = &parent->nbranch[iip];
1240 lnum = branch->lnum;
1241 offs = branch->offs;
1242 pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_NOFS);
1243 if (!pnode) {
1244 err = -ENOMEM;
1245 goto out;
1246 }
1247 if (lnum == 0) {
1248 /*
1249 * This pnode was not written which just means that the LEB
1250 * properties in it describe empty LEBs. We make the pnode as
1251 * though we had read it.
1252 */
1253 int i;
1254
1255 if (c->big_lpt)
1256 pnode->num = calc_pnode_num_from_parent(c, parent, iip);
1257 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1258 struct ubifs_lprops * const lprops = &pnode->lprops[i];
1259
1260 lprops->free = c->leb_size;
1261 lprops->flags = ubifs_categorize_lprops(c, lprops);
1262 }
1263 } else {
1264 err = ubi_read(c->ubi, lnum, buf, offs, c->pnode_sz);
1265 if (err)
1266 goto out;
1267 err = unpack_pnode(c, buf, pnode);
1268 if (err)
1269 goto out;
1270 }
1271 err = validate_pnode(c, pnode, parent, iip);
1272 if (err)
1273 goto out;
1274 if (!c->big_lpt)
1275 pnode->num = calc_pnode_num_from_parent(c, parent, iip);
1276 branch->pnode = pnode;
1277 pnode->parent = parent;
1278 pnode->iip = iip;
1279 set_pnode_lnum(c, pnode);
1280 c->pnodes_have += 1;
1281 return 0;
1282
1283out:
1284 ubifs_err("error %d reading pnode at %d:%d", err, lnum, offs);
1285 dbg_dump_pnode(c, pnode, parent, iip);
1286 dbg_msg("calc num: %d", calc_pnode_num_from_parent(c, parent, iip));
1287 kfree(pnode);
1288 return err;
1289}
1290
1291/**
1292 * read_ltab - read LPT's own lprops table.
1293 * @c: UBIFS file-system description object
1294 *
1295 * This function returns %0 on success and a negative error code on failure.
1296 */
1297static int read_ltab(struct ubifs_info *c)
1298{
1299 int err;
1300 void *buf;
1301
1302 buf = vmalloc(c->ltab_sz);
1303 if (!buf)
1304 return -ENOMEM;
1305 err = ubi_read(c->ubi, c->ltab_lnum, buf, c->ltab_offs, c->ltab_sz);
1306 if (err)
1307 goto out;
1308 err = unpack_ltab(c, buf);
1309out:
1310 vfree(buf);
1311 return err;
1312}
1313
1314/**
1315 * read_lsave - read LPT's save table.
1316 * @c: UBIFS file-system description object
1317 *
1318 * This function returns %0 on success and a negative error code on failure.
1319 */
1320static int read_lsave(struct ubifs_info *c)
1321{
1322 int err, i;
1323 void *buf;
1324
1325 buf = vmalloc(c->lsave_sz);
1326 if (!buf)
1327 return -ENOMEM;
1328 err = ubi_read(c->ubi, c->lsave_lnum, buf, c->lsave_offs, c->lsave_sz);
1329 if (err)
1330 goto out;
1331 err = unpack_lsave(c, buf);
1332 if (err)
1333 goto out;
1334 for (i = 0; i < c->lsave_cnt; i++) {
1335 int lnum = c->lsave[i];
1336
1337 /*
1338 * Due to automatic resizing, the values in the lsave table
1339 * could be beyond the volume size - just ignore them.
1340 */
1341 if (lnum >= c->leb_cnt)
1342 continue;
1343 ubifs_lpt_lookup(c, lnum);
1344 }
1345out:
1346 vfree(buf);
1347 return err;
1348}
1349
1350/**
1351 * ubifs_get_nnode - get a nnode.
1352 * @c: UBIFS file-system description object
1353 * @parent: parent nnode (or NULL for the root)
1354 * @iip: index in parent
1355 *
1356 * This function returns a pointer to the nnode on success or a negative error
1357 * code on failure.
1358 */
1359struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c,
1360 struct ubifs_nnode *parent, int iip)
1361{
1362 struct ubifs_nbranch *branch;
1363 struct ubifs_nnode *nnode;
1364 int err;
1365
1366 branch = &parent->nbranch[iip];
1367 nnode = branch->nnode;
1368 if (nnode)
1369 return nnode;
1370 err = ubifs_read_nnode(c, parent, iip);
1371 if (err)
1372 return ERR_PTR(err);
1373 return branch->nnode;
1374}
1375
1376/**
1377 * ubifs_get_pnode - get a pnode.
1378 * @c: UBIFS file-system description object
1379 * @parent: parent nnode
1380 * @iip: index in parent
1381 *
1382 * This function returns a pointer to the pnode on success or a negative error
1383 * code on failure.
1384 */
1385struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c,
1386 struct ubifs_nnode *parent, int iip)
1387{
1388 struct ubifs_nbranch *branch;
1389 struct ubifs_pnode *pnode;
1390 int err;
1391
1392 branch = &parent->nbranch[iip];
1393 pnode = branch->pnode;
1394 if (pnode)
1395 return pnode;
1396 err = read_pnode(c, parent, iip);
1397 if (err)
1398 return ERR_PTR(err);
1399 update_cats(c, branch->pnode);
1400 return branch->pnode;
1401}
1402
1403/**
1404 * ubifs_lpt_lookup - lookup LEB properties in the LPT.
1405 * @c: UBIFS file-system description object
1406 * @lnum: LEB number to lookup
1407 *
1408 * This function returns a pointer to the LEB properties on success or a
1409 * negative error code on failure.
1410 */
1411struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum)
1412{
1413 int err, i, h, iip, shft;
1414 struct ubifs_nnode *nnode;
1415 struct ubifs_pnode *pnode;
1416
1417 if (!c->nroot) {
1418 err = ubifs_read_nnode(c, NULL, 0);
1419 if (err)
1420 return ERR_PTR(err);
1421 }
1422 nnode = c->nroot;
1423 i = lnum - c->main_first;
1424 shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
1425 for (h = 1; h < c->lpt_hght; h++) {
1426 iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
1427 shft -= UBIFS_LPT_FANOUT_SHIFT;
1428 nnode = ubifs_get_nnode(c, nnode, iip);
1429 if (IS_ERR(nnode))
1430 return ERR_PTR(PTR_ERR(nnode));
1431 }
1432 iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
1433 shft -= UBIFS_LPT_FANOUT_SHIFT;
1434 pnode = ubifs_get_pnode(c, nnode, iip);
1435 if (IS_ERR(pnode))
1436 return ERR_PTR(PTR_ERR(pnode));
1437 iip = (i & (UBIFS_LPT_FANOUT - 1));
1438 dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum,
1439 pnode->lprops[iip].free, pnode->lprops[iip].dirty,
1440 pnode->lprops[iip].flags);
1441 return &pnode->lprops[iip];
1442}
1443
1444/**
1445 * dirty_cow_nnode - ensure a nnode is not being committed.
1446 * @c: UBIFS file-system description object
1447 * @nnode: nnode to check
1448 *
1449 * Returns dirtied nnode on success or negative error code on failure.
1450 */
1451static struct ubifs_nnode *dirty_cow_nnode(struct ubifs_info *c,
1452 struct ubifs_nnode *nnode)
1453{
1454 struct ubifs_nnode *n;
1455 int i;
1456
1457 if (!test_bit(COW_CNODE, &nnode->flags)) {
1458 /* nnode is not being committed */
1459 if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) {
1460 c->dirty_nn_cnt += 1;
1461 ubifs_add_nnode_dirt(c, nnode);
1462 }
1463 return nnode;
1464 }
1465
1466 /* nnode is being committed, so copy it */
1467 n = kmalloc(sizeof(struct ubifs_nnode), GFP_NOFS);
1468 if (unlikely(!n))
1469 return ERR_PTR(-ENOMEM);
1470
1471 memcpy(n, nnode, sizeof(struct ubifs_nnode));
1472 n->cnext = NULL;
1473 __set_bit(DIRTY_CNODE, &n->flags);
1474 __clear_bit(COW_CNODE, &n->flags);
1475
1476 /* The children now have new parent */
1477 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1478 struct ubifs_nbranch *branch = &n->nbranch[i];
1479
1480 if (branch->cnode)
1481 branch->cnode->parent = n;
1482 }
1483
1484 ubifs_assert(!test_bit(OBSOLETE_CNODE, &nnode->flags));
1485 __set_bit(OBSOLETE_CNODE, &nnode->flags);
1486
1487 c->dirty_nn_cnt += 1;
1488 ubifs_add_nnode_dirt(c, nnode);
1489 if (nnode->parent)
1490 nnode->parent->nbranch[n->iip].nnode = n;
1491 else
1492 c->nroot = n;
1493 return n;
1494}
1495
1496/**
1497 * dirty_cow_pnode - ensure a pnode is not being committed.
1498 * @c: UBIFS file-system description object
1499 * @pnode: pnode to check
1500 *
1501 * Returns dirtied pnode on success or negative error code on failure.
1502 */
1503static struct ubifs_pnode *dirty_cow_pnode(struct ubifs_info *c,
1504 struct ubifs_pnode *pnode)
1505{
1506 struct ubifs_pnode *p;
1507
1508 if (!test_bit(COW_CNODE, &pnode->flags)) {
1509 /* pnode is not being committed */
1510 if (!test_and_set_bit(DIRTY_CNODE, &pnode->flags)) {
1511 c->dirty_pn_cnt += 1;
1512 add_pnode_dirt(c, pnode);
1513 }
1514 return pnode;
1515 }
1516
1517 /* pnode is being committed, so copy it */
1518 p = kmalloc(sizeof(struct ubifs_pnode), GFP_NOFS);
1519 if (unlikely(!p))
1520 return ERR_PTR(-ENOMEM);
1521
1522 memcpy(p, pnode, sizeof(struct ubifs_pnode));
1523 p->cnext = NULL;
1524 __set_bit(DIRTY_CNODE, &p->flags);
1525 __clear_bit(COW_CNODE, &p->flags);
1526 replace_cats(c, pnode, p);
1527
1528 ubifs_assert(!test_bit(OBSOLETE_CNODE, &pnode->flags));
1529 __set_bit(OBSOLETE_CNODE, &pnode->flags);
1530
1531 c->dirty_pn_cnt += 1;
1532 add_pnode_dirt(c, pnode);
1533 pnode->parent->nbranch[p->iip].pnode = p;
1534 return p;
1535}
1536
1537/**
1538 * ubifs_lpt_lookup_dirty - lookup LEB properties in the LPT.
1539 * @c: UBIFS file-system description object
1540 * @lnum: LEB number to lookup
1541 *
1542 * This function returns a pointer to the LEB properties on success or a
1543 * negative error code on failure.
1544 */
1545struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum)
1546{
1547 int err, i, h, iip, shft;
1548 struct ubifs_nnode *nnode;
1549 struct ubifs_pnode *pnode;
1550
1551 if (!c->nroot) {
1552 err = ubifs_read_nnode(c, NULL, 0);
1553 if (err)
1554 return ERR_PTR(err);
1555 }
1556 nnode = c->nroot;
1557 nnode = dirty_cow_nnode(c, nnode);
1558 if (IS_ERR(nnode))
1559 return ERR_PTR(PTR_ERR(nnode));
1560 i = lnum - c->main_first;
1561 shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
1562 for (h = 1; h < c->lpt_hght; h++) {
1563 iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
1564 shft -= UBIFS_LPT_FANOUT_SHIFT;
1565 nnode = ubifs_get_nnode(c, nnode, iip);
1566 if (IS_ERR(nnode))
1567 return ERR_PTR(PTR_ERR(nnode));
1568 nnode = dirty_cow_nnode(c, nnode);
1569 if (IS_ERR(nnode))
1570 return ERR_PTR(PTR_ERR(nnode));
1571 }
1572 iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
1573 shft -= UBIFS_LPT_FANOUT_SHIFT;
1574 pnode = ubifs_get_pnode(c, nnode, iip);
1575 if (IS_ERR(pnode))
1576 return ERR_PTR(PTR_ERR(pnode));
1577 pnode = dirty_cow_pnode(c, pnode);
1578 if (IS_ERR(pnode))
1579 return ERR_PTR(PTR_ERR(pnode));
1580 iip = (i & (UBIFS_LPT_FANOUT - 1));
1581 dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum,
1582 pnode->lprops[iip].free, pnode->lprops[iip].dirty,
1583 pnode->lprops[iip].flags);
1584 ubifs_assert(test_bit(DIRTY_CNODE, &pnode->flags));
1585 return &pnode->lprops[iip];
1586}
1587
1588/**
1589 * lpt_init_rd - initialize the LPT for reading.
1590 * @c: UBIFS file-system description object
1591 *
1592 * This function returns %0 on success and a negative error code on failure.
1593 */
1594static int lpt_init_rd(struct ubifs_info *c)
1595{
1596 int err, i;
1597
1598 c->ltab = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
1599 if (!c->ltab)
1600 return -ENOMEM;
1601
1602 i = max_t(int, c->nnode_sz, c->pnode_sz);
1603 c->lpt_nod_buf = kmalloc(i, GFP_KERNEL);
1604 if (!c->lpt_nod_buf)
1605 return -ENOMEM;
1606
1607 for (i = 0; i < LPROPS_HEAP_CNT; i++) {
1608 c->lpt_heap[i].arr = kmalloc(sizeof(void *) * LPT_HEAP_SZ,
1609 GFP_KERNEL);
1610 if (!c->lpt_heap[i].arr)
1611 return -ENOMEM;
1612 c->lpt_heap[i].cnt = 0;
1613 c->lpt_heap[i].max_cnt = LPT_HEAP_SZ;
1614 }
1615
1616 c->dirty_idx.arr = kmalloc(sizeof(void *) * LPT_HEAP_SZ, GFP_KERNEL);
1617 if (!c->dirty_idx.arr)
1618 return -ENOMEM;
1619 c->dirty_idx.cnt = 0;
1620 c->dirty_idx.max_cnt = LPT_HEAP_SZ;
1621
1622 err = read_ltab(c);
1623 if (err)
1624 return err;
1625
1626 dbg_lp("space_bits %d", c->space_bits);
1627 dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits);
1628 dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits);
1629 dbg_lp("lpt_spc_bits %d", c->lpt_spc_bits);
1630 dbg_lp("pcnt_bits %d", c->pcnt_bits);
1631 dbg_lp("lnum_bits %d", c->lnum_bits);
1632 dbg_lp("pnode_sz %d", c->pnode_sz);
1633 dbg_lp("nnode_sz %d", c->nnode_sz);
1634 dbg_lp("ltab_sz %d", c->ltab_sz);
1635 dbg_lp("lsave_sz %d", c->lsave_sz);
1636 dbg_lp("lsave_cnt %d", c->lsave_cnt);
1637 dbg_lp("lpt_hght %d", c->lpt_hght);
1638 dbg_lp("big_lpt %d", c->big_lpt);
1639 dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs);
1640 dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs);
1641 dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs);
1642 if (c->big_lpt)
1643 dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs);
1644
1645 return 0;
1646}
1647
1648/**
1649 * lpt_init_wr - initialize the LPT for writing.
1650 * @c: UBIFS file-system description object
1651 *
1652 * 'lpt_init_rd()' must have been called already.
1653 *
1654 * This function returns %0 on success and a negative error code on failure.
1655 */
1656static int lpt_init_wr(struct ubifs_info *c)
1657{
1658 int err, i;
1659
1660 c->ltab_cmt = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
1661 if (!c->ltab_cmt)
1662 return -ENOMEM;
1663
1664 c->lpt_buf = vmalloc(c->leb_size);
1665 if (!c->lpt_buf)
1666 return -ENOMEM;
1667
1668 if (c->big_lpt) {
1669 c->lsave = kmalloc(sizeof(int) * c->lsave_cnt, GFP_NOFS);
1670 if (!c->lsave)
1671 return -ENOMEM;
1672 err = read_lsave(c);
1673 if (err)
1674 return err;
1675 }
1676
1677 for (i = 0; i < c->lpt_lebs; i++)
1678 if (c->ltab[i].free == c->leb_size) {
1679 err = ubifs_leb_unmap(c, i + c->lpt_first);
1680 if (err)
1681 return err;
1682 }
1683
1684 return 0;
1685}
1686
/**
 * ubifs_lpt_init - initialize the LPT.
 * @c: UBIFS file-system description object
 * @rd: whether to initialize lpt for reading
 * @wr: whether to initialize lpt for writing
 *
 * For mounting 'rw', @rd and @wr are both true. For mounting 'ro', @rd is true
 * and @wr is false. For mounting from 'ro' to 'rw', @rd is false and @wr is
 * true. The read part, when requested, is always done before the write part.
 *
 * This function returns %0 on success and a negative error code on failure.
 */
int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr)
{
	int ret = 0;

	if (rd)
		ret = lpt_init_rd(c);
	if (!ret && wr)
		ret = lpt_init_wr(c);

	return ret;
}
1717
/**
 * struct lpt_scan_node - somewhere to put nodes while we scan LPT.
 * @nnode: where to keep a nnode
 * @pnode: where to keep a pnode
 * @cnode: where to keep a cnode
 * @in_tree: is the node in the tree in memory
 * @ptr.nnode: pointer to the nnode (if it is an nnode) which may be here or in
 *             the tree
 * @ptr.pnode: ditto for pnode
 * @ptr.cnode: ditto for cnode
 *
 * The scan keeps one of these per tree level. When a node is not already in
 * the in-memory tree, it is unpacked into the embedded storage and @ptr is
 * pointed at that storage; otherwise @ptr refers to the node in the tree.
 */
struct lpt_scan_node {
	/* Embedded storage for a node that is not in the in-memory tree */
	union {
		struct ubifs_nnode nnode;
		struct ubifs_pnode pnode;
		struct ubifs_cnode cnode;
	};
	/* Non-zero when @ptr refers to a node in the in-memory tree */
	int in_tree;
	/* The node for this level, wherever it currently lives */
	union {
		struct ubifs_nnode *nnode;
		struct ubifs_pnode *pnode;
		struct ubifs_cnode *cnode;
	} ptr;
};
1742
1743/**
1744 * scan_get_nnode - for the scan, get a nnode from either the tree or flash.
1745 * @c: the UBIFS file-system description object
1746 * @path: where to put the nnode
1747 * @parent: parent of the nnode
1748 * @iip: index in parent of the nnode
1749 *
1750 * This function returns a pointer to the nnode on success or a negative error
1751 * code on failure.
1752 */
1753static struct ubifs_nnode *scan_get_nnode(struct ubifs_info *c,
1754 struct lpt_scan_node *path,
1755 struct ubifs_nnode *parent, int iip)
1756{
1757 struct ubifs_nbranch *branch;
1758 struct ubifs_nnode *nnode;
1759 void *buf = c->lpt_nod_buf;
1760 int err;
1761
1762 branch = &parent->nbranch[iip];
1763 nnode = branch->nnode;
1764 if (nnode) {
1765 path->in_tree = 1;
1766 path->ptr.nnode = nnode;
1767 return nnode;
1768 }
1769 nnode = &path->nnode;
1770 path->in_tree = 0;
1771 path->ptr.nnode = nnode;
1772 memset(nnode, 0, sizeof(struct ubifs_nnode));
1773 if (branch->lnum == 0) {
1774 /*
1775 * This nnode was not written which just means that the LEB
1776 * properties in the subtree below it describe empty LEBs. We
1777 * make the nnode as though we had read it, which in fact means
1778 * doing almost nothing.
1779 */
1780 if (c->big_lpt)
1781 nnode->num = calc_nnode_num_from_parent(c, parent, iip);
1782 } else {
1783 err = ubi_read(c->ubi, branch->lnum, buf, branch->offs,
1784 c->nnode_sz);
1785 if (err)
1786 return ERR_PTR(err);
1787 err = unpack_nnode(c, buf, nnode);
1788 if (err)
1789 return ERR_PTR(err);
1790 }
1791 err = validate_nnode(c, nnode, parent, iip);
1792 if (err)
1793 return ERR_PTR(err);
1794 if (!c->big_lpt)
1795 nnode->num = calc_nnode_num_from_parent(c, parent, iip);
1796 nnode->level = parent->level - 1;
1797 nnode->parent = parent;
1798 nnode->iip = iip;
1799 return nnode;
1800}
1801
1802/**
1803 * scan_get_pnode - for the scan, get a pnode from either the tree or flash.
1804 * @c: the UBIFS file-system description object
1805 * @path: where to put the pnode
1806 * @parent: parent of the pnode
1807 * @iip: index in parent of the pnode
1808 *
1809 * This function returns a pointer to the pnode on success or a negative error
1810 * code on failure.
1811 */
1812static struct ubifs_pnode *scan_get_pnode(struct ubifs_info *c,
1813 struct lpt_scan_node *path,
1814 struct ubifs_nnode *parent, int iip)
1815{
1816 struct ubifs_nbranch *branch;
1817 struct ubifs_pnode *pnode;
1818 void *buf = c->lpt_nod_buf;
1819 int err;
1820
1821 branch = &parent->nbranch[iip];
1822 pnode = branch->pnode;
1823 if (pnode) {
1824 path->in_tree = 1;
1825 path->ptr.pnode = pnode;
1826 return pnode;
1827 }
1828 pnode = &path->pnode;
1829 path->in_tree = 0;
1830 path->ptr.pnode = pnode;
1831 memset(pnode, 0, sizeof(struct ubifs_pnode));
1832 if (branch->lnum == 0) {
1833 /*
1834 * This pnode was not written which just means that the LEB
1835 * properties in it describe empty LEBs. We make the pnode as
1836 * though we had read it.
1837 */
1838 int i;
1839
1840 if (c->big_lpt)
1841 pnode->num = calc_pnode_num_from_parent(c, parent, iip);
1842 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1843 struct ubifs_lprops * const lprops = &pnode->lprops[i];
1844
1845 lprops->free = c->leb_size;
1846 lprops->flags = ubifs_categorize_lprops(c, lprops);
1847 }
1848 } else {
1849 ubifs_assert(branch->lnum >= c->lpt_first &&
1850 branch->lnum <= c->lpt_last);
1851 ubifs_assert(branch->offs >= 0 && branch->offs < c->leb_size);
1852 err = ubi_read(c->ubi, branch->lnum, buf, branch->offs,
1853 c->pnode_sz);
1854 if (err)
1855 return ERR_PTR(err);
1856 err = unpack_pnode(c, buf, pnode);
1857 if (err)
1858 return ERR_PTR(err);
1859 }
1860 err = validate_pnode(c, pnode, parent, iip);
1861 if (err)
1862 return ERR_PTR(err);
1863 if (!c->big_lpt)
1864 pnode->num = calc_pnode_num_from_parent(c, parent, iip);
1865 pnode->parent = parent;
1866 pnode->iip = iip;
1867 set_pnode_lnum(c, pnode);
1868 return pnode;
1869}
1870
/**
 * ubifs_lpt_scan_nolock - scan the LPT.
 * @c: the UBIFS file-system description object
 * @start_lnum: LEB number from which to start scanning
 * @end_lnum: LEB number at which to stop scanning
 * @scan_cb: callback function called for each lprops
 * @data: data to be passed to the callback function
 *
 * The scan visits LEB properties in increasing LEB number order starting at
 * @start_lnum, wrapping around to @c->main_first after the last LEB, and ends
 * after @end_lnum has been visited. A @start_lnum of %-1 means "start at the
 * LEB following @end_lnum".
 *
 * For each lprops @scan_cb is called. A negative return value aborts the scan
 * and is returned to the caller. If %LPT_SCAN_ADD is set in the return value,
 * the nodes on the current path that are not yet in the in-memory tree are
 * copied into it. If %LPT_SCAN_STOP is set, the scan ends successfully.
 *
 * This function returns %0 on success and a negative error code on failure
 * (%-ENOSPC if @end_lnum was reached without the callback stopping the scan).
 */
int ubifs_lpt_scan_nolock(struct ubifs_info *c, int start_lnum, int end_lnum,
			  ubifs_lpt_scan_callback scan_cb, void *data)
{
	int err = 0, i, h, iip, shft;
	struct ubifs_nnode *nnode;
	struct ubifs_pnode *pnode;
	struct lpt_scan_node *path;

	/* -1 asks to begin right after end_lnum, wrapping if needed */
	if (start_lnum == -1) {
		start_lnum = end_lnum + 1;
		if (start_lnum >= c->leb_cnt)
			start_lnum = c->main_first;
	}

	ubifs_assert(start_lnum >= c->main_first && start_lnum < c->leb_cnt);
	ubifs_assert(end_lnum >= c->main_first && end_lnum < c->leb_cnt);

	if (!c->nroot) {
		err = ubifs_read_nnode(c, NULL, 0);
		if (err)
			return err;
	}

	/* One scan node per level: index 0 is the root, index lpt_hght the pnode */
	path = kmalloc(sizeof(struct lpt_scan_node) * (c->lpt_hght + 1),
		       GFP_NOFS);
	if (!path)
		return -ENOMEM;

	path[0].ptr.nnode = c->nroot;
	path[0].in_tree = 1;
again:
	/* Descend to the pnode containing start_lnum */
	nnode = c->nroot;
	i = start_lnum - c->main_first;
	shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
	for (h = 1; h < c->lpt_hght; h++) {
		iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
		shft -= UBIFS_LPT_FANOUT_SHIFT;
		nnode = scan_get_nnode(c, path + h, nnode, iip);
		if (IS_ERR(nnode)) {
			err = PTR_ERR(nnode);
			goto out;
		}
	}
	/* Here h == c->lpt_hght, the path slot used for the pnode */
	iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
	shft -= UBIFS_LPT_FANOUT_SHIFT;
	pnode = scan_get_pnode(c, path + h, nnode, iip);
	if (IS_ERR(pnode)) {
		err = PTR_ERR(pnode);
		goto out;
	}
	/* Index of start_lnum's entry inside the pnode */
	iip = (i & (UBIFS_LPT_FANOUT - 1));

	/* Loop for each lprops */
	while (1) {
		struct ubifs_lprops *lprops = &pnode->lprops[iip];
		int ret, lnum = lprops->lnum;

		ret = scan_cb(c, lprops, path[h].in_tree, data);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		if (ret & LPT_SCAN_ADD) {
			/* Add all the nodes in path to the tree in memory */
			for (h = 1; h < c->lpt_hght; h++) {
				const size_t sz = sizeof(struct ubifs_nnode);
				struct ubifs_nnode *parent;

				if (path[h].in_tree)
					continue;
				nnode = kmalloc(sz, GFP_NOFS);
				if (!nnode) {
					err = -ENOMEM;
					goto out;
				}
				memcpy(nnode, &path[h].nnode, sz);
				parent = nnode->parent;
				parent->nbranch[nnode->iip].nnode = nnode;
				path[h].ptr.nnode = nnode;
				path[h].in_tree = 1;
				/* The next level's scratch node now hangs off the copy */
				path[h + 1].cnode.parent = nnode;
			}
			/* The loop above leaves h == c->lpt_hght again */
			if (path[h].in_tree)
				ubifs_ensure_cat(c, lprops);
			else {
				const size_t sz = sizeof(struct ubifs_pnode);
				struct ubifs_nnode *parent;

				pnode = kmalloc(sz, GFP_NOFS);
				if (!pnode) {
					err = -ENOMEM;
					goto out;
				}
				memcpy(pnode, &path[h].pnode, sz);
				parent = pnode->parent;
				parent->nbranch[pnode->iip].pnode = pnode;
				path[h].ptr.pnode = pnode;
				path[h].in_tree = 1;
				update_cats(c, pnode);
				c->pnodes_have += 1;
			}
			err = dbg_check_lpt_nodes(c, (struct ubifs_cnode *)
						  c->nroot, 0, 0);
			if (err)
				goto out;
			err = dbg_check_cats(c);
			if (err)
				goto out;
		}
		if (ret & LPT_SCAN_STOP) {
			err = 0;
			break;
		}
		/* Get the next lprops */
		if (lnum == end_lnum) {
			/*
			 * We got to the end without finding what we were
			 * looking for
			 */
			err = -ENOSPC;
			goto out;
		}
		if (lnum + 1 >= c->leb_cnt) {
			/* Wrap-around to the beginning */
			start_lnum = c->main_first;
			goto again;
		}
		if (iip + 1 < UBIFS_LPT_FANOUT) {
			/* Next lprops is in the same pnode */
			iip += 1;
			continue;
		}
		/* We need to get the next pnode. Go up until we can go right */
		iip = pnode->iip;
		while (1) {
			h -= 1;
			ubifs_assert(h >= 0);
			nnode = path[h].ptr.nnode;
			if (iip + 1 < UBIFS_LPT_FANOUT)
				break;
			iip = nnode->iip;
		}
		/* Go right */
		iip += 1;
		/* Descend to the pnode */
		h += 1;
		for (; h < c->lpt_hght; h++) {
			nnode = scan_get_nnode(c, path + h, nnode, iip);
			if (IS_ERR(nnode)) {
				err = PTR_ERR(nnode);
				goto out;
			}
			/* Below the turn, always take the leftmost branch */
			iip = 0;
		}
		pnode = scan_get_pnode(c, path + h, nnode, iip);
		if (IS_ERR(pnode)) {
			err = PTR_ERR(pnode);
			goto out;
		}
		iip = 0;
	}
out:
	kfree(path);
	return err;
}
2047
2048#ifdef CONFIG_UBIFS_FS_DEBUG
2049
/**
 * dbg_chk_pnode - check a pnode.
 * @c: the UBIFS file-system description object
 * @pnode: pnode to check
 * @col: pnode column
 *
 * Verifies that the pnode number equals @col and, for every LEB properties
 * entry whose LEB number is below @c->leb_cnt, that the LEB number is the
 * expected one, that the category is compatible with the %LPROPS_TAKEN and
 * %LPROPS_INDEX flags, that the entry is present in the heap or list of its
 * category, and that the free/dirty amounts fit the category.
 *
 * This function returns %0 on success and a negative error code on failure.
 */
static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
			 int col)
{
	int i;

	if (pnode->num != col) {
		dbg_err("pnode num %d expected %d parent num %d iip %d",
			pnode->num, col, pnode->parent->num, pnode->iip);
		return -EINVAL;
	}
	for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
		struct ubifs_lprops *lp, *lprops = &pnode->lprops[i];
		int lnum = (pnode->num << UBIFS_LPT_FANOUT_SHIFT) + i +
			   c->main_first;
		int found, cat = lprops->flags & LPROPS_CAT_MASK;
		struct ubifs_lpt_heap *heap;
		struct list_head *list = NULL;

		/* Entries past the last LEB are not checked */
		if (lnum >= c->leb_cnt)
			continue;
		if (lprops->lnum != lnum) {
			dbg_err("bad LEB number %d expected %d",
				lprops->lnum, lnum);
			return -EINVAL;
		}
		/* A taken LEB must be uncategorized; nothing else is checked */
		if (lprops->flags & LPROPS_TAKEN) {
			if (cat != LPROPS_UNCAT) {
				dbg_err("LEB %d taken but not uncat %d",
					lprops->lnum, cat);
				return -EINVAL;
			}
			continue;
		}
		/* The category must match the index / non-index kind of LEB */
		if (lprops->flags & LPROPS_INDEX) {
			switch (cat) {
			case LPROPS_UNCAT:
			case LPROPS_DIRTY_IDX:
			case LPROPS_FRDI_IDX:
				break;
			default:
				dbg_err("LEB %d index but cat %d",
					lprops->lnum, cat);
				return -EINVAL;
			}
		} else {
			switch (cat) {
			case LPROPS_UNCAT:
			case LPROPS_DIRTY:
			case LPROPS_FREE:
			case LPROPS_EMPTY:
			case LPROPS_FREEABLE:
				break;
			default:
				dbg_err("LEB %d not index but cat %d",
					lprops->lnum, cat);
				return -EINVAL;
			}
		}
		/* Pick the list for list-based categories (NULL for heaps) */
		switch (cat) {
		case LPROPS_UNCAT:
			list = &c->uncat_list;
			break;
		case LPROPS_EMPTY:
			list = &c->empty_list;
			break;
		case LPROPS_FREEABLE:
			list = &c->freeable_list;
			break;
		case LPROPS_FRDI_IDX:
			list = &c->frdi_idx_list;
			break;
		}
		/* Make sure the entry really is in its heap or list */
		found = 0;
		switch (cat) {
		case LPROPS_DIRTY:
		case LPROPS_DIRTY_IDX:
		case LPROPS_FREE:
			heap = &c->lpt_heap[cat - 1];
			if (lprops->hpos < heap->cnt &&
			    heap->arr[lprops->hpos] == lprops)
				found = 1;
			break;
		case LPROPS_UNCAT:
		case LPROPS_EMPTY:
		case LPROPS_FREEABLE:
		case LPROPS_FRDI_IDX:
			list_for_each_entry(lp, list, list)
				if (lprops == lp) {
					found = 1;
					break;
				}
			break;
		}
		if (!found) {
			dbg_err("LEB %d cat %d not found in cat heap/list",
				lprops->lnum, cat);
			return -EINVAL;
		}
		/* Check the space accounting the category implies */
		switch (cat) {
		case LPROPS_EMPTY:
			if (lprops->free != c->leb_size) {
				dbg_err("LEB %d cat %d free %d dirty %d",
					lprops->lnum, cat, lprops->free,
					lprops->dirty);
				return -EINVAL;
			}
			/* fall through - the free + dirty check below also runs */
		case LPROPS_FREEABLE:
		case LPROPS_FRDI_IDX:
			if (lprops->free + lprops->dirty != c->leb_size) {
				dbg_err("LEB %d cat %d free %d dirty %d",
					lprops->lnum, cat, lprops->free,
					lprops->dirty);
				return -EINVAL;
			}
		}
	}
	return 0;
}
2176
/**
 * dbg_check_lpt_nodes - check nnodes and pnodes.
 * @c: the UBIFS file-system description object
 * @cnode: next cnode (nnode or pnode) to check
 * @row: row of cnode (root is zero)
 * @col: column of cnode (leftmost is zero)
 *
 * Walks the in-memory part of the tree below @cnode without recursion,
 * checking that each nnode has the number expected for its row and column and
 * passing each pnode to 'dbg_chk_pnode()'. Nothing is checked unless
 * %UBIFS_CHK_LPROPS is set in 'ubifs_chk_flags'.
 *
 * This function returns %0 on success and a negative error code on failure.
 */
int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
			int row, int col)
{
	struct ubifs_nnode *nnode, *nn;
	struct ubifs_cnode *cn;
	int num, iip = 0, err;

	if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
		return 0;

	while (cnode) {
		ubifs_assert(row >= 0);
		/* Remember the parent for the "go up" step at the bottom */
		nnode = cnode->parent;
		if (cnode->level) {
			/* cnode is a nnode */
			num = calc_nnode_num(row, col);
			if (cnode->num != num) {
				dbg_err("nnode num %d expected %d "
					"parent num %d iip %d", cnode->num, num,
					(nnode ? nnode->num : 0), cnode->iip);
				return -EINVAL;
			}
			nn = (struct ubifs_nnode *)cnode;
			/* Look for the next in-memory child, starting at iip */
			while (iip < UBIFS_LPT_FANOUT) {
				cn = nn->nbranch[iip].cnode;
				if (cn) {
					/* Go down */
					row += 1;
					col <<= UBIFS_LPT_FANOUT_SHIFT;
					col += iip;
					iip = 0;
					cnode = cn;
					break;
				}
				/* Go right */
				iip += 1;
			}
			/* A child was found: check it on the next iteration */
			if (iip < UBIFS_LPT_FANOUT)
				continue;
		} else {
			struct ubifs_pnode *pnode;

			/* cnode is a pnode */
			pnode = (struct ubifs_pnode *)cnode;
			err = dbg_chk_pnode(c, pnode, col);
			if (err)
				return err;
		}
		/* Go up and to the right */
		row -= 1;
		col >>= UBIFS_LPT_FANOUT_SHIFT;
		iip = cnode->iip + 1;
		cnode = (struct ubifs_cnode *)nnode;
	}
	return 0;
}
2242
2243#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
new file mode 100644
index 000000000000..5f0b83e20af6
--- /dev/null
+++ b/fs/ubifs/lpt_commit.c
@@ -0,0 +1,1648 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Adrian Hunter
20 * Artem Bityutskiy (Битюцкий Артём)
21 */
22
23/*
24 * This file implements commit-related functionality of the LEB properties
25 * subsystem.
26 */
27
28#include <linux/crc16.h>
29#include "ubifs.h"
30
31/**
32 * first_dirty_cnode - find first dirty cnode.
33 * @c: UBIFS file-system description object
34 * @nnode: nnode at which to start
35 *
36 * This function returns the first dirty cnode or %NULL if there is not one.
37 */
38static struct ubifs_cnode *first_dirty_cnode(struct ubifs_nnode *nnode)
39{
40 ubifs_assert(nnode);
41 while (1) {
42 int i, cont = 0;
43
44 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
45 struct ubifs_cnode *cnode;
46
47 cnode = nnode->nbranch[i].cnode;
48 if (cnode &&
49 test_bit(DIRTY_CNODE, &cnode->flags)) {
50 if (cnode->level == 0)
51 return cnode;
52 nnode = (struct ubifs_nnode *)cnode;
53 cont = 1;
54 break;
55 }
56 }
57 if (!cont)
58 return (struct ubifs_cnode *)nnode;
59 }
60}
61
62/**
63 * next_dirty_cnode - find next dirty cnode.
64 * @cnode: cnode from which to begin searching
65 *
66 * This function returns the next dirty cnode or %NULL if there is not one.
67 */
68static struct ubifs_cnode *next_dirty_cnode(struct ubifs_cnode *cnode)
69{
70 struct ubifs_nnode *nnode;
71 int i;
72
73 ubifs_assert(cnode);
74 nnode = cnode->parent;
75 if (!nnode)
76 return NULL;
77 for (i = cnode->iip + 1; i < UBIFS_LPT_FANOUT; i++) {
78 cnode = nnode->nbranch[i].cnode;
79 if (cnode && test_bit(DIRTY_CNODE, &cnode->flags)) {
80 if (cnode->level == 0)
81 return cnode; /* cnode is a pnode */
82 /* cnode is a nnode */
83 return first_dirty_cnode((struct ubifs_nnode *)cnode);
84 }
85 }
86 return (struct ubifs_cnode *)nnode;
87}
88
89/**
90 * get_cnodes_to_commit - create list of dirty cnodes to commit.
91 * @c: UBIFS file-system description object
92 *
93 * This function returns the number of cnodes to commit.
94 */
95static int get_cnodes_to_commit(struct ubifs_info *c)
96{
97 struct ubifs_cnode *cnode, *cnext;
98 int cnt = 0;
99
100 if (!c->nroot)
101 return 0;
102
103 if (!test_bit(DIRTY_CNODE, &c->nroot->flags))
104 return 0;
105
106 c->lpt_cnext = first_dirty_cnode(c->nroot);
107 cnode = c->lpt_cnext;
108 if (!cnode)
109 return 0;
110 cnt += 1;
111 while (1) {
112 ubifs_assert(!test_bit(COW_ZNODE, &cnode->flags));
113 __set_bit(COW_ZNODE, &cnode->flags);
114 cnext = next_dirty_cnode(cnode);
115 if (!cnext) {
116 cnode->cnext = c->lpt_cnext;
117 break;
118 }
119 cnode->cnext = cnext;
120 cnode = cnext;
121 cnt += 1;
122 }
123 dbg_cmt("committing %d cnodes", cnt);
124 dbg_lp("committing %d cnodes", cnt);
125 ubifs_assert(cnt == c->dirty_nn_cnt + c->dirty_pn_cnt);
126 return cnt;
127}
128
129/**
130 * upd_ltab - update LPT LEB properties.
131 * @c: UBIFS file-system description object
132 * @lnum: LEB number
133 * @free: amount of free space
134 * @dirty: amount of dirty space to add
135 */
136static void upd_ltab(struct ubifs_info *c, int lnum, int free, int dirty)
137{
138 dbg_lp("LEB %d free %d dirty %d to %d +%d",
139 lnum, c->ltab[lnum - c->lpt_first].free,
140 c->ltab[lnum - c->lpt_first].dirty, free, dirty);
141 ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last);
142 c->ltab[lnum - c->lpt_first].free = free;
143 c->ltab[lnum - c->lpt_first].dirty += dirty;
144}
145
146/**
147 * alloc_lpt_leb - allocate an LPT LEB that is empty.
148 * @c: UBIFS file-system description object
149 * @lnum: LEB number is passed and returned here
150 *
151 * This function finds the next empty LEB in the ltab starting from @lnum. If a
152 * an empty LEB is found it is returned in @lnum and the function returns %0.
153 * Otherwise the function returns -ENOSPC. Note however, that LPT is designed
154 * never to run out of space.
155 */
156static int alloc_lpt_leb(struct ubifs_info *c, int *lnum)
157{
158 int i, n;
159
160 n = *lnum - c->lpt_first + 1;
161 for (i = n; i < c->lpt_lebs; i++) {
162 if (c->ltab[i].tgc || c->ltab[i].cmt)
163 continue;
164 if (c->ltab[i].free == c->leb_size) {
165 c->ltab[i].cmt = 1;
166 *lnum = i + c->lpt_first;
167 return 0;
168 }
169 }
170
171 for (i = 0; i < n; i++) {
172 if (c->ltab[i].tgc || c->ltab[i].cmt)
173 continue;
174 if (c->ltab[i].free == c->leb_size) {
175 c->ltab[i].cmt = 1;
176 *lnum = i + c->lpt_first;
177 return 0;
178 }
179 }
180 dbg_err("last LEB %d", *lnum);
181 dump_stack();
182 return -ENOSPC;
183}
184
185/**
186 * layout_cnodes - layout cnodes for commit.
187 * @c: UBIFS file-system description object
188 *
189 * This function returns %0 on success and a negative error code on failure.
190 */
191static int layout_cnodes(struct ubifs_info *c)
192{
193 int lnum, offs, len, alen, done_lsave, done_ltab, err;
194 struct ubifs_cnode *cnode;
195
196 cnode = c->lpt_cnext;
197 if (!cnode)
198 return 0;
199 lnum = c->nhead_lnum;
200 offs = c->nhead_offs;
201 /* Try to place lsave and ltab nicely */
202 done_lsave = !c->big_lpt;
203 done_ltab = 0;
204 if (!done_lsave && offs + c->lsave_sz <= c->leb_size) {
205 done_lsave = 1;
206 c->lsave_lnum = lnum;
207 c->lsave_offs = offs;
208 offs += c->lsave_sz;
209 }
210
211 if (offs + c->ltab_sz <= c->leb_size) {
212 done_ltab = 1;
213 c->ltab_lnum = lnum;
214 c->ltab_offs = offs;
215 offs += c->ltab_sz;
216 }
217
218 do {
219 if (cnode->level) {
220 len = c->nnode_sz;
221 c->dirty_nn_cnt -= 1;
222 } else {
223 len = c->pnode_sz;
224 c->dirty_pn_cnt -= 1;
225 }
226 while (offs + len > c->leb_size) {
227 alen = ALIGN(offs, c->min_io_size);
228 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
229 err = alloc_lpt_leb(c, &lnum);
230 if (err)
231 return err;
232 offs = 0;
233 ubifs_assert(lnum >= c->lpt_first &&
234 lnum <= c->lpt_last);
235 /* Try to place lsave and ltab nicely */
236 if (!done_lsave) {
237 done_lsave = 1;
238 c->lsave_lnum = lnum;
239 c->lsave_offs = offs;
240 offs += c->lsave_sz;
241 continue;
242 }
243 if (!done_ltab) {
244 done_ltab = 1;
245 c->ltab_lnum = lnum;
246 c->ltab_offs = offs;
247 offs += c->ltab_sz;
248 continue;
249 }
250 break;
251 }
252 if (cnode->parent) {
253 cnode->parent->nbranch[cnode->iip].lnum = lnum;
254 cnode->parent->nbranch[cnode->iip].offs = offs;
255 } else {
256 c->lpt_lnum = lnum;
257 c->lpt_offs = offs;
258 }
259 offs += len;
260 cnode = cnode->cnext;
261 } while (cnode && cnode != c->lpt_cnext);
262
263 /* Make sure to place LPT's save table */
264 if (!done_lsave) {
265 if (offs + c->lsave_sz > c->leb_size) {
266 alen = ALIGN(offs, c->min_io_size);
267 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
268 err = alloc_lpt_leb(c, &lnum);
269 if (err)
270 return err;
271 offs = 0;
272 ubifs_assert(lnum >= c->lpt_first &&
273 lnum <= c->lpt_last);
274 }
275 done_lsave = 1;
276 c->lsave_lnum = lnum;
277 c->lsave_offs = offs;
278 offs += c->lsave_sz;
279 }
280
281 /* Make sure to place LPT's own lprops table */
282 if (!done_ltab) {
283 if (offs + c->ltab_sz > c->leb_size) {
284 alen = ALIGN(offs, c->min_io_size);
285 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
286 err = alloc_lpt_leb(c, &lnum);
287 if (err)
288 return err;
289 offs = 0;
290 ubifs_assert(lnum >= c->lpt_first &&
291 lnum <= c->lpt_last);
292 }
293 done_ltab = 1;
294 c->ltab_lnum = lnum;
295 c->ltab_offs = offs;
296 offs += c->ltab_sz;
297 }
298
299 alen = ALIGN(offs, c->min_io_size);
300 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
301 return 0;
302}
303
304/**
305 * realloc_lpt_leb - allocate an LPT LEB that is empty.
306 * @c: UBIFS file-system description object
307 * @lnum: LEB number is passed and returned here
308 *
309 * This function duplicates exactly the results of the function alloc_lpt_leb.
310 * It is used during end commit to reallocate the same LEB numbers that were
311 * allocated by alloc_lpt_leb during start commit.
312 *
313 * This function finds the next LEB that was allocated by the alloc_lpt_leb
314 * function starting from @lnum. If a LEB is found it is returned in @lnum and
315 * the function returns %0. Otherwise the function returns -ENOSPC.
316 * Note however, that LPT is designed never to run out of space.
317 */
318static int realloc_lpt_leb(struct ubifs_info *c, int *lnum)
319{
320 int i, n;
321
322 n = *lnum - c->lpt_first + 1;
323 for (i = n; i < c->lpt_lebs; i++)
324 if (c->ltab[i].cmt) {
325 c->ltab[i].cmt = 0;
326 *lnum = i + c->lpt_first;
327 return 0;
328 }
329
330 for (i = 0; i < n; i++)
331 if (c->ltab[i].cmt) {
332 c->ltab[i].cmt = 0;
333 *lnum = i + c->lpt_first;
334 return 0;
335 }
336 dbg_err("last LEB %d", *lnum);
337 dump_stack();
338 return -ENOSPC;
339}
340
341/**
342 * write_cnodes - write cnodes for commit.
343 * @c: UBIFS file-system description object
344 *
345 * This function returns %0 on success and a negative error code on failure.
346 */
347static int write_cnodes(struct ubifs_info *c)
348{
349 int lnum, offs, len, from, err, wlen, alen, done_ltab, done_lsave;
350 struct ubifs_cnode *cnode;
351 void *buf = c->lpt_buf;
352
353 cnode = c->lpt_cnext;
354 if (!cnode)
355 return 0;
356 lnum = c->nhead_lnum;
357 offs = c->nhead_offs;
358 from = offs;
359 /* Ensure empty LEB is unmapped */
360 if (offs == 0) {
361 err = ubifs_leb_unmap(c, lnum);
362 if (err)
363 return err;
364 }
365 /* Try to place lsave and ltab nicely */
366 done_lsave = !c->big_lpt;
367 done_ltab = 0;
368 if (!done_lsave && offs + c->lsave_sz <= c->leb_size) {
369 done_lsave = 1;
370 ubifs_pack_lsave(c, buf + offs, c->lsave);
371 offs += c->lsave_sz;
372 }
373
374 if (offs + c->ltab_sz <= c->leb_size) {
375 done_ltab = 1;
376 ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
377 offs += c->ltab_sz;
378 }
379
380 /* Loop for each cnode */
381 do {
382 if (cnode->level)
383 len = c->nnode_sz;
384 else
385 len = c->pnode_sz;
386 while (offs + len > c->leb_size) {
387 wlen = offs - from;
388 if (wlen) {
389 alen = ALIGN(wlen, c->min_io_size);
390 memset(buf + offs, 0xff, alen - wlen);
391 err = ubifs_leb_write(c, lnum, buf + from, from,
392 alen, UBI_SHORTTERM);
393 if (err)
394 return err;
395 }
396 err = realloc_lpt_leb(c, &lnum);
397 if (err)
398 return err;
399 offs = 0;
400 from = 0;
401 ubifs_assert(lnum >= c->lpt_first &&
402 lnum <= c->lpt_last);
403 err = ubifs_leb_unmap(c, lnum);
404 if (err)
405 return err;
406 /* Try to place lsave and ltab nicely */
407 if (!done_lsave) {
408 done_lsave = 1;
409 ubifs_pack_lsave(c, buf + offs, c->lsave);
410 offs += c->lsave_sz;
411 continue;
412 }
413 if (!done_ltab) {
414 done_ltab = 1;
415 ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
416 offs += c->ltab_sz;
417 continue;
418 }
419 break;
420 }
421 if (cnode->level)
422 ubifs_pack_nnode(c, buf + offs,
423 (struct ubifs_nnode *)cnode);
424 else
425 ubifs_pack_pnode(c, buf + offs,
426 (struct ubifs_pnode *)cnode);
427 /*
428 * The reason for the barriers is the same as in case of TNC.
429 * See comment in 'write_index()'. 'dirty_cow_nnode()' and
430 * 'dirty_cow_pnode()' are the functions for which this is
431 * important.
432 */
433 clear_bit(DIRTY_CNODE, &cnode->flags);
434 smp_mb__before_clear_bit();
435 clear_bit(COW_ZNODE, &cnode->flags);
436 smp_mb__after_clear_bit();
437 offs += len;
438 cnode = cnode->cnext;
439 } while (cnode && cnode != c->lpt_cnext);
440
441 /* Make sure to place LPT's save table */
442 if (!done_lsave) {
443 if (offs + c->lsave_sz > c->leb_size) {
444 wlen = offs - from;
445 alen = ALIGN(wlen, c->min_io_size);
446 memset(buf + offs, 0xff, alen - wlen);
447 err = ubifs_leb_write(c, lnum, buf + from, from, alen,
448 UBI_SHORTTERM);
449 if (err)
450 return err;
451 err = realloc_lpt_leb(c, &lnum);
452 if (err)
453 return err;
454 offs = 0;
455 ubifs_assert(lnum >= c->lpt_first &&
456 lnum <= c->lpt_last);
457 err = ubifs_leb_unmap(c, lnum);
458 if (err)
459 return err;
460 }
461 done_lsave = 1;
462 ubifs_pack_lsave(c, buf + offs, c->lsave);
463 offs += c->lsave_sz;
464 }
465
466 /* Make sure to place LPT's own lprops table */
467 if (!done_ltab) {
468 if (offs + c->ltab_sz > c->leb_size) {
469 wlen = offs - from;
470 alen = ALIGN(wlen, c->min_io_size);
471 memset(buf + offs, 0xff, alen - wlen);
472 err = ubifs_leb_write(c, lnum, buf + from, from, alen,
473 UBI_SHORTTERM);
474 if (err)
475 return err;
476 err = realloc_lpt_leb(c, &lnum);
477 if (err)
478 return err;
479 offs = 0;
480 ubifs_assert(lnum >= c->lpt_first &&
481 lnum <= c->lpt_last);
482 err = ubifs_leb_unmap(c, lnum);
483 if (err)
484 return err;
485 }
486 done_ltab = 1;
487 ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
488 offs += c->ltab_sz;
489 }
490
491 /* Write remaining data in buffer */
492 wlen = offs - from;
493 alen = ALIGN(wlen, c->min_io_size);
494 memset(buf + offs, 0xff, alen - wlen);
495 err = ubifs_leb_write(c, lnum, buf + from, from, alen, UBI_SHORTTERM);
496 if (err)
497 return err;
498 c->nhead_lnum = lnum;
499 c->nhead_offs = ALIGN(offs, c->min_io_size);
500
501 dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs);
502 dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs);
503 dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs);
504 if (c->big_lpt)
505 dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs);
506 return 0;
507}
508
509/**
510 * next_pnode - find next pnode.
511 * @c: UBIFS file-system description object
512 * @pnode: pnode
513 *
514 * This function returns the next pnode or %NULL if there are no more pnodes.
515 */
516static struct ubifs_pnode *next_pnode(struct ubifs_info *c,
517 struct ubifs_pnode *pnode)
518{
519 struct ubifs_nnode *nnode;
520 int iip;
521
522 /* Try to go right */
523 nnode = pnode->parent;
524 iip = pnode->iip + 1;
525 if (iip < UBIFS_LPT_FANOUT) {
526 /* We assume here that LEB zero is never an LPT LEB */
527 if (nnode->nbranch[iip].lnum)
528 return ubifs_get_pnode(c, nnode, iip);
529 else
530 return NULL;
531 }
532
533 /* Go up while can't go right */
534 do {
535 iip = nnode->iip + 1;
536 nnode = nnode->parent;
537 if (!nnode)
538 return NULL;
539 /* We assume here that LEB zero is never an LPT LEB */
540 } while (iip >= UBIFS_LPT_FANOUT || !nnode->nbranch[iip].lnum);
541
542 /* Go right */
543 nnode = ubifs_get_nnode(c, nnode, iip);
544 if (IS_ERR(nnode))
545 return (void *)nnode;
546
547 /* Go down to level 1 */
548 while (nnode->level > 1) {
549 nnode = ubifs_get_nnode(c, nnode, 0);
550 if (IS_ERR(nnode))
551 return (void *)nnode;
552 }
553
554 return ubifs_get_pnode(c, nnode, 0);
555}
556
557/**
558 * pnode_lookup - lookup a pnode in the LPT.
559 * @c: UBIFS file-system description object
560 * @i: pnode number (0 to main_lebs - 1)
561 *
562 * This function returns a pointer to the pnode on success or a negative
563 * error code on failure.
564 */
565static struct ubifs_pnode *pnode_lookup(struct ubifs_info *c, int i)
566{
567 int err, h, iip, shft;
568 struct ubifs_nnode *nnode;
569
570 if (!c->nroot) {
571 err = ubifs_read_nnode(c, NULL, 0);
572 if (err)
573 return ERR_PTR(err);
574 }
575 i <<= UBIFS_LPT_FANOUT_SHIFT;
576 nnode = c->nroot;
577 shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
578 for (h = 1; h < c->lpt_hght; h++) {
579 iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
580 shft -= UBIFS_LPT_FANOUT_SHIFT;
581 nnode = ubifs_get_nnode(c, nnode, iip);
582 if (IS_ERR(nnode))
583 return ERR_PTR(PTR_ERR(nnode));
584 }
585 iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
586 return ubifs_get_pnode(c, nnode, iip);
587}
588
589/**
590 * add_pnode_dirt - add dirty space to LPT LEB properties.
591 * @c: UBIFS file-system description object
592 * @pnode: pnode for which to add dirt
593 */
594static void add_pnode_dirt(struct ubifs_info *c, struct ubifs_pnode *pnode)
595{
596 ubifs_add_lpt_dirt(c, pnode->parent->nbranch[pnode->iip].lnum,
597 c->pnode_sz);
598}
599
600/**
601 * do_make_pnode_dirty - mark a pnode dirty.
602 * @c: UBIFS file-system description object
603 * @pnode: pnode to mark dirty
604 */
605static void do_make_pnode_dirty(struct ubifs_info *c, struct ubifs_pnode *pnode)
606{
607 /* Assumes cnext list is empty i.e. not called during commit */
608 if (!test_and_set_bit(DIRTY_CNODE, &pnode->flags)) {
609 struct ubifs_nnode *nnode;
610
611 c->dirty_pn_cnt += 1;
612 add_pnode_dirt(c, pnode);
613 /* Mark parent and ancestors dirty too */
614 nnode = pnode->parent;
615 while (nnode) {
616 if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) {
617 c->dirty_nn_cnt += 1;
618 ubifs_add_nnode_dirt(c, nnode);
619 nnode = nnode->parent;
620 } else
621 break;
622 }
623 }
624}
625
/**
 * make_tree_dirty - mark the entire LEB properties tree dirty.
 * @c: UBIFS file-system description object
 *
 * This function is used by the "small" LPT model to cause the entire LEB
 * properties tree to be written. The "small" LPT model does not use LPT
 * garbage collection because it is more efficient to write the entire tree
 * (because it is small).
 *
 * Every pnode is visited in turn, starting from pnode 0, and marked dirty
 * together with its ancestors.
 *
 * This function returns %0 on success and a negative error code on failure.
 */
static int make_tree_dirty(struct ubifs_info *c)
{
	struct ubifs_pnode *pnode;

	pnode = pnode_lookup(c, 0);
	/* pnode_lookup() reports failure as an error pointer, not as NULL */
	if (IS_ERR(pnode))
		return PTR_ERR(pnode);

	while (pnode) {
		do_make_pnode_dirty(c, pnode);
		pnode = next_pnode(c, pnode);
		if (IS_ERR(pnode))
			return PTR_ERR(pnode);
	}
	return 0;
}
650
651/**
652 * need_write_all - determine if the LPT area is running out of free space.
653 * @c: UBIFS file-system description object
654 *
655 * This function returns %1 if the LPT area is running out of free space and %0
656 * if it is not.
657 */
658static int need_write_all(struct ubifs_info *c)
659{
660 long long free = 0;
661 int i;
662
663 for (i = 0; i < c->lpt_lebs; i++) {
664 if (i + c->lpt_first == c->nhead_lnum)
665 free += c->leb_size - c->nhead_offs;
666 else if (c->ltab[i].free == c->leb_size)
667 free += c->leb_size;
668 else if (c->ltab[i].free + c->ltab[i].dirty == c->leb_size)
669 free += c->leb_size;
670 }
671 /* Less than twice the size left */
672 if (free <= c->lpt_sz * 2)
673 return 1;
674 return 0;
675}
676
677/**
678 * lpt_tgc_start - start trivial garbage collection of LPT LEBs.
679 * @c: UBIFS file-system description object
680 *
681 * LPT trivial garbage collection is where a LPT LEB contains only dirty and
682 * free space and so may be reused as soon as the next commit is completed.
683 * This function is called during start commit to mark LPT LEBs for trivial GC.
684 */
685static void lpt_tgc_start(struct ubifs_info *c)
686{
687 int i;
688
689 for (i = 0; i < c->lpt_lebs; i++) {
690 if (i + c->lpt_first == c->nhead_lnum)
691 continue;
692 if (c->ltab[i].dirty > 0 &&
693 c->ltab[i].free + c->ltab[i].dirty == c->leb_size) {
694 c->ltab[i].tgc = 1;
695 c->ltab[i].free = c->leb_size;
696 c->ltab[i].dirty = 0;
697 dbg_lp("LEB %d", i + c->lpt_first);
698 }
699 }
700}
701
702/**
703 * lpt_tgc_end - end trivial garbage collection of LPT LEBs.
704 * @c: UBIFS file-system description object
705 *
706 * LPT trivial garbage collection is where a LPT LEB contains only dirty and
707 * free space and so may be reused as soon as the next commit is completed.
708 * This function is called after the commit is completed (master node has been
709 * written) and unmaps LPT LEBs that were marked for trivial GC.
710 */
711static int lpt_tgc_end(struct ubifs_info *c)
712{
713 int i, err;
714
715 for (i = 0; i < c->lpt_lebs; i++)
716 if (c->ltab[i].tgc) {
717 err = ubifs_leb_unmap(c, i + c->lpt_first);
718 if (err)
719 return err;
720 c->ltab[i].tgc = 0;
721 dbg_lp("LEB %d", i + c->lpt_first);
722 }
723 return 0;
724}
725
726/**
727 * populate_lsave - fill the lsave array with important LEB numbers.
728 * @c: the UBIFS file-system description object
729 *
730 * This function is only called for the "big" model. It records a small number
731 * of LEB numbers of important LEBs. Important LEBs are ones that are (from
732 * most important to least important): empty, freeable, freeable index, dirty
733 * index, dirty or free. Upon mount, we read this list of LEB numbers and bring
734 * their pnodes into memory. That will stop us from having to scan the LPT
735 * straight away. For the "small" model we assume that scanning the LPT is no
736 * big deal.
737 */
738static void populate_lsave(struct ubifs_info *c)
739{
740 struct ubifs_lprops *lprops;
741 struct ubifs_lpt_heap *heap;
742 int i, cnt = 0;
743
744 ubifs_assert(c->big_lpt);
745 if (!(c->lpt_drty_flgs & LSAVE_DIRTY)) {
746 c->lpt_drty_flgs |= LSAVE_DIRTY;
747 ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz);
748 }
749 list_for_each_entry(lprops, &c->empty_list, list) {
750 c->lsave[cnt++] = lprops->lnum;
751 if (cnt >= c->lsave_cnt)
752 return;
753 }
754 list_for_each_entry(lprops, &c->freeable_list, list) {
755 c->lsave[cnt++] = lprops->lnum;
756 if (cnt >= c->lsave_cnt)
757 return;
758 }
759 list_for_each_entry(lprops, &c->frdi_idx_list, list) {
760 c->lsave[cnt++] = lprops->lnum;
761 if (cnt >= c->lsave_cnt)
762 return;
763 }
764 heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
765 for (i = 0; i < heap->cnt; i++) {
766 c->lsave[cnt++] = heap->arr[i]->lnum;
767 if (cnt >= c->lsave_cnt)
768 return;
769 }
770 heap = &c->lpt_heap[LPROPS_DIRTY - 1];
771 for (i = 0; i < heap->cnt; i++) {
772 c->lsave[cnt++] = heap->arr[i]->lnum;
773 if (cnt >= c->lsave_cnt)
774 return;
775 }
776 heap = &c->lpt_heap[LPROPS_FREE - 1];
777 for (i = 0; i < heap->cnt; i++) {
778 c->lsave[cnt++] = heap->arr[i]->lnum;
779 if (cnt >= c->lsave_cnt)
780 return;
781 }
782 /* Fill it up completely */
783 while (cnt < c->lsave_cnt)
784 c->lsave[cnt++] = c->main_first;
785}
786
787/**
788 * nnode_lookup - lookup a nnode in the LPT.
789 * @c: UBIFS file-system description object
790 * @i: nnode number
791 *
792 * This function returns a pointer to the nnode on success or a negative
793 * error code on failure.
794 */
795static struct ubifs_nnode *nnode_lookup(struct ubifs_info *c, int i)
796{
797 int err, iip;
798 struct ubifs_nnode *nnode;
799
800 if (!c->nroot) {
801 err = ubifs_read_nnode(c, NULL, 0);
802 if (err)
803 return ERR_PTR(err);
804 }
805 nnode = c->nroot;
806 while (1) {
807 iip = i & (UBIFS_LPT_FANOUT - 1);
808 i >>= UBIFS_LPT_FANOUT_SHIFT;
809 if (!i)
810 break;
811 nnode = ubifs_get_nnode(c, nnode, iip);
812 if (IS_ERR(nnode))
813 return nnode;
814 }
815 return nnode;
816}
817
818/**
819 * make_nnode_dirty - find a nnode and, if found, make it dirty.
820 * @c: UBIFS file-system description object
821 * @node_num: nnode number of nnode to make dirty
822 * @lnum: LEB number where nnode was written
823 * @offs: offset where nnode was written
824 *
825 * This function is used by LPT garbage collection. LPT garbage collection is
826 * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection
827 * simply involves marking all the nodes in the LEB being garbage-collected as
828 * dirty. The dirty nodes are written next commit, after which the LEB is free
829 * to be reused.
830 *
831 * This function returns %0 on success and a negative error code on failure.
832 */
833static int make_nnode_dirty(struct ubifs_info *c, int node_num, int lnum,
834 int offs)
835{
836 struct ubifs_nnode *nnode;
837
838 nnode = nnode_lookup(c, node_num);
839 if (IS_ERR(nnode))
840 return PTR_ERR(nnode);
841 if (nnode->parent) {
842 struct ubifs_nbranch *branch;
843
844 branch = &nnode->parent->nbranch[nnode->iip];
845 if (branch->lnum != lnum || branch->offs != offs)
846 return 0; /* nnode is obsolete */
847 } else if (c->lpt_lnum != lnum || c->lpt_offs != offs)
848 return 0; /* nnode is obsolete */
849 /* Assumes cnext list is empty i.e. not called during commit */
850 if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) {
851 c->dirty_nn_cnt += 1;
852 ubifs_add_nnode_dirt(c, nnode);
853 /* Mark parent and ancestors dirty too */
854 nnode = nnode->parent;
855 while (nnode) {
856 if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) {
857 c->dirty_nn_cnt += 1;
858 ubifs_add_nnode_dirt(c, nnode);
859 nnode = nnode->parent;
860 } else
861 break;
862 }
863 }
864 return 0;
865}
866
867/**
868 * make_pnode_dirty - find a pnode and, if found, make it dirty.
869 * @c: UBIFS file-system description object
870 * @node_num: pnode number of pnode to make dirty
871 * @lnum: LEB number where pnode was written
872 * @offs: offset where pnode was written
873 *
874 * This function is used by LPT garbage collection. LPT garbage collection is
875 * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection
876 * simply involves marking all the nodes in the LEB being garbage-collected as
877 * dirty. The dirty nodes are written next commit, after which the LEB is free
878 * to be reused.
879 *
880 * This function returns %0 on success and a negative error code on failure.
881 */
882static int make_pnode_dirty(struct ubifs_info *c, int node_num, int lnum,
883 int offs)
884{
885 struct ubifs_pnode *pnode;
886 struct ubifs_nbranch *branch;
887
888 pnode = pnode_lookup(c, node_num);
889 if (IS_ERR(pnode))
890 return PTR_ERR(pnode);
891 branch = &pnode->parent->nbranch[pnode->iip];
892 if (branch->lnum != lnum || branch->offs != offs)
893 return 0;
894 do_make_pnode_dirty(c, pnode);
895 return 0;
896}
897
898/**
899 * make_ltab_dirty - make ltab node dirty.
900 * @c: UBIFS file-system description object
901 * @lnum: LEB number where ltab was written
902 * @offs: offset where ltab was written
903 *
904 * This function is used by LPT garbage collection. LPT garbage collection is
905 * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection
906 * simply involves marking all the nodes in the LEB being garbage-collected as
907 * dirty. The dirty nodes are written next commit, after which the LEB is free
908 * to be reused.
909 *
910 * This function returns %0 on success and a negative error code on failure.
911 */
912static int make_ltab_dirty(struct ubifs_info *c, int lnum, int offs)
913{
914 if (lnum != c->ltab_lnum || offs != c->ltab_offs)
915 return 0; /* This ltab node is obsolete */
916 if (!(c->lpt_drty_flgs & LTAB_DIRTY)) {
917 c->lpt_drty_flgs |= LTAB_DIRTY;
918 ubifs_add_lpt_dirt(c, c->ltab_lnum, c->ltab_sz);
919 }
920 return 0;
921}
922
923/**
924 * make_lsave_dirty - make lsave node dirty.
925 * @c: UBIFS file-system description object
926 * @lnum: LEB number where lsave was written
927 * @offs: offset where lsave was written
928 *
929 * This function is used by LPT garbage collection. LPT garbage collection is
930 * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection
931 * simply involves marking all the nodes in the LEB being garbage-collected as
932 * dirty. The dirty nodes are written next commit, after which the LEB is free
933 * to be reused.
934 *
935 * This function returns %0 on success and a negative error code on failure.
936 */
937static int make_lsave_dirty(struct ubifs_info *c, int lnum, int offs)
938{
939 if (lnum != c->lsave_lnum || offs != c->lsave_offs)
940 return 0; /* This lsave node is obsolete */
941 if (!(c->lpt_drty_flgs & LSAVE_DIRTY)) {
942 c->lpt_drty_flgs |= LSAVE_DIRTY;
943 ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz);
944 }
945 return 0;
946}
947
948/**
949 * make_node_dirty - make node dirty.
950 * @c: UBIFS file-system description object
951 * @node_type: LPT node type
952 * @node_num: node number
953 * @lnum: LEB number where node was written
954 * @offs: offset where node was written
955 *
956 * This function is used by LPT garbage collection. LPT garbage collection is
957 * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection
958 * simply involves marking all the nodes in the LEB being garbage-collected as
959 * dirty. The dirty nodes are written next commit, after which the LEB is free
960 * to be reused.
961 *
962 * This function returns %0 on success and a negative error code on failure.
963 */
964static int make_node_dirty(struct ubifs_info *c, int node_type, int node_num,
965 int lnum, int offs)
966{
967 switch (node_type) {
968 case UBIFS_LPT_NNODE:
969 return make_nnode_dirty(c, node_num, lnum, offs);
970 case UBIFS_LPT_PNODE:
971 return make_pnode_dirty(c, node_num, lnum, offs);
972 case UBIFS_LPT_LTAB:
973 return make_ltab_dirty(c, lnum, offs);
974 case UBIFS_LPT_LSAVE:
975 return make_lsave_dirty(c, lnum, offs);
976 }
977 return -EINVAL;
978}
979
980/**
981 * get_lpt_node_len - return the length of a node based on its type.
982 * @c: UBIFS file-system description object
983 * @node_type: LPT node type
984 */
985static int get_lpt_node_len(struct ubifs_info *c, int node_type)
986{
987 switch (node_type) {
988 case UBIFS_LPT_NNODE:
989 return c->nnode_sz;
990 case UBIFS_LPT_PNODE:
991 return c->pnode_sz;
992 case UBIFS_LPT_LTAB:
993 return c->ltab_sz;
994 case UBIFS_LPT_LSAVE:
995 return c->lsave_sz;
996 }
997 return 0;
998}
999
1000/**
1001 * get_pad_len - return the length of padding in a buffer.
1002 * @c: UBIFS file-system description object
1003 * @buf: buffer
1004 * @len: length of buffer
1005 */
1006static int get_pad_len(struct ubifs_info *c, uint8_t *buf, int len)
1007{
1008 int offs, pad_len;
1009
1010 if (c->min_io_size == 1)
1011 return 0;
1012 offs = c->leb_size - len;
1013 pad_len = ALIGN(offs, c->min_io_size) - offs;
1014 return pad_len;
1015}
1016
1017/**
1018 * get_lpt_node_type - return type (and node number) of a node in a buffer.
1019 * @c: UBIFS file-system description object
1020 * @buf: buffer
1021 * @node_num: node number is returned here
1022 */
1023static int get_lpt_node_type(struct ubifs_info *c, uint8_t *buf, int *node_num)
1024{
1025 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1026 int pos = 0, node_type;
1027
1028 node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS);
1029 *node_num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits);
1030 return node_type;
1031}
1032
1033/**
1034 * is_a_node - determine if a buffer contains a node.
1035 * @c: UBIFS file-system description object
1036 * @buf: buffer
1037 * @len: length of buffer
1038 *
1039 * This function returns %1 if the buffer contains a node or %0 if it does not.
1040 */
1041static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len)
1042{
1043 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1044 int pos = 0, node_type, node_len;
1045 uint16_t crc, calc_crc;
1046
1047 node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS);
1048 if (node_type == UBIFS_LPT_NOT_A_NODE)
1049 return 0;
1050 node_len = get_lpt_node_len(c, node_type);
1051 if (!node_len || node_len > len)
1052 return 0;
1053 pos = 0;
1054 addr = buf;
1055 crc = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_CRC_BITS);
1056 calc_crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
1057 node_len - UBIFS_LPT_CRC_BYTES);
1058 if (crc != calc_crc)
1059 return 0;
1060 return 1;
1061}
1062
1063
1064/**
1065 * lpt_gc_lnum - garbage collect a LPT LEB.
1066 * @c: UBIFS file-system description object
1067 * @lnum: LEB number to garbage collect
1068 *
1069 * LPT garbage collection is used only for the "big" LPT model
1070 * (c->big_lpt == 1). Garbage collection simply involves marking all the nodes
1071 * in the LEB being garbage-collected as dirty. The dirty nodes are written
1072 * next commit, after which the LEB is free to be reused.
1073 *
1074 * This function returns %0 on success and a negative error code on failure.
1075 */
1076static int lpt_gc_lnum(struct ubifs_info *c, int lnum)
1077{
1078 int err, len = c->leb_size, node_type, node_num, node_len, offs;
1079 void *buf = c->lpt_buf;
1080
1081 dbg_lp("LEB %d", lnum);
1082 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
1083 if (err) {
1084 ubifs_err("cannot read LEB %d, error %d", lnum, err);
1085 return err;
1086 }
1087 while (1) {
1088 if (!is_a_node(c, buf, len)) {
1089 int pad_len;
1090
1091 pad_len = get_pad_len(c, buf, len);
1092 if (pad_len) {
1093 buf += pad_len;
1094 len -= pad_len;
1095 continue;
1096 }
1097 return 0;
1098 }
1099 node_type = get_lpt_node_type(c, buf, &node_num);
1100 node_len = get_lpt_node_len(c, node_type);
1101 offs = c->leb_size - len;
1102 ubifs_assert(node_len != 0);
1103 mutex_lock(&c->lp_mutex);
1104 err = make_node_dirty(c, node_type, node_num, lnum, offs);
1105 mutex_unlock(&c->lp_mutex);
1106 if (err)
1107 return err;
1108 buf += node_len;
1109 len -= node_len;
1110 }
1111 return 0;
1112}
1113
1114/**
1115 * lpt_gc - LPT garbage collection.
1116 * @c: UBIFS file-system description object
1117 *
1118 * Select a LPT LEB for LPT garbage collection and call 'lpt_gc_lnum()'.
1119 * Returns %0 on success and a negative error code on failure.
1120 */
1121static int lpt_gc(struct ubifs_info *c)
1122{
1123 int i, lnum = -1, dirty = 0;
1124
1125 mutex_lock(&c->lp_mutex);
1126 for (i = 0; i < c->lpt_lebs; i++) {
1127 ubifs_assert(!c->ltab[i].tgc);
1128 if (i + c->lpt_first == c->nhead_lnum ||
1129 c->ltab[i].free + c->ltab[i].dirty == c->leb_size)
1130 continue;
1131 if (c->ltab[i].dirty > dirty) {
1132 dirty = c->ltab[i].dirty;
1133 lnum = i + c->lpt_first;
1134 }
1135 }
1136 mutex_unlock(&c->lp_mutex);
1137 if (lnum == -1)
1138 return -ENOSPC;
1139 return lpt_gc_lnum(c, lnum);
1140}
1141
1142/**
1143 * ubifs_lpt_start_commit - UBIFS commit starts.
1144 * @c: the UBIFS file-system description object
1145 *
1146 * This function has to be called when UBIFS starts the commit operation.
1147 * This function "freezes" all currently dirty LEB properties and does not
1148 * change them anymore. Further changes are saved and tracked separately
1149 * because they are not part of this commit. This function returns zero in case
1150 * of success and a negative error code in case of failure.
1151 */
1152int ubifs_lpt_start_commit(struct ubifs_info *c)
1153{
1154 int err, cnt;
1155
1156 dbg_lp("");
1157
1158 mutex_lock(&c->lp_mutex);
1159 err = dbg_check_ltab(c);
1160 if (err)
1161 goto out;
1162
1163 if (c->check_lpt_free) {
1164 /*
1165 * We ensure there is enough free space in
1166 * ubifs_lpt_post_commit() by marking nodes dirty. That
1167 * information is lost when we unmount, so we also need
1168 * to check free space once after mounting also.
1169 */
1170 c->check_lpt_free = 0;
1171 while (need_write_all(c)) {
1172 mutex_unlock(&c->lp_mutex);
1173 err = lpt_gc(c);
1174 if (err)
1175 return err;
1176 mutex_lock(&c->lp_mutex);
1177 }
1178 }
1179
1180 lpt_tgc_start(c);
1181
1182 if (!c->dirty_pn_cnt) {
1183 dbg_cmt("no cnodes to commit");
1184 err = 0;
1185 goto out;
1186 }
1187
1188 if (!c->big_lpt && need_write_all(c)) {
1189 /* If needed, write everything */
1190 err = make_tree_dirty(c);
1191 if (err)
1192 goto out;
1193 lpt_tgc_start(c);
1194 }
1195
1196 if (c->big_lpt)
1197 populate_lsave(c);
1198
1199 cnt = get_cnodes_to_commit(c);
1200 ubifs_assert(cnt != 0);
1201
1202 err = layout_cnodes(c);
1203 if (err)
1204 goto out;
1205
1206 /* Copy the LPT's own lprops for end commit to write */
1207 memcpy(c->ltab_cmt, c->ltab,
1208 sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
1209 c->lpt_drty_flgs &= ~(LTAB_DIRTY | LSAVE_DIRTY);
1210
1211out:
1212 mutex_unlock(&c->lp_mutex);
1213 return err;
1214}
1215
1216/**
1217 * free_obsolete_cnodes - free obsolete cnodes for commit end.
1218 * @c: UBIFS file-system description object
1219 */
1220static void free_obsolete_cnodes(struct ubifs_info *c)
1221{
1222 struct ubifs_cnode *cnode, *cnext;
1223
1224 cnext = c->lpt_cnext;
1225 if (!cnext)
1226 return;
1227 do {
1228 cnode = cnext;
1229 cnext = cnode->cnext;
1230 if (test_bit(OBSOLETE_CNODE, &cnode->flags))
1231 kfree(cnode);
1232 else
1233 cnode->cnext = NULL;
1234 } while (cnext != c->lpt_cnext);
1235 c->lpt_cnext = NULL;
1236}
1237
1238/**
1239 * ubifs_lpt_end_commit - finish the commit operation.
1240 * @c: the UBIFS file-system description object
1241 *
1242 * This function has to be called when the commit operation finishes. It
1243 * flushes the changes which were "frozen" by 'ubifs_lprops_start_commit()' to
1244 * the media. Returns zero in case of success and a negative error code in case
1245 * of failure.
1246 */
1247int ubifs_lpt_end_commit(struct ubifs_info *c)
1248{
1249 int err;
1250
1251 dbg_lp("");
1252
1253 if (!c->lpt_cnext)
1254 return 0;
1255
1256 err = write_cnodes(c);
1257 if (err)
1258 return err;
1259
1260 mutex_lock(&c->lp_mutex);
1261 free_obsolete_cnodes(c);
1262 mutex_unlock(&c->lp_mutex);
1263
1264 return 0;
1265}
1266
1267/**
1268 * ubifs_lpt_post_commit - post commit LPT trivial GC and LPT GC.
1269 * @c: UBIFS file-system description object
1270 *
1271 * LPT trivial GC is completed after a commit. Also LPT GC is done after a
1272 * commit for the "big" LPT model.
1273 */
1274int ubifs_lpt_post_commit(struct ubifs_info *c)
1275{
1276 int err;
1277
1278 mutex_lock(&c->lp_mutex);
1279 err = lpt_tgc_end(c);
1280 if (err)
1281 goto out;
1282 if (c->big_lpt)
1283 while (need_write_all(c)) {
1284 mutex_unlock(&c->lp_mutex);
1285 err = lpt_gc(c);
1286 if (err)
1287 return err;
1288 mutex_lock(&c->lp_mutex);
1289 }
1290out:
1291 mutex_unlock(&c->lp_mutex);
1292 return err;
1293}
1294
1295/**
1296 * first_nnode - find the first nnode in memory.
1297 * @c: UBIFS file-system description object
1298 * @hght: height of tree where nnode found is returned here
1299 *
1300 * This function returns a pointer to the nnode found or %NULL if no nnode is
1301 * found. This function is a helper to 'ubifs_lpt_free()'.
1302 */
1303static struct ubifs_nnode *first_nnode(struct ubifs_info *c, int *hght)
1304{
1305 struct ubifs_nnode *nnode;
1306 int h, i, found;
1307
1308 nnode = c->nroot;
1309 *hght = 0;
1310 if (!nnode)
1311 return NULL;
1312 for (h = 1; h < c->lpt_hght; h++) {
1313 found = 0;
1314 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1315 if (nnode->nbranch[i].nnode) {
1316 found = 1;
1317 nnode = nnode->nbranch[i].nnode;
1318 *hght = h;
1319 break;
1320 }
1321 }
1322 if (!found)
1323 break;
1324 }
1325 return nnode;
1326}
1327
1328/**
1329 * next_nnode - find the next nnode in memory.
1330 * @c: UBIFS file-system description object
1331 * @nnode: nnode from which to start.
1332 * @hght: height of tree where nnode is, is passed and returned here
1333 *
1334 * This function returns a pointer to the nnode found or %NULL if no nnode is
1335 * found. This function is a helper to 'ubifs_lpt_free()'.
1336 */
1337static struct ubifs_nnode *next_nnode(struct ubifs_info *c,
1338 struct ubifs_nnode *nnode, int *hght)
1339{
1340 struct ubifs_nnode *parent;
1341 int iip, h, i, found;
1342
1343 parent = nnode->parent;
1344 if (!parent)
1345 return NULL;
1346 if (nnode->iip == UBIFS_LPT_FANOUT - 1) {
1347 *hght -= 1;
1348 return parent;
1349 }
1350 for (iip = nnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) {
1351 nnode = parent->nbranch[iip].nnode;
1352 if (nnode)
1353 break;
1354 }
1355 if (!nnode) {
1356 *hght -= 1;
1357 return parent;
1358 }
1359 for (h = *hght + 1; h < c->lpt_hght; h++) {
1360 found = 0;
1361 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1362 if (nnode->nbranch[i].nnode) {
1363 found = 1;
1364 nnode = nnode->nbranch[i].nnode;
1365 *hght = h;
1366 break;
1367 }
1368 }
1369 if (!found)
1370 break;
1371 }
1372 return nnode;
1373}
1374
1375/**
1376 * ubifs_lpt_free - free resources owned by the LPT.
1377 * @c: UBIFS file-system description object
1378 * @wr_only: free only resources used for writing
1379 */
1380void ubifs_lpt_free(struct ubifs_info *c, int wr_only)
1381{
1382 struct ubifs_nnode *nnode;
1383 int i, hght;
1384
1385 /* Free write-only things first */
1386
1387 free_obsolete_cnodes(c); /* Leftover from a failed commit */
1388
1389 vfree(c->ltab_cmt);
1390 c->ltab_cmt = NULL;
1391 vfree(c->lpt_buf);
1392 c->lpt_buf = NULL;
1393 kfree(c->lsave);
1394 c->lsave = NULL;
1395
1396 if (wr_only)
1397 return;
1398
1399 /* Now free the rest */
1400
1401 nnode = first_nnode(c, &hght);
1402 while (nnode) {
1403 for (i = 0; i < UBIFS_LPT_FANOUT; i++)
1404 kfree(nnode->nbranch[i].nnode);
1405 nnode = next_nnode(c, nnode, &hght);
1406 }
1407 for (i = 0; i < LPROPS_HEAP_CNT; i++)
1408 kfree(c->lpt_heap[i].arr);
1409 kfree(c->dirty_idx.arr);
1410 kfree(c->nroot);
1411 vfree(c->ltab);
1412 kfree(c->lpt_nod_buf);
1413}
1414
1415#ifdef CONFIG_UBIFS_FS_DEBUG
1416
/**
 * dbg_is_all_ff - determine if a buffer contains only 0xff bytes.
 * @buf: buffer
 * @len: buffer length
 *
 * Returns %1 if every one of the first @len bytes of @buf is 0xff (which is
 * trivially the case when @len is not positive) and %0 otherwise.
 */
static int dbg_is_all_ff(uint8_t *buf, int len)
{
	uint8_t *byte = buf;
	int remaining;

	for (remaining = len; remaining > 0; remaining--, byte++) {
		if (*byte != 0xff)
			return 0;
	}

	return 1;
}
1431
1432/**
1433 * dbg_is_nnode_dirty - determine if a nnode is dirty.
1434 * @c: the UBIFS file-system description object
1435 * @lnum: LEB number where nnode was written
1436 * @offs: offset where nnode was written
1437 */
1438static int dbg_is_nnode_dirty(struct ubifs_info *c, int lnum, int offs)
1439{
1440 struct ubifs_nnode *nnode;
1441 int hght;
1442
1443 /* Entire tree is in memory so first_nnode / next_nnode are ok */
1444 nnode = first_nnode(c, &hght);
1445 for (; nnode; nnode = next_nnode(c, nnode, &hght)) {
1446 struct ubifs_nbranch *branch;
1447
1448 cond_resched();
1449 if (nnode->parent) {
1450 branch = &nnode->parent->nbranch[nnode->iip];
1451 if (branch->lnum != lnum || branch->offs != offs)
1452 continue;
1453 if (test_bit(DIRTY_CNODE, &nnode->flags))
1454 return 1;
1455 return 0;
1456 } else {
1457 if (c->lpt_lnum != lnum || c->lpt_offs != offs)
1458 continue;
1459 if (test_bit(DIRTY_CNODE, &nnode->flags))
1460 return 1;
1461 return 0;
1462 }
1463 }
1464 return 1;
1465}
1466
1467/**
1468 * dbg_is_pnode_dirty - determine if a pnode is dirty.
1469 * @c: the UBIFS file-system description object
1470 * @lnum: LEB number where pnode was written
1471 * @offs: offset where pnode was written
1472 */
1473static int dbg_is_pnode_dirty(struct ubifs_info *c, int lnum, int offs)
1474{
1475 int i, cnt;
1476
1477 cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT);
1478 for (i = 0; i < cnt; i++) {
1479 struct ubifs_pnode *pnode;
1480 struct ubifs_nbranch *branch;
1481
1482 cond_resched();
1483 pnode = pnode_lookup(c, i);
1484 if (IS_ERR(pnode))
1485 return PTR_ERR(pnode);
1486 branch = &pnode->parent->nbranch[pnode->iip];
1487 if (branch->lnum != lnum || branch->offs != offs)
1488 continue;
1489 if (test_bit(DIRTY_CNODE, &pnode->flags))
1490 return 1;
1491 return 0;
1492 }
1493 return 1;
1494}
1495
1496/**
1497 * dbg_is_ltab_dirty - determine if a ltab node is dirty.
1498 * @c: the UBIFS file-system description object
1499 * @lnum: LEB number where ltab node was written
1500 * @offs: offset where ltab node was written
1501 */
1502static int dbg_is_ltab_dirty(struct ubifs_info *c, int lnum, int offs)
1503{
1504 if (lnum != c->ltab_lnum || offs != c->ltab_offs)
1505 return 1;
1506 return (c->lpt_drty_flgs & LTAB_DIRTY) != 0;
1507}
1508
1509/**
1510 * dbg_is_lsave_dirty - determine if a lsave node is dirty.
1511 * @c: the UBIFS file-system description object
1512 * @lnum: LEB number where lsave node was written
1513 * @offs: offset where lsave node was written
1514 */
1515static int dbg_is_lsave_dirty(struct ubifs_info *c, int lnum, int offs)
1516{
1517 if (lnum != c->lsave_lnum || offs != c->lsave_offs)
1518 return 1;
1519 return (c->lpt_drty_flgs & LSAVE_DIRTY) != 0;
1520}
1521
1522/**
1523 * dbg_is_node_dirty - determine if a node is dirty.
1524 * @c: the UBIFS file-system description object
1525 * @node_type: node type
1526 * @lnum: LEB number where node was written
1527 * @offs: offset where node was written
1528 */
1529static int dbg_is_node_dirty(struct ubifs_info *c, int node_type, int lnum,
1530 int offs)
1531{
1532 switch (node_type) {
1533 case UBIFS_LPT_NNODE:
1534 return dbg_is_nnode_dirty(c, lnum, offs);
1535 case UBIFS_LPT_PNODE:
1536 return dbg_is_pnode_dirty(c, lnum, offs);
1537 case UBIFS_LPT_LTAB:
1538 return dbg_is_ltab_dirty(c, lnum, offs);
1539 case UBIFS_LPT_LSAVE:
1540 return dbg_is_lsave_dirty(c, lnum, offs);
1541 }
1542 return 1;
1543}
1544
1545/**
1546 * dbg_check_ltab_lnum - check the ltab for a LPT LEB number.
1547 * @c: the UBIFS file-system description object
1548 * @lnum: LEB number where node was written
1549 * @offs: offset where node was written
1550 *
1551 * This function returns %0 on success and a negative error code on failure.
1552 */
1553static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
1554{
1555 int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len;
1556 int ret;
1557 void *buf = c->dbg_buf;
1558
1559 dbg_lp("LEB %d", lnum);
1560 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
1561 if (err) {
1562 dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err);
1563 return err;
1564 }
1565 while (1) {
1566 if (!is_a_node(c, buf, len)) {
1567 int i, pad_len;
1568
1569 pad_len = get_pad_len(c, buf, len);
1570 if (pad_len) {
1571 buf += pad_len;
1572 len -= pad_len;
1573 dirty += pad_len;
1574 continue;
1575 }
1576 if (!dbg_is_all_ff(buf, len)) {
1577 dbg_msg("invalid empty space in LEB %d at %d",
1578 lnum, c->leb_size - len);
1579 err = -EINVAL;
1580 }
1581 i = lnum - c->lpt_first;
1582 if (len != c->ltab[i].free) {
1583 dbg_msg("invalid free space in LEB %d "
1584 "(free %d, expected %d)",
1585 lnum, len, c->ltab[i].free);
1586 err = -EINVAL;
1587 }
1588 if (dirty != c->ltab[i].dirty) {
1589 dbg_msg("invalid dirty space in LEB %d "
1590 "(dirty %d, expected %d)",
1591 lnum, dirty, c->ltab[i].dirty);
1592 err = -EINVAL;
1593 }
1594 return err;
1595 }
1596 node_type = get_lpt_node_type(c, buf, &node_num);
1597 node_len = get_lpt_node_len(c, node_type);
1598 ret = dbg_is_node_dirty(c, node_type, lnum, c->leb_size - len);
1599 if (ret == 1)
1600 dirty += node_len;
1601 buf += node_len;
1602 len -= node_len;
1603 }
1604}
1605
1606/**
1607 * dbg_check_ltab - check the free and dirty space in the ltab.
1608 * @c: the UBIFS file-system description object
1609 *
1610 * This function returns %0 on success and a negative error code on failure.
1611 */
1612int dbg_check_ltab(struct ubifs_info *c)
1613{
1614 int lnum, err, i, cnt;
1615
1616 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
1617 return 0;
1618
1619 /* Bring the entire tree into memory */
1620 cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT);
1621 for (i = 0; i < cnt; i++) {
1622 struct ubifs_pnode *pnode;
1623
1624 pnode = pnode_lookup(c, i);
1625 if (IS_ERR(pnode))
1626 return PTR_ERR(pnode);
1627 cond_resched();
1628 }
1629
1630 /* Check nodes */
1631 err = dbg_check_lpt_nodes(c, (struct ubifs_cnode *)c->nroot, 0, 0);
1632 if (err)
1633 return err;
1634
1635 /* Check each LEB */
1636 for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) {
1637 err = dbg_check_ltab_lnum(c, lnum);
1638 if (err) {
1639 dbg_err("failed at LEB %d", lnum);
1640 return err;
1641 }
1642 }
1643
1644 dbg_lp("succeeded");
1645 return 0;
1646}
1647
1648#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
new file mode 100644
index 000000000000..71d5493bf565
--- /dev/null
+++ b/fs/ubifs/master.c
@@ -0,0 +1,387 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/* This file implements reading and writing the master node */
24
25#include "ubifs.h"
26
27/**
28 * scan_for_master - search the valid master node.
29 * @c: UBIFS file-system description object
30 *
31 * This function scans the master node LEBs and search for the latest master
32 * node. Returns zero in case of success and a negative error code in case of
33 * failure.
34 */
35static int scan_for_master(struct ubifs_info *c)
36{
37 struct ubifs_scan_leb *sleb;
38 struct ubifs_scan_node *snod;
39 int lnum, offs = 0, nodes_cnt;
40
41 lnum = UBIFS_MST_LNUM;
42
43 sleb = ubifs_scan(c, lnum, 0, c->sbuf);
44 if (IS_ERR(sleb))
45 return PTR_ERR(sleb);
46 nodes_cnt = sleb->nodes_cnt;
47 if (nodes_cnt > 0) {
48 snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
49 list);
50 if (snod->type != UBIFS_MST_NODE)
51 goto out;
52 memcpy(c->mst_node, snod->node, snod->len);
53 offs = snod->offs;
54 }
55 ubifs_scan_destroy(sleb);
56
57 lnum += 1;
58
59 sleb = ubifs_scan(c, lnum, 0, c->sbuf);
60 if (IS_ERR(sleb))
61 return PTR_ERR(sleb);
62 if (sleb->nodes_cnt != nodes_cnt)
63 goto out;
64 if (!sleb->nodes_cnt)
65 goto out;
66 snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list);
67 if (snod->type != UBIFS_MST_NODE)
68 goto out;
69 if (snod->offs != offs)
70 goto out;
71 if (memcmp((void *)c->mst_node + UBIFS_CH_SZ,
72 (void *)snod->node + UBIFS_CH_SZ,
73 UBIFS_MST_NODE_SZ - UBIFS_CH_SZ))
74 goto out;
75 c->mst_offs = offs;
76 ubifs_scan_destroy(sleb);
77 return 0;
78
79out:
80 ubifs_scan_destroy(sleb);
81 return -EINVAL;
82}
83
84/**
85 * validate_master - validate master node.
86 * @c: UBIFS file-system description object
87 *
88 * This function validates data which was read from master node. Returns zero
89 * if the data is all right and %-EINVAL if not.
90 */
91static int validate_master(const struct ubifs_info *c)
92{
93 long long main_sz;
94 int err;
95
96 if (c->max_sqnum >= SQNUM_WATERMARK) {
97 err = 1;
98 goto out;
99 }
100
101 if (c->cmt_no >= c->max_sqnum) {
102 err = 2;
103 goto out;
104 }
105
106 if (c->highest_inum >= INUM_WATERMARK) {
107 err = 3;
108 goto out;
109 }
110
111 if (c->lhead_lnum < UBIFS_LOG_LNUM ||
112 c->lhead_lnum >= UBIFS_LOG_LNUM + c->log_lebs ||
113 c->lhead_offs < 0 || c->lhead_offs >= c->leb_size ||
114 c->lhead_offs & (c->min_io_size - 1)) {
115 err = 4;
116 goto out;
117 }
118
119 if (c->zroot.lnum >= c->leb_cnt || c->zroot.lnum < c->main_first ||
120 c->zroot.offs >= c->leb_size || c->zroot.offs & 7) {
121 err = 5;
122 goto out;
123 }
124
125 if (c->zroot.len < c->ranges[UBIFS_IDX_NODE].min_len ||
126 c->zroot.len > c->ranges[UBIFS_IDX_NODE].max_len) {
127 err = 6;
128 goto out;
129 }
130
131 if (c->gc_lnum >= c->leb_cnt || c->gc_lnum < c->main_first) {
132 err = 7;
133 goto out;
134 }
135
136 if (c->ihead_lnum >= c->leb_cnt || c->ihead_lnum < c->main_first ||
137 c->ihead_offs % c->min_io_size || c->ihead_offs < 0 ||
138 c->ihead_offs > c->leb_size || c->ihead_offs & 7) {
139 err = 8;
140 goto out;
141 }
142
143 main_sz = (long long)c->main_lebs * c->leb_size;
144 if (c->old_idx_sz & 7 || c->old_idx_sz >= main_sz) {
145 err = 9;
146 goto out;
147 }
148
149 if (c->lpt_lnum < c->lpt_first || c->lpt_lnum > c->lpt_last ||
150 c->lpt_offs < 0 || c->lpt_offs + c->nnode_sz > c->leb_size) {
151 err = 10;
152 goto out;
153 }
154
155 if (c->nhead_lnum < c->lpt_first || c->nhead_lnum > c->lpt_last ||
156 c->nhead_offs < 0 || c->nhead_offs % c->min_io_size ||
157 c->nhead_offs > c->leb_size) {
158 err = 11;
159 goto out;
160 }
161
162 if (c->ltab_lnum < c->lpt_first || c->ltab_lnum > c->lpt_last ||
163 c->ltab_offs < 0 ||
164 c->ltab_offs + c->ltab_sz > c->leb_size) {
165 err = 12;
166 goto out;
167 }
168
169 if (c->big_lpt && (c->lsave_lnum < c->lpt_first ||
170 c->lsave_lnum > c->lpt_last || c->lsave_offs < 0 ||
171 c->lsave_offs + c->lsave_sz > c->leb_size)) {
172 err = 13;
173 goto out;
174 }
175
176 if (c->lscan_lnum < c->main_first || c->lscan_lnum >= c->leb_cnt) {
177 err = 14;
178 goto out;
179 }
180
181 if (c->lst.empty_lebs < 0 || c->lst.empty_lebs > c->main_lebs - 2) {
182 err = 15;
183 goto out;
184 }
185
186 if (c->lst.idx_lebs < 0 || c->lst.idx_lebs > c->main_lebs - 1) {
187 err = 16;
188 goto out;
189 }
190
191 if (c->lst.total_free < 0 || c->lst.total_free > main_sz ||
192 c->lst.total_free & 7) {
193 err = 17;
194 goto out;
195 }
196
197 if (c->lst.total_dirty < 0 || (c->lst.total_dirty & 7)) {
198 err = 18;
199 goto out;
200 }
201
202 if (c->lst.total_used < 0 || (c->lst.total_used & 7)) {
203 err = 19;
204 goto out;
205 }
206
207 if (c->lst.total_free + c->lst.total_dirty +
208 c->lst.total_used > main_sz) {
209 err = 20;
210 goto out;
211 }
212
213 if (c->lst.total_dead + c->lst.total_dark +
214 c->lst.total_used + c->old_idx_sz > main_sz) {
215 err = 21;
216 goto out;
217 }
218
219 if (c->lst.total_dead < 0 ||
220 c->lst.total_dead > c->lst.total_free + c->lst.total_dirty ||
221 c->lst.total_dead & 7) {
222 err = 22;
223 goto out;
224 }
225
226 if (c->lst.total_dark < 0 ||
227 c->lst.total_dark > c->lst.total_free + c->lst.total_dirty ||
228 c->lst.total_dark & 7) {
229 err = 23;
230 goto out;
231 }
232
233 return 0;
234
235out:
236 ubifs_err("bad master node at offset %d error %d", c->mst_offs, err);
237 dbg_dump_node(c, c->mst_node);
238 return -EINVAL;
239}
240
241/**
242 * ubifs_read_master - read master node.
243 * @c: UBIFS file-system description object
244 *
245 * This function finds and reads the master node during file-system mount. If
246 * the flash is empty, it creates default master node as well. Returns zero in
247 * case of success and a negative error code in case of failure.
248 */
249int ubifs_read_master(struct ubifs_info *c)
250{
251 int err, old_leb_cnt;
252
253 c->mst_node = kzalloc(c->mst_node_alsz, GFP_KERNEL);
254 if (!c->mst_node)
255 return -ENOMEM;
256
257 err = scan_for_master(c);
258 if (err) {
259 err = ubifs_recover_master_node(c);
260 if (err)
261 /*
262 * Note, we do not free 'c->mst_node' here because the
263 * unmount routine will take care of this.
264 */
265 return err;
266 }
267
268 /* Make sure that the recovery flag is clear */
269 c->mst_node->flags &= cpu_to_le32(~UBIFS_MST_RCVRY);
270
271 c->max_sqnum = le64_to_cpu(c->mst_node->ch.sqnum);
272 c->highest_inum = le64_to_cpu(c->mst_node->highest_inum);
273 c->cmt_no = le64_to_cpu(c->mst_node->cmt_no);
274 c->zroot.lnum = le32_to_cpu(c->mst_node->root_lnum);
275 c->zroot.offs = le32_to_cpu(c->mst_node->root_offs);
276 c->zroot.len = le32_to_cpu(c->mst_node->root_len);
277 c->lhead_lnum = le32_to_cpu(c->mst_node->log_lnum);
278 c->gc_lnum = le32_to_cpu(c->mst_node->gc_lnum);
279 c->ihead_lnum = le32_to_cpu(c->mst_node->ihead_lnum);
280 c->ihead_offs = le32_to_cpu(c->mst_node->ihead_offs);
281 c->old_idx_sz = le64_to_cpu(c->mst_node->index_size);
282 c->lpt_lnum = le32_to_cpu(c->mst_node->lpt_lnum);
283 c->lpt_offs = le32_to_cpu(c->mst_node->lpt_offs);
284 c->nhead_lnum = le32_to_cpu(c->mst_node->nhead_lnum);
285 c->nhead_offs = le32_to_cpu(c->mst_node->nhead_offs);
286 c->ltab_lnum = le32_to_cpu(c->mst_node->ltab_lnum);
287 c->ltab_offs = le32_to_cpu(c->mst_node->ltab_offs);
288 c->lsave_lnum = le32_to_cpu(c->mst_node->lsave_lnum);
289 c->lsave_offs = le32_to_cpu(c->mst_node->lsave_offs);
290 c->lscan_lnum = le32_to_cpu(c->mst_node->lscan_lnum);
291 c->lst.empty_lebs = le32_to_cpu(c->mst_node->empty_lebs);
292 c->lst.idx_lebs = le32_to_cpu(c->mst_node->idx_lebs);
293 old_leb_cnt = le32_to_cpu(c->mst_node->leb_cnt);
294 c->lst.total_free = le64_to_cpu(c->mst_node->total_free);
295 c->lst.total_dirty = le64_to_cpu(c->mst_node->total_dirty);
296 c->lst.total_used = le64_to_cpu(c->mst_node->total_used);
297 c->lst.total_dead = le64_to_cpu(c->mst_node->total_dead);
298 c->lst.total_dark = le64_to_cpu(c->mst_node->total_dark);
299
300 c->calc_idx_sz = c->old_idx_sz;
301
302 if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS))
303 c->no_orphs = 1;
304
305 if (old_leb_cnt != c->leb_cnt) {
306 /* The file system has been resized */
307 int growth = c->leb_cnt - old_leb_cnt;
308
309 if (c->leb_cnt < old_leb_cnt ||
310 c->leb_cnt < UBIFS_MIN_LEB_CNT) {
311 ubifs_err("bad leb_cnt on master node");
312 dbg_dump_node(c, c->mst_node);
313 return -EINVAL;
314 }
315
316 dbg_mnt("Auto resizing (master) from %d LEBs to %d LEBs",
317 old_leb_cnt, c->leb_cnt);
318 c->lst.empty_lebs += growth;
319 c->lst.total_free += growth * (long long)c->leb_size;
320 c->lst.total_dark += growth * (long long)c->dark_wm;
321
322 /*
323 * Reflect changes back onto the master node. N.B. the master
324 * node gets written immediately whenever mounting (or
325 * remounting) in read-write mode, so we do not need to write it
326 * here.
327 */
328 c->mst_node->leb_cnt = cpu_to_le32(c->leb_cnt);
329 c->mst_node->empty_lebs = cpu_to_le32(c->lst.empty_lebs);
330 c->mst_node->total_free = cpu_to_le64(c->lst.total_free);
331 c->mst_node->total_dark = cpu_to_le64(c->lst.total_dark);
332 }
333
334 err = validate_master(c);
335 if (err)
336 return err;
337
338 err = dbg_old_index_check_init(c, &c->zroot);
339
340 return err;
341}
342
343/**
344 * ubifs_write_master - write master node.
345 * @c: UBIFS file-system description object
346 *
347 * This function writes the master node. The caller has to take the
348 * @c->mst_mutex lock before calling this function. Returns zero in case of
349 * success and a negative error code in case of failure. The master node is
350 * written twice to enable recovery.
351 */
352int ubifs_write_master(struct ubifs_info *c)
353{
354 int err, lnum, offs, len;
355
356 if (c->ro_media)
357 return -EINVAL;
358
359 lnum = UBIFS_MST_LNUM;
360 offs = c->mst_offs + c->mst_node_alsz;
361 len = UBIFS_MST_NODE_SZ;
362
363 if (offs + UBIFS_MST_NODE_SZ > c->leb_size) {
364 err = ubifs_leb_unmap(c, lnum);
365 if (err)
366 return err;
367 offs = 0;
368 }
369
370 c->mst_offs = offs;
371 c->mst_node->highest_inum = cpu_to_le64(c->highest_inum);
372
373 err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM);
374 if (err)
375 return err;
376
377 lnum += 1;
378
379 if (offs == 0) {
380 err = ubifs_leb_unmap(c, lnum);
381 if (err)
382 return err;
383 }
384 err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM);
385
386 return err;
387}
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
new file mode 100644
index 000000000000..4beccfc256d2
--- /dev/null
+++ b/fs/ubifs/misc.h
@@ -0,0 +1,342 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/*
24 * This file contains miscellaneous helper functions.
25 */
26
27#ifndef __UBIFS_MISC_H__
28#define __UBIFS_MISC_H__
29
30/**
31 * ubifs_zn_dirty - check if znode is dirty.
32 * @znode: znode to check
33 *
34 * This helper function returns %1 if @znode is dirty and %0 otherwise.
35 */
36static inline int ubifs_zn_dirty(const struct ubifs_znode *znode)
37{
38 return !!test_bit(DIRTY_ZNODE, &znode->flags);
39}
40
41/**
42 * ubifs_wake_up_bgt - wake up background thread.
43 * @c: UBIFS file-system description object
44 */
45static inline void ubifs_wake_up_bgt(struct ubifs_info *c)
46{
47 if (c->bgt && !c->need_bgt) {
48 c->need_bgt = 1;
49 wake_up_process(c->bgt);
50 }
51}
52
53/**
54 * ubifs_tnc_find_child - find next child in znode.
55 * @znode: znode to search at
56 * @start: the zbranch index to start at
57 *
58 * This helper function looks for znode child starting at index @start. Returns
59 * the child or %NULL if no children were found.
60 */
61static inline struct ubifs_znode *
62ubifs_tnc_find_child(struct ubifs_znode *znode, int start)
63{
64 while (start < znode->child_cnt) {
65 if (znode->zbranch[start].znode)
66 return znode->zbranch[start].znode;
67 start += 1;
68 }
69
70 return NULL;
71}
72
73/**
74 * ubifs_inode - get UBIFS inode information by VFS 'struct inode' object.
75 * @inode: the VFS 'struct inode' pointer
76 */
77static inline struct ubifs_inode *ubifs_inode(const struct inode *inode)
78{
79 return container_of(inode, struct ubifs_inode, vfs_inode);
80}
81
82/**
83 * ubifs_ro_mode - switch UBIFS to read read-only mode.
84 * @c: UBIFS file-system description object
85 * @err: error code which is the reason of switching to R/O mode
86 */
87static inline void ubifs_ro_mode(struct ubifs_info *c, int err)
88{
89 if (!c->ro_media) {
90 c->ro_media = 1;
91 ubifs_warn("switched to read-only mode, error %d", err);
92 dbg_dump_stack();
93 }
94}
95
96/**
97 * ubifs_compr_present - check if compressor was compiled in.
98 * @compr_type: compressor type to check
99 *
100 * This function returns %1 of compressor of type @compr_type is present, and
101 * %0 if not.
102 */
103static inline int ubifs_compr_present(int compr_type)
104{
105 ubifs_assert(compr_type >= 0 && compr_type < UBIFS_COMPR_TYPES_CNT);
106 return !!ubifs_compressors[compr_type]->capi_name;
107}
108
109/**
110 * ubifs_compr_name - get compressor name string by its type.
111 * @compr_type: compressor type
112 *
113 * This function returns compressor type string.
114 */
115static inline const char *ubifs_compr_name(int compr_type)
116{
117 ubifs_assert(compr_type >= 0 && compr_type < UBIFS_COMPR_TYPES_CNT);
118 return ubifs_compressors[compr_type]->name;
119}
120
121/**
122 * ubifs_wbuf_sync - synchronize write-buffer.
123 * @wbuf: write-buffer to synchronize
124 *
125 * This is the same as as 'ubifs_wbuf_sync_nolock()' but it does not assume
126 * that the write-buffer is already locked.
127 */
128static inline int ubifs_wbuf_sync(struct ubifs_wbuf *wbuf)
129{
130 int err;
131
132 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
133 err = ubifs_wbuf_sync_nolock(wbuf);
134 mutex_unlock(&wbuf->io_mutex);
135 return err;
136}
137
138/**
139 * ubifs_leb_unmap - unmap an LEB.
140 * @c: UBIFS file-system description object
141 * @lnum: LEB number to unmap
142 *
143 * This function returns %0 on success and a negative error code on failure.
144 */
145static inline int ubifs_leb_unmap(const struct ubifs_info *c, int lnum)
146{
147 int err;
148
149 if (c->ro_media)
150 return -EROFS;
151 err = ubi_leb_unmap(c->ubi, lnum);
152 if (err) {
153 ubifs_err("unmap LEB %d failed, error %d", lnum, err);
154 return err;
155 }
156
157 return 0;
158}
159
160/**
161 * ubifs_leb_write - write to a LEB.
162 * @c: UBIFS file-system description object
163 * @lnum: LEB number to write
164 * @buf: buffer to write from
165 * @offs: offset within LEB to write to
166 * @len: length to write
167 * @dtype: data type
168 *
169 * This function returns %0 on success and a negative error code on failure.
170 */
171static inline int ubifs_leb_write(const struct ubifs_info *c, int lnum,
172 const void *buf, int offs, int len, int dtype)
173{
174 int err;
175
176 if (c->ro_media)
177 return -EROFS;
178 err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype);
179 if (err) {
180 ubifs_err("writing %d bytes at %d:%d, error %d",
181 len, lnum, offs, err);
182 return err;
183 }
184
185 return 0;
186}
187
188/**
189 * ubifs_leb_change - atomic LEB change.
190 * @c: UBIFS file-system description object
191 * @lnum: LEB number to write
192 * @buf: buffer to write from
193 * @len: length to write
194 * @dtype: data type
195 *
196 * This function returns %0 on success and a negative error code on failure.
197 */
198static inline int ubifs_leb_change(const struct ubifs_info *c, int lnum,
199 const void *buf, int len, int dtype)
200{
201 int err;
202
203 if (c->ro_media)
204 return -EROFS;
205 err = ubi_leb_change(c->ubi, lnum, buf, len, dtype);
206 if (err) {
207 ubifs_err("changing %d bytes in LEB %d, error %d",
208 len, lnum, err);
209 return err;
210 }
211
212 return 0;
213}
214
215/**
216 * ubifs_encode_dev - encode device node IDs.
217 * @dev: UBIFS device node information
218 * @rdev: device IDs to encode
219 *
220 * This is a helper function which encodes major/minor numbers of a device node
221 * into UBIFS device node description. We use standard Linux "new" and "huge"
222 * encodings.
223 */
224static inline int ubifs_encode_dev(union ubifs_dev_desc *dev, dev_t rdev)
225{
226 if (new_valid_dev(rdev)) {
227 dev->new = cpu_to_le32(new_encode_dev(rdev));
228 return sizeof(dev->new);
229 } else {
230 dev->huge = cpu_to_le64(huge_encode_dev(rdev));
231 return sizeof(dev->huge);
232 }
233}
234
235/**
236 * ubifs_add_dirt - add dirty space to LEB properties.
237 * @c: the UBIFS file-system description object
238 * @lnum: LEB to add dirty space for
239 * @dirty: dirty space to add
240 *
241 * This is a helper function which increased amount of dirty LEB space. Returns
242 * zero in case of success and a negative error code in case of failure.
243 */
244static inline int ubifs_add_dirt(struct ubifs_info *c, int lnum, int dirty)
245{
246 return ubifs_update_one_lp(c, lnum, LPROPS_NC, dirty, 0, 0);
247}
248
249/**
250 * ubifs_return_leb - return LEB to lprops.
251 * @c: the UBIFS file-system description object
252 * @lnum: LEB to return
253 *
254 * This helper function cleans the "taken" flag of a logical eraseblock in the
255 * lprops. Returns zero in case of success and a negative error code in case of
256 * failure.
257 */
258static inline int ubifs_return_leb(struct ubifs_info *c, int lnum)
259{
260 return ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
261 LPROPS_TAKEN, 0);
262}
263
264/**
265 * ubifs_idx_node_sz - return index node size.
266 * @c: the UBIFS file-system description object
267 * @child_cnt: number of children of this index node
268 */
269static inline int ubifs_idx_node_sz(const struct ubifs_info *c, int child_cnt)
270{
271 return UBIFS_IDX_NODE_SZ + (UBIFS_BRANCH_SZ + c->key_len) * child_cnt;
272}
273
274/**
275 * ubifs_idx_branch - return pointer to an index branch.
276 * @c: the UBIFS file-system description object
277 * @idx: index node
278 * @bnum: branch number
279 */
280static inline
281struct ubifs_branch *ubifs_idx_branch(const struct ubifs_info *c,
282 const struct ubifs_idx_node *idx,
283 int bnum)
284{
285 return (struct ubifs_branch *)((void *)idx->branches +
286 (UBIFS_BRANCH_SZ + c->key_len) * bnum);
287}
288
289/**
290 * ubifs_idx_key - return pointer to an index key.
291 * @c: the UBIFS file-system description object
292 * @idx: index node
293 */
294static inline void *ubifs_idx_key(const struct ubifs_info *c,
295 const struct ubifs_idx_node *idx)
296{
297 return (void *)((struct ubifs_branch *)idx->branches)->key;
298}
299
300/**
301 * ubifs_reported_space - calculate reported free space.
302 * @c: the UBIFS file-system description object
303 * @free: amount of free space
304 *
305 * This function calculates amount of free space which will be reported to
306 * user-space. User-space application tend to expect that if the file-system
307 * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
308 * are able to write a file of size N. UBIFS attaches node headers to each data
309 * node and it has to write indexind nodes as well. This introduces additional
310 * overhead, and UBIFS it has to report sligtly less free space to meet the
311 * above expectetion.
312 *
313 * This function assumes free space is made up of uncompressed data nodes and
314 * full index nodes (one per data node, doubled because we always allow enough
315 * space to write the index twice).
316 *
317 * Note, the calculation is pessimistic, which means that most of the time
318 * UBIFS reports less space than it actually has.
319 */
320static inline long long ubifs_reported_space(const struct ubifs_info *c,
321 uint64_t free)
322{
323 int divisor, factor;
324
325 divisor = UBIFS_MAX_DATA_NODE_SZ + (c->max_idx_node_sz << 1);
326 factor = UBIFS_MAX_DATA_NODE_SZ - UBIFS_DATA_NODE_SZ;
327 do_div(free, divisor);
328
329 return free * factor;
330}
331
332/**
333 * ubifs_current_time - round current time to time granularity.
334 * @inode: inode
335 */
336static inline struct timespec ubifs_current_time(struct inode *inode)
337{
338 return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ?
339 current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
340}
341
342#endif /* __UBIFS_MISC_H__ */
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
new file mode 100644
index 000000000000..3afeb9242c6a
--- /dev/null
+++ b/fs/ubifs/orphan.c
@@ -0,0 +1,958 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Author: Adrian Hunter
20 */
21
22#include "ubifs.h"
23
24/*
25 * An orphan is an inode number whose inode node has been committed to the index
26 * with a link count of zero. That happens when an open file is deleted
27 * (unlinked) and then a commit is run. In the normal course of events the inode
28 * would be deleted when the file is closed. However in the case of an unclean
29 * unmount, orphans need to be accounted for. After an unclean unmount, the
30 * orphans' inodes must be deleted which means either scanning the entire index
31 * looking for them, or keeping a list on flash somewhere. This unit implements
32 * the latter approach.
33 *
34 * The orphan area is a fixed number of LEBs situated between the LPT area and
35 * the main area. The number of orphan area LEBs is specified when the file
36 * system is created. The minimum number is 1. The size of the orphan area
37 * should be so that it can hold the maximum number of orphans that are expected
38 * to ever exist at one time.
39 *
40 * The number of orphans that can fit in a LEB is:
41 *
42 * (c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64)
43 *
44 * For example: a 15872 byte LEB can fit 1980 orphans so 1 LEB may be enough.
45 *
46 * Orphans are accumulated in a rb-tree. When an inode's link count drops to
47 * zero, the inode number is added to the rb-tree. It is removed from the tree
48 * when the inode is deleted. Any new orphans that are in the orphan tree when
49 * the commit is run, are written to the orphan area in 1 or more orph nodes.
50 * If the orphan area is full, it is consolidated to make space. There is
51 * always enough space because validation prevents the user from creating more
52 * than the maximum number of orphans allowed.
53 */
54
/*
 * 'dbg_check_orphans()' is a real function only when UBIFS debugging is
 * compiled in; otherwise it is a macro which evaluates to %0.
 */
#ifndef CONFIG_UBIFS_FS_DEBUG
#define dbg_check_orphans(c) 0
#else
static int dbg_check_orphans(struct ubifs_info *c);
#endif
60
61/**
62 * ubifs_add_orphan - add an orphan.
63 * @c: UBIFS file-system description object
64 * @inum: orphan inode number
65 *
66 * Add an orphan. This function is called when an inodes link count drops to
67 * zero.
68 */
69int ubifs_add_orphan(struct ubifs_info *c, ino_t inum)
70{
71 struct ubifs_orphan *orphan, *o;
72 struct rb_node **p, *parent = NULL;
73
74 orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_NOFS);
75 if (!orphan)
76 return -ENOMEM;
77 orphan->inum = inum;
78 orphan->new = 1;
79
80 spin_lock(&c->orphan_lock);
81 if (c->tot_orphans >= c->max_orphans) {
82 spin_unlock(&c->orphan_lock);
83 kfree(orphan);
84 return -ENFILE;
85 }
86 p = &c->orph_tree.rb_node;
87 while (*p) {
88 parent = *p;
89 o = rb_entry(parent, struct ubifs_orphan, rb);
90 if (inum < o->inum)
91 p = &(*p)->rb_left;
92 else if (inum > o->inum)
93 p = &(*p)->rb_right;
94 else {
95 dbg_err("orphaned twice");
96 spin_unlock(&c->orphan_lock);
97 kfree(orphan);
98 return 0;
99 }
100 }
101 c->tot_orphans += 1;
102 c->new_orphans += 1;
103 rb_link_node(&orphan->rb, parent, p);
104 rb_insert_color(&orphan->rb, &c->orph_tree);
105 list_add_tail(&orphan->list, &c->orph_list);
106 list_add_tail(&orphan->new_list, &c->orph_new);
107 spin_unlock(&c->orphan_lock);
108 dbg_gen("ino %lu", inum);
109 return 0;
110}
111
112/**
113 * ubifs_delete_orphan - delete an orphan.
114 * @c: UBIFS file-system description object
115 * @inum: orphan inode number
116 *
117 * Delete an orphan. This function is called when an inode is deleted.
118 */
119void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum)
120{
121 struct ubifs_orphan *o;
122 struct rb_node *p;
123
124 spin_lock(&c->orphan_lock);
125 p = c->orph_tree.rb_node;
126 while (p) {
127 o = rb_entry(p, struct ubifs_orphan, rb);
128 if (inum < o->inum)
129 p = p->rb_left;
130 else if (inum > o->inum)
131 p = p->rb_right;
132 else {
133 if (o->dnext) {
134 spin_unlock(&c->orphan_lock);
135 dbg_gen("deleted twice ino %lu", inum);
136 return;
137 }
138 if (o->cnext) {
139 o->dnext = c->orph_dnext;
140 c->orph_dnext = o;
141 spin_unlock(&c->orphan_lock);
142 dbg_gen("delete later ino %lu", inum);
143 return;
144 }
145 rb_erase(p, &c->orph_tree);
146 list_del(&o->list);
147 c->tot_orphans -= 1;
148 if (o->new) {
149 list_del(&o->new_list);
150 c->new_orphans -= 1;
151 }
152 spin_unlock(&c->orphan_lock);
153 kfree(o);
154 dbg_gen("inum %lu", inum);
155 return;
156 }
157 }
158 spin_unlock(&c->orphan_lock);
159 dbg_err("missing orphan ino %lu", inum);
160 dbg_dump_stack();
161}
162
163/**
164 * ubifs_orphan_start_commit - start commit of orphans.
165 * @c: UBIFS file-system description object
166 *
167 * Start commit of orphans.
168 */
169int ubifs_orphan_start_commit(struct ubifs_info *c)
170{
171 struct ubifs_orphan *orphan, **last;
172
173 spin_lock(&c->orphan_lock);
174 last = &c->orph_cnext;
175 list_for_each_entry(orphan, &c->orph_new, new_list) {
176 ubifs_assert(orphan->new);
177 orphan->new = 0;
178 *last = orphan;
179 last = &orphan->cnext;
180 }
181 *last = orphan->cnext;
182 c->cmt_orphans = c->new_orphans;
183 c->new_orphans = 0;
184 dbg_cmt("%d orphans to commit", c->cmt_orphans);
185 INIT_LIST_HEAD(&c->orph_new);
186 if (c->tot_orphans == 0)
187 c->no_orphs = 1;
188 else
189 c->no_orphs = 0;
190 spin_unlock(&c->orphan_lock);
191 return 0;
192}
193
194/**
195 * avail_orphs - calculate available space.
196 * @c: UBIFS file-system description object
197 *
198 * This function returns the number of orphans that can be written in the
199 * available space.
200 */
201static int avail_orphs(struct ubifs_info *c)
202{
203 int avail_lebs, avail, gap;
204
205 avail_lebs = c->orph_lebs - (c->ohead_lnum - c->orph_first) - 1;
206 avail = avail_lebs *
207 ((c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64));
208 gap = c->leb_size - c->ohead_offs;
209 if (gap >= UBIFS_ORPH_NODE_SZ + sizeof(__le64))
210 avail += (gap - UBIFS_ORPH_NODE_SZ) / sizeof(__le64);
211 return avail;
212}
213
214/**
215 * tot_avail_orphs - calculate total space.
216 * @c: UBIFS file-system description object
217 *
218 * This function returns the number of orphans that can be written in half
219 * the total space. That leaves half the space for adding new orphans.
220 */
221static int tot_avail_orphs(struct ubifs_info *c)
222{
223 int avail_lebs, avail;
224
225 avail_lebs = c->orph_lebs;
226 avail = avail_lebs *
227 ((c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64));
228 return avail / 2;
229}
230
231/**
232 * do_write_orph_node - write a node
233 * @c: UBIFS file-system description object
234 * @len: length of node
235 * @atomic: write atomically
236 *
237 * This function writes a node to the orphan head from the orphan buffer. If
238 * %atomic is not zero, then the write is done atomically. On success, %0 is
239 * returned, otherwise a negative error code is returned.
240 */
241static int do_write_orph_node(struct ubifs_info *c, int len, int atomic)
242{
243 int err = 0;
244
245 if (atomic) {
246 ubifs_assert(c->ohead_offs == 0);
247 ubifs_prepare_node(c, c->orph_buf, len, 1);
248 len = ALIGN(len, c->min_io_size);
249 err = ubifs_leb_change(c, c->ohead_lnum, c->orph_buf, len,
250 UBI_SHORTTERM);
251 } else {
252 if (c->ohead_offs == 0) {
253 /* Ensure LEB has been unmapped */
254 err = ubifs_leb_unmap(c, c->ohead_lnum);
255 if (err)
256 return err;
257 }
258 err = ubifs_write_node(c, c->orph_buf, len, c->ohead_lnum,
259 c->ohead_offs, UBI_SHORTTERM);
260 }
261 return err;
262}
263
264/**
265 * write_orph_node - write an orph node
266 * @c: UBIFS file-system description object
267 * @atomic: write atomically
268 *
269 * This function builds an orph node from the cnext list and writes it to the
270 * orphan head. On success, %0 is returned, otherwise a negative error code
271 * is returned.
272 */
273static int write_orph_node(struct ubifs_info *c, int atomic)
274{
275 struct ubifs_orphan *orphan, *cnext;
276 struct ubifs_orph_node *orph;
277 int gap, err, len, cnt, i;
278
279 ubifs_assert(c->cmt_orphans > 0);
280 gap = c->leb_size - c->ohead_offs;
281 if (gap < UBIFS_ORPH_NODE_SZ + sizeof(__le64)) {
282 c->ohead_lnum += 1;
283 c->ohead_offs = 0;
284 gap = c->leb_size;
285 if (c->ohead_lnum > c->orph_last) {
286 /*
287 * We limit the number of orphans so that this should
288 * never happen.
289 */
290 ubifs_err("out of space in orphan area");
291 return -EINVAL;
292 }
293 }
294 cnt = (gap - UBIFS_ORPH_NODE_SZ) / sizeof(__le64);
295 if (cnt > c->cmt_orphans)
296 cnt = c->cmt_orphans;
297 len = UBIFS_ORPH_NODE_SZ + cnt * sizeof(__le64);
298 ubifs_assert(c->orph_buf);
299 orph = c->orph_buf;
300 orph->ch.node_type = UBIFS_ORPH_NODE;
301 spin_lock(&c->orphan_lock);
302 cnext = c->orph_cnext;
303 for (i = 0; i < cnt; i++) {
304 orphan = cnext;
305 orph->inos[i] = cpu_to_le64(orphan->inum);
306 cnext = orphan->cnext;
307 orphan->cnext = NULL;
308 }
309 c->orph_cnext = cnext;
310 c->cmt_orphans -= cnt;
311 spin_unlock(&c->orphan_lock);
312 if (c->cmt_orphans)
313 orph->cmt_no = cpu_to_le64(c->cmt_no + 1);
314 else
315 /* Mark the last node of the commit */
316 orph->cmt_no = cpu_to_le64((c->cmt_no + 1) | (1ULL << 63));
317 ubifs_assert(c->ohead_offs + len <= c->leb_size);
318 ubifs_assert(c->ohead_lnum >= c->orph_first);
319 ubifs_assert(c->ohead_lnum <= c->orph_last);
320 err = do_write_orph_node(c, len, atomic);
321 c->ohead_offs += ALIGN(len, c->min_io_size);
322 c->ohead_offs = ALIGN(c->ohead_offs, 8);
323 return err;
324}
325
326/**
327 * write_orph_nodes - write orph nodes until there are no more to commit
328 * @c: UBIFS file-system description object
329 * @atomic: write atomically
330 *
331 * This function writes orph nodes for all the orphans to commit. On success,
332 * %0 is returned, otherwise a negative error code is returned.
333 */
334static int write_orph_nodes(struct ubifs_info *c, int atomic)
335{
336 int err;
337
338 while (c->cmt_orphans > 0) {
339 err = write_orph_node(c, atomic);
340 if (err)
341 return err;
342 }
343 if (atomic) {
344 int lnum;
345
346 /* Unmap any unused LEBs after consolidation */
347 lnum = c->ohead_lnum + 1;
348 for (lnum = c->ohead_lnum + 1; lnum <= c->orph_last; lnum++) {
349 err = ubifs_leb_unmap(c, lnum);
350 if (err)
351 return err;
352 }
353 }
354 return 0;
355}
356
357/**
358 * consolidate - consolidate the orphan area.
359 * @c: UBIFS file-system description object
360 *
361 * This function enables consolidation by putting all the orphans into the list
362 * to commit. The list is in the order that the orphans were added, and the
363 * LEBs are written atomically in order, so at no time can orphans be lost by
364 * an unclean unmount.
365 *
366 * This function returns %0 on success and a negative error code on failure.
367 */
368static int consolidate(struct ubifs_info *c)
369{
370 int tot_avail = tot_avail_orphs(c), err = 0;
371
372 spin_lock(&c->orphan_lock);
373 dbg_cmt("there is space for %d orphans and there are %d",
374 tot_avail, c->tot_orphans);
375 if (c->tot_orphans - c->new_orphans <= tot_avail) {
376 struct ubifs_orphan *orphan, **last;
377 int cnt = 0;
378
379 /* Change the cnext list to include all non-new orphans */
380 last = &c->orph_cnext;
381 list_for_each_entry(orphan, &c->orph_list, list) {
382 if (orphan->new)
383 continue;
384 *last = orphan;
385 last = &orphan->cnext;
386 cnt += 1;
387 }
388 *last = orphan->cnext;
389 ubifs_assert(cnt == c->tot_orphans - c->new_orphans);
390 c->cmt_orphans = cnt;
391 c->ohead_lnum = c->orph_first;
392 c->ohead_offs = 0;
393 } else {
394 /*
395 * We limit the number of orphans so that this should
396 * never happen.
397 */
398 ubifs_err("out of space in orphan area");
399 err = -EINVAL;
400 }
401 spin_unlock(&c->orphan_lock);
402 return err;
403}
404
405/**
406 * commit_orphans - commit orphans.
407 * @c: UBIFS file-system description object
408 *
409 * This function commits orphans to flash. On success, %0 is returned,
410 * otherwise a negative error code is returned.
411 */
412static int commit_orphans(struct ubifs_info *c)
413{
414 int avail, atomic = 0, err;
415
416 ubifs_assert(c->cmt_orphans > 0);
417 avail = avail_orphs(c);
418 if (avail < c->cmt_orphans) {
419 /* Not enough space to write new orphans, so consolidate */
420 err = consolidate(c);
421 if (err)
422 return err;
423 atomic = 1;
424 }
425 err = write_orph_nodes(c, atomic);
426 return err;
427}
428
429/**
430 * erase_deleted - erase the orphans marked for deletion.
431 * @c: UBIFS file-system description object
432 *
433 * During commit, the orphans being committed cannot be deleted, so they are
434 * marked for deletion and deleted by this function. Also, the recovery
435 * adds killed orphans to the deletion list, and therefore they are deleted
436 * here too.
437 */
438static void erase_deleted(struct ubifs_info *c)
439{
440 struct ubifs_orphan *orphan, *dnext;
441
442 spin_lock(&c->orphan_lock);
443 dnext = c->orph_dnext;
444 while (dnext) {
445 orphan = dnext;
446 dnext = orphan->dnext;
447 ubifs_assert(!orphan->new);
448 rb_erase(&orphan->rb, &c->orph_tree);
449 list_del(&orphan->list);
450 c->tot_orphans -= 1;
451 dbg_gen("deleting orphan ino %lu", orphan->inum);
452 kfree(orphan);
453 }
454 c->orph_dnext = NULL;
455 spin_unlock(&c->orphan_lock);
456}
457
458/**
459 * ubifs_orphan_end_commit - end commit of orphans.
460 * @c: UBIFS file-system description object
461 *
462 * End commit of orphans.
463 */
464int ubifs_orphan_end_commit(struct ubifs_info *c)
465{
466 int err;
467
468 if (c->cmt_orphans != 0) {
469 err = commit_orphans(c);
470 if (err)
471 return err;
472 }
473 erase_deleted(c);
474 err = dbg_check_orphans(c);
475 return err;
476}
477
478/**
479 * clear_orphans - erase all LEBs used for orphans.
480 * @c: UBIFS file-system description object
481 *
482 * If recovery is not required, then the orphans from the previous session
483 * are not needed. This function locates the LEBs used to record
484 * orphans, and un-maps them.
485 */
486static int clear_orphans(struct ubifs_info *c)
487{
488 int lnum, err;
489
490 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
491 err = ubifs_leb_unmap(c, lnum);
492 if (err)
493 return err;
494 }
495 c->ohead_lnum = c->orph_first;
496 c->ohead_offs = 0;
497 return 0;
498}
499
500/**
501 * insert_dead_orphan - insert an orphan.
502 * @c: UBIFS file-system description object
503 * @inum: orphan inode number
504 *
505 * This function is a helper to the 'do_kill_orphans()' function. The orphan
506 * must be kept until the next commit, so it is added to the rb-tree and the
507 * deletion list.
508 */
509static int insert_dead_orphan(struct ubifs_info *c, ino_t inum)
510{
511 struct ubifs_orphan *orphan, *o;
512 struct rb_node **p, *parent = NULL;
513
514 orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_KERNEL);
515 if (!orphan)
516 return -ENOMEM;
517 orphan->inum = inum;
518
519 p = &c->orph_tree.rb_node;
520 while (*p) {
521 parent = *p;
522 o = rb_entry(parent, struct ubifs_orphan, rb);
523 if (inum < o->inum)
524 p = &(*p)->rb_left;
525 else if (inum > o->inum)
526 p = &(*p)->rb_right;
527 else {
528 /* Already added - no problem */
529 kfree(orphan);
530 return 0;
531 }
532 }
533 c->tot_orphans += 1;
534 rb_link_node(&orphan->rb, parent, p);
535 rb_insert_color(&orphan->rb, &c->orph_tree);
536 list_add_tail(&orphan->list, &c->orph_list);
537 orphan->dnext = c->orph_dnext;
538 c->orph_dnext = orphan;
539 dbg_mnt("ino %lu, new %d, tot %d",
540 inum, c->new_orphans, c->tot_orphans);
541 return 0;
542}
543
544/**
545 * do_kill_orphans - remove orphan inodes from the index.
546 * @c: UBIFS file-system description object
547 * @sleb: scanned LEB
548 * @last_cmt_no: cmt_no of last orph node read is passed and returned here
549 * @outofdate: whether the LEB is out of date is returned here
550 * @last_flagged: whether the end orph node is encountered
551 *
552 * This function is a helper to the 'kill_orphans()' function. It goes through
553 * every orphan node in a LEB and for every inode number recorded, removes
554 * all keys for that inode from the TNC.
555 */
556static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
557 unsigned long long *last_cmt_no, int *outofdate,
558 int *last_flagged)
559{
560 struct ubifs_scan_node *snod;
561 struct ubifs_orph_node *orph;
562 unsigned long long cmt_no;
563 ino_t inum;
564 int i, n, err, first = 1;
565
566 list_for_each_entry(snod, &sleb->nodes, list) {
567 if (snod->type != UBIFS_ORPH_NODE) {
568 ubifs_err("invalid node type %d in orphan area at "
569 "%d:%d", snod->type, sleb->lnum, snod->offs);
570 dbg_dump_node(c, snod->node);
571 return -EINVAL;
572 }
573
574 orph = snod->node;
575
576 /* Check commit number */
577 cmt_no = le64_to_cpu(orph->cmt_no) & LLONG_MAX;
578 /*
579 * The commit number on the master node may be less, because
580 * of a failed commit. If there are several failed commits in a
581 * row, the commit number written on orph nodes will continue to
582 * increase (because the commit number is adjusted here) even
583 * though the commit number on the master node stays the same
584 * because the master node has not been re-written.
585 */
586 if (cmt_no > c->cmt_no)
587 c->cmt_no = cmt_no;
588 if (cmt_no < *last_cmt_no && *last_flagged) {
589 /*
590 * The last orph node had a higher commit number and was
591 * flagged as the last written for that commit number.
592 * That makes this orph node, out of date.
593 */
594 if (!first) {
595 ubifs_err("out of order commit number %llu in "
596 "orphan node at %d:%d",
597 cmt_no, sleb->lnum, snod->offs);
598 dbg_dump_node(c, snod->node);
599 return -EINVAL;
600 }
601 dbg_rcvry("out of date LEB %d", sleb->lnum);
602 *outofdate = 1;
603 return 0;
604 }
605
606 if (first)
607 first = 0;
608
609 n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3;
610 for (i = 0; i < n; i++) {
611 inum = le64_to_cpu(orph->inos[i]);
612 dbg_rcvry("deleting orphaned inode %lu", inum);
613 err = ubifs_tnc_remove_ino(c, inum);
614 if (err)
615 return err;
616 err = insert_dead_orphan(c, inum);
617 if (err)
618 return err;
619 }
620
621 *last_cmt_no = cmt_no;
622 if (le64_to_cpu(orph->cmt_no) & (1ULL << 63)) {
623 dbg_rcvry("last orph node for commit %llu at %d:%d",
624 cmt_no, sleb->lnum, snod->offs);
625 *last_flagged = 1;
626 } else
627 *last_flagged = 0;
628 }
629
630 return 0;
631}
632
633/**
634 * kill_orphans - remove all orphan inodes from the index.
635 * @c: UBIFS file-system description object
636 *
637 * If recovery is required, then orphan inodes recorded during the previous
638 * session (which ended with an unclean unmount) must be deleted from the index.
639 * This is done by updating the TNC, but since the index is not updated until
640 * the next commit, the LEBs where the orphan information is recorded are not
641 * erased until the next commit.
642 */
643static int kill_orphans(struct ubifs_info *c)
644{
645 unsigned long long last_cmt_no = 0;
646 int lnum, err = 0, outofdate = 0, last_flagged = 0;
647
648 c->ohead_lnum = c->orph_first;
649 c->ohead_offs = 0;
650 /* Check no-orphans flag and skip this if no orphans */
651 if (c->no_orphs) {
652 dbg_rcvry("no orphans");
653 return 0;
654 }
655 /*
656 * Orph nodes always start at c->orph_first and are written to each
657 * successive LEB in turn. Generally unused LEBs will have been unmapped
658 * but may contain out of date orph nodes if the unmap didn't go
659 * through. In addition, the last orph node written for each commit is
660 * marked (top bit of orph->cmt_no is set to 1). It is possible that
661 * there are orph nodes from the next commit (i.e. the commit did not
662 * complete successfully). In that case, no orphans will have been lost
663 * due to the way that orphans are written, and any orphans added will
664 * be valid orphans anyway and so can be deleted.
665 */
666 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
667 struct ubifs_scan_leb *sleb;
668
669 dbg_rcvry("LEB %d", lnum);
670 sleb = ubifs_scan(c, lnum, 0, c->sbuf);
671 if (IS_ERR(sleb)) {
672 sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0);
673 if (IS_ERR(sleb)) {
674 err = PTR_ERR(sleb);
675 break;
676 }
677 }
678 err = do_kill_orphans(c, sleb, &last_cmt_no, &outofdate,
679 &last_flagged);
680 if (err || outofdate) {
681 ubifs_scan_destroy(sleb);
682 break;
683 }
684 if (sleb->endpt) {
685 c->ohead_lnum = lnum;
686 c->ohead_offs = sleb->endpt;
687 }
688 ubifs_scan_destroy(sleb);
689 }
690 return err;
691}
692
693/**
694 * ubifs_mount_orphans - delete orphan inodes and erase LEBs that recorded them.
695 * @c: UBIFS file-system description object
696 * @unclean: indicates recovery from unclean unmount
697 * @read_only: indicates read only mount
698 *
699 * This function is called when mounting to erase orphans from the previous
700 * session. If UBIFS was not unmounted cleanly, then the inodes recorded as
701 * orphans are deleted.
702 */
703int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only)
704{
705 int err = 0;
706
707 c->max_orphans = tot_avail_orphs(c);
708
709 if (!read_only) {
710 c->orph_buf = vmalloc(c->leb_size);
711 if (!c->orph_buf)
712 return -ENOMEM;
713 }
714
715 if (unclean)
716 err = kill_orphans(c);
717 else if (!read_only)
718 err = clear_orphans(c);
719
720 return err;
721}
722
723#ifdef CONFIG_UBIFS_FS_DEBUG
724
725struct check_orphan {
726 struct rb_node rb;
727 ino_t inum;
728};
729
730struct check_info {
731 unsigned long last_ino;
732 unsigned long tot_inos;
733 unsigned long missing;
734 unsigned long long leaf_cnt;
735 struct ubifs_ino_node *node;
736 struct rb_root root;
737};
738
739static int dbg_find_orphan(struct ubifs_info *c, ino_t inum)
740{
741 struct ubifs_orphan *o;
742 struct rb_node *p;
743
744 spin_lock(&c->orphan_lock);
745 p = c->orph_tree.rb_node;
746 while (p) {
747 o = rb_entry(p, struct ubifs_orphan, rb);
748 if (inum < o->inum)
749 p = p->rb_left;
750 else if (inum > o->inum)
751 p = p->rb_right;
752 else {
753 spin_unlock(&c->orphan_lock);
754 return 1;
755 }
756 }
757 spin_unlock(&c->orphan_lock);
758 return 0;
759}
760
761static int dbg_ins_check_orphan(struct rb_root *root, ino_t inum)
762{
763 struct check_orphan *orphan, *o;
764 struct rb_node **p, *parent = NULL;
765
766 orphan = kzalloc(sizeof(struct check_orphan), GFP_NOFS);
767 if (!orphan)
768 return -ENOMEM;
769 orphan->inum = inum;
770
771 p = &root->rb_node;
772 while (*p) {
773 parent = *p;
774 o = rb_entry(parent, struct check_orphan, rb);
775 if (inum < o->inum)
776 p = &(*p)->rb_left;
777 else if (inum > o->inum)
778 p = &(*p)->rb_right;
779 else {
780 kfree(orphan);
781 return 0;
782 }
783 }
784 rb_link_node(&orphan->rb, parent, p);
785 rb_insert_color(&orphan->rb, root);
786 return 0;
787}
788
789static int dbg_find_check_orphan(struct rb_root *root, ino_t inum)
790{
791 struct check_orphan *o;
792 struct rb_node *p;
793
794 p = root->rb_node;
795 while (p) {
796 o = rb_entry(p, struct check_orphan, rb);
797 if (inum < o->inum)
798 p = p->rb_left;
799 else if (inum > o->inum)
800 p = p->rb_right;
801 else
802 return 1;
803 }
804 return 0;
805}
806
807static void dbg_free_check_tree(struct rb_root *root)
808{
809 struct rb_node *this = root->rb_node;
810 struct check_orphan *o;
811
812 while (this) {
813 if (this->rb_left) {
814 this = this->rb_left;
815 continue;
816 } else if (this->rb_right) {
817 this = this->rb_right;
818 continue;
819 }
820 o = rb_entry(this, struct check_orphan, rb);
821 this = rb_parent(this);
822 if (this) {
823 if (this->rb_left == &o->rb)
824 this->rb_left = NULL;
825 else
826 this->rb_right = NULL;
827 }
828 kfree(o);
829 }
830}
831
832static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr,
833 void *priv)
834{
835 struct check_info *ci = priv;
836 ino_t inum;
837 int err;
838
839 inum = key_inum(c, &zbr->key);
840 if (inum != ci->last_ino) {
841 /* Lowest node type is the inode node, so it comes first */
842 if (key_type(c, &zbr->key) != UBIFS_INO_KEY)
843 ubifs_err("found orphan node ino %lu, type %d", inum,
844 key_type(c, &zbr->key));
845 ci->last_ino = inum;
846 ci->tot_inos += 1;
847 err = ubifs_tnc_read_node(c, zbr, ci->node);
848 if (err) {
849 ubifs_err("node read failed, error %d", err);
850 return err;
851 }
852 if (ci->node->nlink == 0)
853 /* Must be recorded as an orphan */
854 if (!dbg_find_check_orphan(&ci->root, inum) &&
855 !dbg_find_orphan(c, inum)) {
856 ubifs_err("missing orphan, ino %lu", inum);
857 ci->missing += 1;
858 }
859 }
860 ci->leaf_cnt += 1;
861 return 0;
862}
863
864static int dbg_read_orphans(struct check_info *ci, struct ubifs_scan_leb *sleb)
865{
866 struct ubifs_scan_node *snod;
867 struct ubifs_orph_node *orph;
868 ino_t inum;
869 int i, n, err;
870
871 list_for_each_entry(snod, &sleb->nodes, list) {
872 cond_resched();
873 if (snod->type != UBIFS_ORPH_NODE)
874 continue;
875 orph = snod->node;
876 n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3;
877 for (i = 0; i < n; i++) {
878 inum = le64_to_cpu(orph->inos[i]);
879 err = dbg_ins_check_orphan(&ci->root, inum);
880 if (err)
881 return err;
882 }
883 }
884 return 0;
885}
886
887static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
888{
889 int lnum, err = 0;
890
891 /* Check no-orphans flag and skip this if no orphans */
892 if (c->no_orphs)
893 return 0;
894
895 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
896 struct ubifs_scan_leb *sleb;
897
898 sleb = ubifs_scan(c, lnum, 0, c->dbg_buf);
899 if (IS_ERR(sleb)) {
900 err = PTR_ERR(sleb);
901 break;
902 }
903
904 err = dbg_read_orphans(ci, sleb);
905 ubifs_scan_destroy(sleb);
906 if (err)
907 break;
908 }
909
910 return err;
911}
912
913static int dbg_check_orphans(struct ubifs_info *c)
914{
915 struct check_info ci;
916 int err;
917
918 if (!(ubifs_chk_flags & UBIFS_CHK_ORPH))
919 return 0;
920
921 ci.last_ino = 0;
922 ci.tot_inos = 0;
923 ci.missing = 0;
924 ci.leaf_cnt = 0;
925 ci.root = RB_ROOT;
926 ci.node = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS);
927 if (!ci.node) {
928 ubifs_err("out of memory");
929 return -ENOMEM;
930 }
931
932 err = dbg_scan_orphans(c, &ci);
933 if (err)
934 goto out;
935
936 err = dbg_walk_index(c, &dbg_orphan_check, NULL, &ci);
937 if (err) {
938 ubifs_err("cannot scan TNC, error %d", err);
939 goto out;
940 }
941
942 if (ci.missing) {
943 ubifs_err("%lu missing orphan(s)", ci.missing);
944 err = -EINVAL;
945 goto out;
946 }
947
948 dbg_cmt("last inode number is %lu", ci.last_ino);
949 dbg_cmt("total number of inodes is %lu", ci.tot_inos);
950 dbg_cmt("total number of leaf nodes is %llu", ci.leaf_cnt);
951
952out:
953 dbg_free_check_tree(&ci.root);
954 kfree(ci.node);
955 return err;
956}
957
958#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
new file mode 100644
index 000000000000..77d26c141cf6
--- /dev/null
+++ b/fs/ubifs/recovery.c
@@ -0,0 +1,1519 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Adrian Hunter
20 * Artem Bityutskiy (Битюцкий Артём)
21 */
22
23/*
24 * This file implements functions needed to recover from unclean un-mounts.
25 * When UBIFS is mounted, it checks a flag on the master node to determine if
26 * an un-mount was completed successfully. If not, the process of mounting
27 * incorporates additional checking and fixing of on-flash data structures.
28 * UBIFS always cleans away all remnants of an unclean un-mount, so that
29 * errors do not accumulate. However UBIFS defers recovery if it is mounted
30 * read-only, and the flash is not modified in that case.
31 */
32
33#include <linux/crc32.h>
34#include "ubifs.h"
35
/**
 * is_empty - determine whether a buffer is empty (contains all 0xff).
 * @buf: buffer to check
 * @len: length of buffer
 *
 * This function returns %1 if every one of the first @len bytes of @buf is
 * 0xff (which includes the case of a zero @len) and %0 as soon as any other
 * byte value is seen.
 */
static int is_empty(void *buf, int len)
{
	const uint8_t *byte = buf;
	int left;

	for (left = len; left > 0; left--, byte++) {
		if (*byte != 0xff)
			return 0;
	}
	return 1;
}
54
55/**
56 * get_master_node - get the last valid master node allowing for corruption.
57 * @c: UBIFS file-system description object
58 * @lnum: LEB number
59 * @pbuf: buffer containing the LEB read, is returned here
60 * @mst: master node, if found, is returned here
61 * @cor: corruption, if found, is returned here
62 *
63 * This function allocates a buffer, reads the LEB into it, and finds and
64 * returns the last valid master node allowing for one area of corruption.
65 * The corrupt area, if there is one, must be consistent with the assumption
66 * that it is the result of an unclean unmount while the master node was being
67 * written. Under those circumstances, it is valid to use the previously written
68 * master node.
69 *
70 * This function returns %0 on success and a negative error code on failure.
71 */
72static int get_master_node(const struct ubifs_info *c, int lnum, void **pbuf,
73 struct ubifs_mst_node **mst, void **cor)
74{
75 const int sz = c->mst_node_alsz;
76 int err, offs, len;
77 void *sbuf, *buf;
78
79 sbuf = vmalloc(c->leb_size);
80 if (!sbuf)
81 return -ENOMEM;
82
83 err = ubi_read(c->ubi, lnum, sbuf, 0, c->leb_size);
84 if (err && err != -EBADMSG)
85 goto out_free;
86
87 /* Find the first position that is definitely not a node */
88 offs = 0;
89 buf = sbuf;
90 len = c->leb_size;
91 while (offs + UBIFS_MST_NODE_SZ <= c->leb_size) {
92 struct ubifs_ch *ch = buf;
93
94 if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC)
95 break;
96 offs += sz;
97 buf += sz;
98 len -= sz;
99 }
100 /* See if there was a valid master node before that */
101 if (offs) {
102 int ret;
103
104 offs -= sz;
105 buf -= sz;
106 len += sz;
107 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
108 if (ret != SCANNED_A_NODE && offs) {
109 /* Could have been corruption so check one place back */
110 offs -= sz;
111 buf -= sz;
112 len += sz;
113 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
114 if (ret != SCANNED_A_NODE)
115 /*
116 * We accept only one area of corruption because
117 * we are assuming that it was caused while
118 * trying to write a master node.
119 */
120 goto out_err;
121 }
122 if (ret == SCANNED_A_NODE) {
123 struct ubifs_ch *ch = buf;
124
125 if (ch->node_type != UBIFS_MST_NODE)
126 goto out_err;
127 dbg_rcvry("found a master node at %d:%d", lnum, offs);
128 *mst = buf;
129 offs += sz;
130 buf += sz;
131 len -= sz;
132 }
133 }
134 /* Check for corruption */
135 if (offs < c->leb_size) {
136 if (!is_empty(buf, min_t(int, len, sz))) {
137 *cor = buf;
138 dbg_rcvry("found corruption at %d:%d", lnum, offs);
139 }
140 offs += sz;
141 buf += sz;
142 len -= sz;
143 }
144 /* Check remaining empty space */
145 if (offs < c->leb_size)
146 if (!is_empty(buf, len))
147 goto out_err;
148 *pbuf = sbuf;
149 return 0;
150
151out_err:
152 err = -EINVAL;
153out_free:
154 vfree(sbuf);
155 *mst = NULL;
156 *cor = NULL;
157 return err;
158}
159
160/**
161 * write_rcvrd_mst_node - write recovered master node.
162 * @c: UBIFS file-system description object
163 * @mst: master node
164 *
165 * This function returns %0 on success and a negative error code on failure.
166 */
167static int write_rcvrd_mst_node(struct ubifs_info *c,
168 struct ubifs_mst_node *mst)
169{
170 int err = 0, lnum = UBIFS_MST_LNUM, sz = c->mst_node_alsz;
171 uint32_t save_flags;
172
173 dbg_rcvry("recovery");
174
175 save_flags = mst->flags;
176 mst->flags = cpu_to_le32(le32_to_cpu(mst->flags) | UBIFS_MST_RCVRY);
177
178 ubifs_prepare_node(c, mst, UBIFS_MST_NODE_SZ, 1);
179 err = ubi_leb_change(c->ubi, lnum, mst, sz, UBI_SHORTTERM);
180 if (err)
181 goto out;
182 err = ubi_leb_change(c->ubi, lnum + 1, mst, sz, UBI_SHORTTERM);
183 if (err)
184 goto out;
185out:
186 mst->flags = save_flags;
187 return err;
188}
189
190/**
191 * ubifs_recover_master_node - recover the master node.
192 * @c: UBIFS file-system description object
193 *
194 * This function recovers the master node from corruption that may occur due to
195 * an unclean unmount.
196 *
197 * This function returns %0 on success and a negative error code on failure.
198 */
199int ubifs_recover_master_node(struct ubifs_info *c)
200{
201 void *buf1 = NULL, *buf2 = NULL, *cor1 = NULL, *cor2 = NULL;
202 struct ubifs_mst_node *mst1 = NULL, *mst2 = NULL, *mst;
203 const int sz = c->mst_node_alsz;
204 int err, offs1, offs2;
205
206 dbg_rcvry("recovery");
207
208 err = get_master_node(c, UBIFS_MST_LNUM, &buf1, &mst1, &cor1);
209 if (err)
210 goto out_free;
211
212 err = get_master_node(c, UBIFS_MST_LNUM + 1, &buf2, &mst2, &cor2);
213 if (err)
214 goto out_free;
215
216 if (mst1) {
217 offs1 = (void *)mst1 - buf1;
218 if ((le32_to_cpu(mst1->flags) & UBIFS_MST_RCVRY) &&
219 (offs1 == 0 && !cor1)) {
220 /*
221 * mst1 was written by recovery at offset 0 with no
222 * corruption.
223 */
224 dbg_rcvry("recovery recovery");
225 mst = mst1;
226 } else if (mst2) {
227 offs2 = (void *)mst2 - buf2;
228 if (offs1 == offs2) {
229 /* Same offset, so must be the same */
230 if (memcmp((void *)mst1 + UBIFS_CH_SZ,
231 (void *)mst2 + UBIFS_CH_SZ,
232 UBIFS_MST_NODE_SZ - UBIFS_CH_SZ))
233 goto out_err;
234 mst = mst1;
235 } else if (offs2 + sz == offs1) {
236 /* 1st LEB was written, 2nd was not */
237 if (cor1)
238 goto out_err;
239 mst = mst1;
240 } else if (offs1 == 0 && offs2 + sz >= c->leb_size) {
241 /* 1st LEB was unmapped and written, 2nd not */
242 if (cor1)
243 goto out_err;
244 mst = mst1;
245 } else
246 goto out_err;
247 } else {
248 /*
249 * 2nd LEB was unmapped and about to be written, so
250 * there must be only one master node in the first LEB
251 * and no corruption.
252 */
253 if (offs1 != 0 || cor1)
254 goto out_err;
255 mst = mst1;
256 }
257 } else {
258 if (!mst2)
259 goto out_err;
260 /*
261 * 1st LEB was unmapped and about to be written, so there must
262 * be no room left in 2nd LEB.
263 */
264 offs2 = (void *)mst2 - buf2;
265 if (offs2 + sz + sz <= c->leb_size)
266 goto out_err;
267 mst = mst2;
268 }
269
270 dbg_rcvry("recovered master node from LEB %d",
271 (mst == mst1 ? UBIFS_MST_LNUM : UBIFS_MST_LNUM + 1));
272
273 memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ);
274
275 if ((c->vfs_sb->s_flags & MS_RDONLY)) {
276 /* Read-only mode. Keep a copy for switching to rw mode */
277 c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL);
278 if (!c->rcvrd_mst_node) {
279 err = -ENOMEM;
280 goto out_free;
281 }
282 memcpy(c->rcvrd_mst_node, c->mst_node, UBIFS_MST_NODE_SZ);
283 } else {
284 /* Write the recovered master node */
285 c->max_sqnum = le64_to_cpu(mst->ch.sqnum) - 1;
286 err = write_rcvrd_mst_node(c, c->mst_node);
287 if (err)
288 goto out_free;
289 }
290
291 vfree(buf2);
292 vfree(buf1);
293
294 return 0;
295
296out_err:
297 err = -EINVAL;
298out_free:
299 ubifs_err("failed to recover master node");
300 if (mst1) {
301 dbg_err("dumping first master node");
302 dbg_dump_node(c, mst1);
303 }
304 if (mst2) {
305 dbg_err("dumping second master node");
306 dbg_dump_node(c, mst2);
307 }
308 vfree(buf2);
309 vfree(buf1);
310 return err;
311}
312
313/**
314 * ubifs_write_rcvrd_mst_node - write the recovered master node.
315 * @c: UBIFS file-system description object
316 *
317 * This function writes the master node that was recovered during mounting in
318 * read-only mode and must now be written because we are remounting rw.
319 *
320 * This function returns %0 on success and a negative error code on failure.
321 */
322int ubifs_write_rcvrd_mst_node(struct ubifs_info *c)
323{
324 int err;
325
326 if (!c->rcvrd_mst_node)
327 return 0;
328 c->rcvrd_mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
329 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
330 err = write_rcvrd_mst_node(c, c->rcvrd_mst_node);
331 if (err)
332 return err;
333 kfree(c->rcvrd_mst_node);
334 c->rcvrd_mst_node = NULL;
335 return 0;
336}
337
338/**
339 * is_last_write - determine if an offset was in the last write to a LEB.
340 * @c: UBIFS file-system description object
341 * @buf: buffer to check
342 * @offs: offset to check
343 *
344 * This function returns %1 if @offs was in the last write to the LEB whose data
345 * is in @buf, otherwise %0 is returned. The determination is made by checking
346 * for subsequent empty space starting from the next min_io_size boundary (or a
347 * bit less than the common header size if min_io_size is one).
348 */
349static int is_last_write(const struct ubifs_info *c, void *buf, int offs)
350{
351 int empty_offs;
352 int check_len;
353 uint8_t *p;
354
355 if (c->min_io_size == 1) {
356 check_len = c->leb_size - offs;
357 p = buf + check_len;
358 for (; check_len > 0; check_len--)
359 if (*--p != 0xff)
360 break;
361 /*
362 * 'check_len' is the size of the corruption which cannot be
363 * more than the size of 1 node if it was caused by an unclean
364 * unmount.
365 */
366 if (check_len > UBIFS_MAX_NODE_SZ)
367 return 0;
368 return 1;
369 }
370
371 /*
372 * Round up to the next c->min_io_size boundary i.e. 'offs' is in the
373 * last wbuf written. After that should be empty space.
374 */
375 empty_offs = ALIGN(offs + 1, c->min_io_size);
376 check_len = c->leb_size - empty_offs;
377 p = buf + empty_offs - offs;
378
379 for (; check_len > 0; check_len--)
380 if (*p++ != 0xff)
381 return 0;
382 return 1;
383}
384
385/**
386 * clean_buf - clean the data from an LEB sitting in a buffer.
387 * @c: UBIFS file-system description object
388 * @buf: buffer to clean
389 * @lnum: LEB number to clean
390 * @offs: offset from which to clean
391 * @len: length of buffer
392 *
393 * This function pads up to the next min_io_size boundary (if there is one) and
394 * sets empty space to all 0xff. @buf, @offs and @len are updated to the next
395 * min_io_size boundary (if there is one).
396 */
397static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
398 int *offs, int *len)
399{
400 int empty_offs, pad_len;
401
402 lnum = lnum;
403 dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs);
404
405 if (c->min_io_size == 1) {
406 memset(*buf, 0xff, c->leb_size - *offs);
407 return;
408 }
409
410 ubifs_assert(!(*offs & 7));
411 empty_offs = ALIGN(*offs, c->min_io_size);
412 pad_len = empty_offs - *offs;
413 ubifs_pad(c, *buf, pad_len);
414 *offs += pad_len;
415 *buf += pad_len;
416 *len -= pad_len;
417 memset(*buf, 0xff, c->leb_size - empty_offs);
418}
419
420/**
421 * no_more_nodes - determine if there are no more nodes in a buffer.
422 * @c: UBIFS file-system description object
423 * @buf: buffer to check
424 * @len: length of buffer
425 * @lnum: LEB number of the LEB from which @buf was read
426 * @offs: offset from which @buf was read
427 *
428 * This function scans @buf for more nodes and returns %0 is a node is found and
429 * %1 if no more nodes are found.
430 */
431static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
432 int lnum, int offs)
433{
434 int skip, next_offs = 0;
435
436 if (len > UBIFS_DATA_NODE_SZ) {
437 struct ubifs_ch *ch = buf;
438 int dlen = le32_to_cpu(ch->len);
439
440 if (ch->node_type == UBIFS_DATA_NODE && dlen >= UBIFS_CH_SZ &&
441 dlen <= UBIFS_MAX_DATA_NODE_SZ)
442 /* The corrupt node looks like a data node */
443 next_offs = ALIGN(offs + dlen, 8);
444 }
445
446 if (c->min_io_size == 1)
447 skip = 8;
448 else
449 skip = ALIGN(offs + 1, c->min_io_size) - offs;
450
451 offs += skip;
452 buf += skip;
453 len -= skip;
454 while (len > 8) {
455 struct ubifs_ch *ch = buf;
456 uint32_t magic = le32_to_cpu(ch->magic);
457 int ret;
458
459 if (magic == UBIFS_NODE_MAGIC) {
460 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
461 if (ret == SCANNED_A_NODE || ret > 0) {
462 /*
463 * There is a small chance this is just data in
464 * a data node, so check that possibility. e.g.
465 * this is part of a file that itself contains
466 * a UBIFS image.
467 */
468 if (next_offs && offs + le32_to_cpu(ch->len) <=
469 next_offs)
470 continue;
471 dbg_rcvry("unexpected node at %d:%d", lnum,
472 offs);
473 return 0;
474 }
475 }
476 offs += 8;
477 buf += 8;
478 len -= 8;
479 }
480 return 1;
481}
482
483/**
484 * fix_unclean_leb - fix an unclean LEB.
485 * @c: UBIFS file-system description object
486 * @sleb: scanned LEB information
487 * @start: offset where scan started
488 */
489static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
490 int start)
491{
492 int lnum = sleb->lnum, endpt = start;
493
494 /* Get the end offset of the last node we are keeping */
495 if (!list_empty(&sleb->nodes)) {
496 struct ubifs_scan_node *snod;
497
498 snod = list_entry(sleb->nodes.prev,
499 struct ubifs_scan_node, list);
500 endpt = snod->offs + snod->len;
501 }
502
503 if ((c->vfs_sb->s_flags & MS_RDONLY) && !c->remounting_rw) {
504 /* Add to recovery list */
505 struct ubifs_unclean_leb *ucleb;
506
507 dbg_rcvry("need to fix LEB %d start %d endpt %d",
508 lnum, start, sleb->endpt);
509 ucleb = kzalloc(sizeof(struct ubifs_unclean_leb), GFP_NOFS);
510 if (!ucleb)
511 return -ENOMEM;
512 ucleb->lnum = lnum;
513 ucleb->endpt = endpt;
514 list_add_tail(&ucleb->list, &c->unclean_leb_list);
515 } else {
516 /* Write the fixed LEB back to flash */
517 int err;
518
519 dbg_rcvry("fixing LEB %d start %d endpt %d",
520 lnum, start, sleb->endpt);
521 if (endpt == 0) {
522 err = ubifs_leb_unmap(c, lnum);
523 if (err)
524 return err;
525 } else {
526 int len = ALIGN(endpt, c->min_io_size);
527
528 if (start) {
529 err = ubi_read(c->ubi, lnum, sleb->buf, 0,
530 start);
531 if (err)
532 return err;
533 }
534 /* Pad to min_io_size */
535 if (len > endpt) {
536 int pad_len = len - ALIGN(endpt, 8);
537
538 if (pad_len > 0) {
539 void *buf = sleb->buf + len - pad_len;
540
541 ubifs_pad(c, buf, pad_len);
542 }
543 }
544 err = ubi_leb_change(c->ubi, lnum, sleb->buf, len,
545 UBI_UNKNOWN);
546 if (err)
547 return err;
548 }
549 }
550 return 0;
551}
552
553/**
554 * drop_incomplete_group - drop nodes from an incomplete group.
555 * @sleb: scanned LEB information
556 * @offs: offset of dropped nodes is returned here
557 *
558 * This function returns %1 if nodes are dropped and %0 otherwise.
559 */
560static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
561{
562 int dropped = 0;
563
564 while (!list_empty(&sleb->nodes)) {
565 struct ubifs_scan_node *snod;
566 struct ubifs_ch *ch;
567
568 snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
569 list);
570 ch = snod->node;
571 if (ch->group_type != UBIFS_IN_NODE_GROUP)
572 return dropped;
573 dbg_rcvry("dropping node at %d:%d", sleb->lnum, snod->offs);
574 *offs = snod->offs;
575 list_del(&snod->list);
576 kfree(snod);
577 sleb->nodes_cnt -= 1;
578 dropped = 1;
579 }
580 return dropped;
581}
582
583/**
584 * ubifs_recover_leb - scan and recover a LEB.
585 * @c: UBIFS file-system description object
586 * @lnum: LEB number
587 * @offs: offset
588 * @sbuf: LEB-sized buffer to use
589 * @grouped: nodes may be grouped for recovery
590 *
591 * This function does a scan of a LEB, but caters for errors that might have
592 * been caused by the unclean unmount from which we are attempting to recover.
593 *
594 * This function returns %0 on success and a negative error code on failure.
595 */
596struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
597 int offs, void *sbuf, int grouped)
598{
599 int err, len = c->leb_size - offs, need_clean = 0, quiet = 1;
600 int empty_chkd = 0, start = offs;
601 struct ubifs_scan_leb *sleb;
602 void *buf = sbuf + offs;
603
604 dbg_rcvry("%d:%d", lnum, offs);
605
606 sleb = ubifs_start_scan(c, lnum, offs, sbuf);
607 if (IS_ERR(sleb))
608 return sleb;
609
610 if (sleb->ecc)
611 need_clean = 1;
612
613 while (len >= 8) {
614 int ret;
615
616 dbg_scan("look at LEB %d:%d (%d bytes left)",
617 lnum, offs, len);
618
619 cond_resched();
620
621 /*
622 * Scan quietly until there is an error from which we cannot
623 * recover
624 */
625 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet);
626
627 if (ret == SCANNED_A_NODE) {
628 /* A valid node, and not a padding node */
629 struct ubifs_ch *ch = buf;
630 int node_len;
631
632 err = ubifs_add_snod(c, sleb, buf, offs);
633 if (err)
634 goto error;
635 node_len = ALIGN(le32_to_cpu(ch->len), 8);
636 offs += node_len;
637 buf += node_len;
638 len -= node_len;
639 continue;
640 }
641
642 if (ret > 0) {
643 /* Padding bytes or a valid padding node */
644 offs += ret;
645 buf += ret;
646 len -= ret;
647 continue;
648 }
649
650 if (ret == SCANNED_EMPTY_SPACE) {
651 if (!is_empty(buf, len)) {
652 if (!is_last_write(c, buf, offs))
653 break;
654 clean_buf(c, &buf, lnum, &offs, &len);
655 need_clean = 1;
656 }
657 empty_chkd = 1;
658 break;
659 }
660
661 if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE)
662 if (is_last_write(c, buf, offs)) {
663 clean_buf(c, &buf, lnum, &offs, &len);
664 need_clean = 1;
665 empty_chkd = 1;
666 break;
667 }
668
669 if (ret == SCANNED_A_CORRUPT_NODE)
670 if (no_more_nodes(c, buf, len, lnum, offs)) {
671 clean_buf(c, &buf, lnum, &offs, &len);
672 need_clean = 1;
673 empty_chkd = 1;
674 break;
675 }
676
677 if (quiet) {
678 /* Redo the last scan but noisily */
679 quiet = 0;
680 continue;
681 }
682
683 switch (ret) {
684 case SCANNED_GARBAGE:
685 dbg_err("garbage");
686 goto corrupted;
687 case SCANNED_A_CORRUPT_NODE:
688 case SCANNED_A_BAD_PAD_NODE:
689 dbg_err("bad node");
690 goto corrupted;
691 default:
692 dbg_err("unknown");
693 goto corrupted;
694 }
695 }
696
697 if (!empty_chkd && !is_empty(buf, len)) {
698 if (is_last_write(c, buf, offs)) {
699 clean_buf(c, &buf, lnum, &offs, &len);
700 need_clean = 1;
701 } else {
702 ubifs_err("corrupt empty space at LEB %d:%d",
703 lnum, offs);
704 goto corrupted;
705 }
706 }
707
708 /* Drop nodes from incomplete group */
709 if (grouped && drop_incomplete_group(sleb, &offs)) {
710 buf = sbuf + offs;
711 len = c->leb_size - offs;
712 clean_buf(c, &buf, lnum, &offs, &len);
713 need_clean = 1;
714 }
715
716 if (offs % c->min_io_size) {
717 clean_buf(c, &buf, lnum, &offs, &len);
718 need_clean = 1;
719 }
720
721 ubifs_end_scan(c, sleb, lnum, offs);
722
723 if (need_clean) {
724 err = fix_unclean_leb(c, sleb, start);
725 if (err)
726 goto error;
727 }
728
729 return sleb;
730
731corrupted:
732 ubifs_scanned_corruption(c, lnum, offs, buf);
733 err = -EUCLEAN;
734error:
735 ubifs_err("LEB %d scanning failed", lnum);
736 ubifs_scan_destroy(sleb);
737 return ERR_PTR(err);
738}
739
740/**
741 * get_cs_sqnum - get commit start sequence number.
742 * @c: UBIFS file-system description object
743 * @lnum: LEB number of commit start node
744 * @offs: offset of commit start node
745 * @cs_sqnum: commit start sequence number is returned here
746 *
747 * This function returns %0 on success and a negative error code on failure.
748 */
749static int get_cs_sqnum(struct ubifs_info *c, int lnum, int offs,
750 unsigned long long *cs_sqnum)
751{
752 struct ubifs_cs_node *cs_node = NULL;
753 int err, ret;
754
755 dbg_rcvry("at %d:%d", lnum, offs);
756 cs_node = kmalloc(UBIFS_CS_NODE_SZ, GFP_KERNEL);
757 if (!cs_node)
758 return -ENOMEM;
759 if (c->leb_size - offs < UBIFS_CS_NODE_SZ)
760 goto out_err;
761 err = ubi_read(c->ubi, lnum, (void *)cs_node, offs, UBIFS_CS_NODE_SZ);
762 if (err && err != -EBADMSG)
763 goto out_free;
764 ret = ubifs_scan_a_node(c, cs_node, UBIFS_CS_NODE_SZ, lnum, offs, 0);
765 if (ret != SCANNED_A_NODE) {
766 dbg_err("Not a valid node");
767 goto out_err;
768 }
769 if (cs_node->ch.node_type != UBIFS_CS_NODE) {
770 dbg_err("Node a CS node, type is %d", cs_node->ch.node_type);
771 goto out_err;
772 }
773 if (le64_to_cpu(cs_node->cmt_no) != c->cmt_no) {
774 dbg_err("CS node cmt_no %llu != current cmt_no %llu",
775 (unsigned long long)le64_to_cpu(cs_node->cmt_no),
776 c->cmt_no);
777 goto out_err;
778 }
779 *cs_sqnum = le64_to_cpu(cs_node->ch.sqnum);
780 dbg_rcvry("commit start sqnum %llu", *cs_sqnum);
781 kfree(cs_node);
782 return 0;
783
784out_err:
785 err = -EINVAL;
786out_free:
787 ubifs_err("failed to get CS sqnum");
788 kfree(cs_node);
789 return err;
790}
791
792/**
793 * ubifs_recover_log_leb - scan and recover a log LEB.
794 * @c: UBIFS file-system description object
795 * @lnum: LEB number
796 * @offs: offset
797 * @sbuf: LEB-sized buffer to use
798 *
799 * This function does a scan of a LEB, but caters for errors that might have
800 * been caused by the unclean unmount from which we are attempting to recover.
801 *
802 * This function returns %0 on success and a negative error code on failure.
803 */
804struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
805 int offs, void *sbuf)
806{
807 struct ubifs_scan_leb *sleb;
808 int next_lnum;
809
810 dbg_rcvry("LEB %d", lnum);
811 next_lnum = lnum + 1;
812 if (next_lnum >= UBIFS_LOG_LNUM + c->log_lebs)
813 next_lnum = UBIFS_LOG_LNUM;
814 if (next_lnum != c->ltail_lnum) {
815 /*
816 * We can only recover at the end of the log, so check that the
817 * next log LEB is empty or out of date.
818 */
819 sleb = ubifs_scan(c, next_lnum, 0, sbuf);
820 if (IS_ERR(sleb))
821 return sleb;
822 if (sleb->nodes_cnt) {
823 struct ubifs_scan_node *snod;
824 unsigned long long cs_sqnum = c->cs_sqnum;
825
826 snod = list_entry(sleb->nodes.next,
827 struct ubifs_scan_node, list);
828 if (cs_sqnum == 0) {
829 int err;
830
831 err = get_cs_sqnum(c, lnum, offs, &cs_sqnum);
832 if (err) {
833 ubifs_scan_destroy(sleb);
834 return ERR_PTR(err);
835 }
836 }
837 if (snod->sqnum > cs_sqnum) {
838 ubifs_err("unrecoverable log corruption "
839 "in LEB %d", lnum);
840 ubifs_scan_destroy(sleb);
841 return ERR_PTR(-EUCLEAN);
842 }
843 }
844 ubifs_scan_destroy(sleb);
845 }
846 return ubifs_recover_leb(c, lnum, offs, sbuf, 0);
847}
848
849/**
850 * recover_head - recover a head.
851 * @c: UBIFS file-system description object
852 * @lnum: LEB number of head to recover
853 * @offs: offset of head to recover
854 * @sbuf: LEB-sized buffer to use
855 *
856 * This function ensures that there is no data on the flash at a head location.
857 *
858 * This function returns %0 on success and a negative error code on failure.
859 */
860static int recover_head(const struct ubifs_info *c, int lnum, int offs,
861 void *sbuf)
862{
863 int len, err, need_clean = 0;
864
865 if (c->min_io_size > 1)
866 len = c->min_io_size;
867 else
868 len = 512;
869 if (offs + len > c->leb_size)
870 len = c->leb_size - offs;
871
872 if (!len)
873 return 0;
874
875 /* Read at the head location and check it is empty flash */
876 err = ubi_read(c->ubi, lnum, sbuf, offs, len);
877 if (err)
878 need_clean = 1;
879 else {
880 uint8_t *p = sbuf;
881
882 while (len--)
883 if (*p++ != 0xff) {
884 need_clean = 1;
885 break;
886 }
887 }
888
889 if (need_clean) {
890 dbg_rcvry("cleaning head at %d:%d", lnum, offs);
891 if (offs == 0)
892 return ubifs_leb_unmap(c, lnum);
893 err = ubi_read(c->ubi, lnum, sbuf, 0, offs);
894 if (err)
895 return err;
896 return ubi_leb_change(c->ubi, lnum, sbuf, offs, UBI_UNKNOWN);
897 }
898
899 return 0;
900}
901
902/**
903 * ubifs_recover_inl_heads - recover index and LPT heads.
904 * @c: UBIFS file-system description object
905 * @sbuf: LEB-sized buffer to use
906 *
907 * This function ensures that there is no data on the flash at the index and
908 * LPT head locations.
909 *
910 * This deals with the recovery of a half-completed journal commit. UBIFS is
911 * careful never to overwrite the last version of the index or the LPT. Because
912 * the index and LPT are wandering trees, data from a half-completed commit will
913 * not be referenced anywhere in UBIFS. The data will be either in LEBs that are
914 * assumed to be empty and will be unmapped anyway before use, or in the index
915 * and LPT heads.
916 *
917 * This function returns %0 on success and a negative error code on failure.
918 */
919int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf)
920{
921 int err;
922
923 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY) || c->remounting_rw);
924
925 dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs);
926 err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf);
927 if (err)
928 return err;
929
930 dbg_rcvry("checking LPT head at %d:%d", c->nhead_lnum, c->nhead_offs);
931 err = recover_head(c, c->nhead_lnum, c->nhead_offs, sbuf);
932 if (err)
933 return err;
934
935 return 0;
936}
937
938/**
939 * clean_an_unclean_leb - read and write a LEB to remove corruption.
940 * @c: UBIFS file-system description object
941 * @ucleb: unclean LEB information
942 * @sbuf: LEB-sized buffer to use
943 *
944 * This function reads a LEB up to a point pre-determined by the mount recovery,
945 * checks the nodes, and writes the result back to the flash, thereby cleaning
946 * off any following corruption, or non-fatal ECC errors.
947 *
948 * This function returns %0 on success and a negative error code on failure.
949 */
950static int clean_an_unclean_leb(const struct ubifs_info *c,
951 struct ubifs_unclean_leb *ucleb, void *sbuf)
952{
953 int err, lnum = ucleb->lnum, offs = 0, len = ucleb->endpt, quiet = 1;
954 void *buf = sbuf;
955
956 dbg_rcvry("LEB %d len %d", lnum, len);
957
958 if (len == 0) {
959 /* Nothing to read, just unmap it */
960 err = ubifs_leb_unmap(c, lnum);
961 if (err)
962 return err;
963 return 0;
964 }
965
966 err = ubi_read(c->ubi, lnum, buf, offs, len);
967 if (err && err != -EBADMSG)
968 return err;
969
970 while (len >= 8) {
971 int ret;
972
973 cond_resched();
974
975 /* Scan quietly until there is an error */
976 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet);
977
978 if (ret == SCANNED_A_NODE) {
979 /* A valid node, and not a padding node */
980 struct ubifs_ch *ch = buf;
981 int node_len;
982
983 node_len = ALIGN(le32_to_cpu(ch->len), 8);
984 offs += node_len;
985 buf += node_len;
986 len -= node_len;
987 continue;
988 }
989
990 if (ret > 0) {
991 /* Padding bytes or a valid padding node */
992 offs += ret;
993 buf += ret;
994 len -= ret;
995 continue;
996 }
997
998 if (ret == SCANNED_EMPTY_SPACE) {
999 ubifs_err("unexpected empty space at %d:%d",
1000 lnum, offs);
1001 return -EUCLEAN;
1002 }
1003
1004 if (quiet) {
1005 /* Redo the last scan but noisily */
1006 quiet = 0;
1007 continue;
1008 }
1009
1010 ubifs_scanned_corruption(c, lnum, offs, buf);
1011 return -EUCLEAN;
1012 }
1013
1014 /* Pad to min_io_size */
1015 len = ALIGN(ucleb->endpt, c->min_io_size);
1016 if (len > ucleb->endpt) {
1017 int pad_len = len - ALIGN(ucleb->endpt, 8);
1018
1019 if (pad_len > 0) {
1020 buf = c->sbuf + len - pad_len;
1021 ubifs_pad(c, buf, pad_len);
1022 }
1023 }
1024
1025 /* Write back the LEB atomically */
1026 err = ubi_leb_change(c->ubi, lnum, sbuf, len, UBI_UNKNOWN);
1027 if (err)
1028 return err;
1029
1030 dbg_rcvry("cleaned LEB %d", lnum);
1031
1032 return 0;
1033}
1034
1035/**
1036 * ubifs_clean_lebs - clean LEBs recovered during read-only mount.
1037 * @c: UBIFS file-system description object
1038 * @sbuf: LEB-sized buffer to use
1039 *
1040 * This function cleans a LEB identified during recovery that needs to be
1041 * written but was not because UBIFS was mounted read-only. This happens when
1042 * remounting to read-write mode.
1043 *
1044 * This function returns %0 on success and a negative error code on failure.
1045 */
1046int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf)
1047{
1048 dbg_rcvry("recovery");
1049 while (!list_empty(&c->unclean_leb_list)) {
1050 struct ubifs_unclean_leb *ucleb;
1051 int err;
1052
1053 ucleb = list_entry(c->unclean_leb_list.next,
1054 struct ubifs_unclean_leb, list);
1055 err = clean_an_unclean_leb(c, ucleb, sbuf);
1056 if (err)
1057 return err;
1058 list_del(&ucleb->list);
1059 kfree(ucleb);
1060 }
1061 return 0;
1062}
1063
1064/**
1065 * ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit.
1066 * @c: UBIFS file-system description object
1067 *
1068 * Out-of-place garbage collection requires always one empty LEB with which to
1069 * start garbage collection. The LEB number is recorded in c->gc_lnum and is
1070 * written to the master node on unmounting. In the case of an unclean unmount
1071 * the value of gc_lnum recorded in the master node is out of date and cannot
1072 * be used. Instead, recovery must allocate an empty LEB for this purpose.
1073 * However, there may not be enough empty space, in which case it must be
1074 * possible to GC the dirtiest LEB into the GC head LEB.
1075 *
1076 * This function also runs the commit which causes the TNC updates from
1077 * size-recovery and orphans to be written to the flash. That is important to
1078 * ensure correct replay order for subsequent mounts.
1079 *
1080 * This function returns %0 on success and a negative error code on failure.
1081 */
1082int ubifs_rcvry_gc_commit(struct ubifs_info *c)
1083{
1084 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
1085 struct ubifs_lprops lp;
1086 int lnum, err;
1087
1088 c->gc_lnum = -1;
1089 if (wbuf->lnum == -1) {
1090 dbg_rcvry("no GC head LEB");
1091 goto find_free;
1092 }
1093 /*
1094 * See whether the used space in the dirtiest LEB fits in the GC head
1095 * LEB.
1096 */
1097 if (wbuf->offs == c->leb_size) {
1098 dbg_rcvry("no room in GC head LEB");
1099 goto find_free;
1100 }
1101 err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2);
1102 if (err) {
1103 if (err == -ENOSPC)
1104 dbg_err("could not find a dirty LEB");
1105 return err;
1106 }
1107 ubifs_assert(!(lp.flags & LPROPS_INDEX));
1108 lnum = lp.lnum;
1109 if (lp.free + lp.dirty == c->leb_size) {
1110 /* An empty LEB was returned */
1111 if (lp.free != c->leb_size) {
1112 err = ubifs_change_one_lp(c, lnum, c->leb_size,
1113 0, 0, 0, 0);
1114 if (err)
1115 return err;
1116 }
1117 err = ubifs_leb_unmap(c, lnum);
1118 if (err)
1119 return err;
1120 c->gc_lnum = lnum;
1121 dbg_rcvry("allocated LEB %d for GC", lnum);
1122 /* Run the commit */
1123 dbg_rcvry("committing");
1124 return ubifs_run_commit(c);
1125 }
1126 /*
1127 * There was no empty LEB so the used space in the dirtiest LEB must fit
1128 * in the GC head LEB.
1129 */
1130 if (lp.free + lp.dirty < wbuf->offs) {
1131 dbg_rcvry("LEB %d doesn't fit in GC head LEB %d:%d",
1132 lnum, wbuf->lnum, wbuf->offs);
1133 err = ubifs_return_leb(c, lnum);
1134 if (err)
1135 return err;
1136 goto find_free;
1137 }
1138 /*
1139 * We run the commit before garbage collection otherwise subsequent
1140 * mounts will see the GC and orphan deletion in a different order.
1141 */
1142 dbg_rcvry("committing");
1143 err = ubifs_run_commit(c);
1144 if (err)
1145 return err;
1146 /*
1147 * The data in the dirtiest LEB fits in the GC head LEB, so do the GC
1148 * - use locking to keep 'ubifs_assert()' happy.
1149 */
1150 dbg_rcvry("GC'ing LEB %d", lnum);
1151 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
1152 err = ubifs_garbage_collect_leb(c, &lp);
1153 if (err >= 0) {
1154 int err2 = ubifs_wbuf_sync_nolock(wbuf);
1155
1156 if (err2)
1157 err = err2;
1158 }
1159 mutex_unlock(&wbuf->io_mutex);
1160 if (err < 0) {
1161 dbg_err("GC failed, error %d", err);
1162 if (err == -EAGAIN)
1163 err = -EINVAL;
1164 return err;
1165 }
1166 if (err != LEB_RETAINED) {
1167 dbg_err("GC returned %d", err);
1168 return -EINVAL;
1169 }
1170 err = ubifs_leb_unmap(c, c->gc_lnum);
1171 if (err)
1172 return err;
1173 dbg_rcvry("allocated LEB %d for GC", lnum);
1174 return 0;
1175
1176find_free:
1177 /*
1178 * There is no GC head LEB or the free space in the GC head LEB is too
1179 * small. Allocate gc_lnum by calling 'ubifs_find_free_leb_for_idx()' so
1180 * GC is not run.
1181 */
1182 lnum = ubifs_find_free_leb_for_idx(c);
1183 if (lnum < 0) {
1184 dbg_err("could not find an empty LEB");
1185 return lnum;
1186 }
1187 /* And reset the index flag */
1188 err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
1189 LPROPS_INDEX, 0);
1190 if (err)
1191 return err;
1192 c->gc_lnum = lnum;
1193 dbg_rcvry("allocated LEB %d for GC", lnum);
1194 /* Run the commit */
1195 dbg_rcvry("committing");
1196 return ubifs_run_commit(c);
1197}
1198
1199/**
1200 * struct size_entry - inode size information for recovery.
1201 * @rb: link in the RB-tree of sizes
1202 * @inum: inode number
1203 * @i_size: size on inode
1204 * @d_size: maximum size based on data nodes
1205 * @exists: indicates whether the inode exists
1206 * @inode: inode if pinned in memory awaiting rw mode to fix it
1207 */
1208struct size_entry {
1209 struct rb_node rb;
1210 ino_t inum;
1211 loff_t i_size;
1212 loff_t d_size;
1213 int exists;
1214 struct inode *inode;
1215};
1216
1217/**
1218 * add_ino - add an entry to the size tree.
1219 * @c: UBIFS file-system description object
1220 * @inum: inode number
1221 * @i_size: size on inode
1222 * @d_size: maximum size based on data nodes
1223 * @exists: indicates whether the inode exists
1224 */
1225static int add_ino(struct ubifs_info *c, ino_t inum, loff_t i_size,
1226 loff_t d_size, int exists)
1227{
1228 struct rb_node **p = &c->size_tree.rb_node, *parent = NULL;
1229 struct size_entry *e;
1230
1231 while (*p) {
1232 parent = *p;
1233 e = rb_entry(parent, struct size_entry, rb);
1234 if (inum < e->inum)
1235 p = &(*p)->rb_left;
1236 else
1237 p = &(*p)->rb_right;
1238 }
1239
1240 e = kzalloc(sizeof(struct size_entry), GFP_KERNEL);
1241 if (!e)
1242 return -ENOMEM;
1243
1244 e->inum = inum;
1245 e->i_size = i_size;
1246 e->d_size = d_size;
1247 e->exists = exists;
1248
1249 rb_link_node(&e->rb, parent, p);
1250 rb_insert_color(&e->rb, &c->size_tree);
1251
1252 return 0;
1253}
1254
1255/**
1256 * find_ino - find an entry on the size tree.
1257 * @c: UBIFS file-system description object
1258 * @inum: inode number
1259 */
1260static struct size_entry *find_ino(struct ubifs_info *c, ino_t inum)
1261{
1262 struct rb_node *p = c->size_tree.rb_node;
1263 struct size_entry *e;
1264
1265 while (p) {
1266 e = rb_entry(p, struct size_entry, rb);
1267 if (inum < e->inum)
1268 p = p->rb_left;
1269 else if (inum > e->inum)
1270 p = p->rb_right;
1271 else
1272 return e;
1273 }
1274 return NULL;
1275}
1276
1277/**
1278 * remove_ino - remove an entry from the size tree.
1279 * @c: UBIFS file-system description object
1280 * @inum: inode number
1281 */
1282static void remove_ino(struct ubifs_info *c, ino_t inum)
1283{
1284 struct size_entry *e = find_ino(c, inum);
1285
1286 if (!e)
1287 return;
1288 rb_erase(&e->rb, &c->size_tree);
1289 kfree(e);
1290}
1291
1292/**
1293 * ubifs_destroy_size_tree - free resources related to the size tree.
1294 * @c: UBIFS file-system description object
1295 */
1296void ubifs_destroy_size_tree(struct ubifs_info *c)
1297{
1298 struct rb_node *this = c->size_tree.rb_node;
1299 struct size_entry *e;
1300
1301 while (this) {
1302 if (this->rb_left) {
1303 this = this->rb_left;
1304 continue;
1305 } else if (this->rb_right) {
1306 this = this->rb_right;
1307 continue;
1308 }
1309 e = rb_entry(this, struct size_entry, rb);
1310 if (e->inode)
1311 iput(e->inode);
1312 this = rb_parent(this);
1313 if (this) {
1314 if (this->rb_left == &e->rb)
1315 this->rb_left = NULL;
1316 else
1317 this->rb_right = NULL;
1318 }
1319 kfree(e);
1320 }
1321 c->size_tree = RB_ROOT;
1322}
1323
1324/**
1325 * ubifs_recover_size_accum - accumulate inode sizes for recovery.
1326 * @c: UBIFS file-system description object
1327 * @key: node key
1328 * @deletion: node is for a deletion
1329 * @new_size: inode size
1330 *
1331 * This function has two purposes:
1332 * 1) to ensure there are no data nodes that fall outside the inode size
1333 * 2) to ensure there are no data nodes for inodes that do not exist
1334 * To accomplish those purposes, a rb-tree is constructed containing an entry
1335 * for each inode number in the journal that has not been deleted, and recording
1336 * the size from the inode node, the maximum size of any data node (also altered
1337 * by truncations) and a flag indicating a inode number for which no inode node
1338 * was present in the journal.
1339 *
1340 * Note that there is still the possibility that there are data nodes that have
1341 * been committed that are beyond the inode size, however the only way to find
1342 * them would be to scan the entire index. Alternatively, some provision could
1343 * be made to record the size of inodes at the start of commit, which would seem
1344 * very cumbersome for a scenario that is quite unlikely and the only negative
1345 * consequence of which is wasted space.
1346 *
1347 * This functions returns %0 on success and a negative error code on failure.
1348 */
1349int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key,
1350 int deletion, loff_t new_size)
1351{
1352 ino_t inum = key_inum(c, key);
1353 struct size_entry *e;
1354 int err;
1355
1356 switch (key_type(c, key)) {
1357 case UBIFS_INO_KEY:
1358 if (deletion)
1359 remove_ino(c, inum);
1360 else {
1361 e = find_ino(c, inum);
1362 if (e) {
1363 e->i_size = new_size;
1364 e->exists = 1;
1365 } else {
1366 err = add_ino(c, inum, new_size, 0, 1);
1367 if (err)
1368 return err;
1369 }
1370 }
1371 break;
1372 case UBIFS_DATA_KEY:
1373 e = find_ino(c, inum);
1374 if (e) {
1375 if (new_size > e->d_size)
1376 e->d_size = new_size;
1377 } else {
1378 err = add_ino(c, inum, 0, new_size, 0);
1379 if (err)
1380 return err;
1381 }
1382 break;
1383 case UBIFS_TRUN_KEY:
1384 e = find_ino(c, inum);
1385 if (e)
1386 e->d_size = new_size;
1387 break;
1388 }
1389 return 0;
1390}
1391
1392/**
1393 * fix_size_in_place - fix inode size in place on flash.
1394 * @c: UBIFS file-system description object
1395 * @e: inode size information for recovery
1396 */
1397static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e)
1398{
1399 struct ubifs_ino_node *ino = c->sbuf;
1400 unsigned char *p;
1401 union ubifs_key key;
1402 int err, lnum, offs, len;
1403 loff_t i_size;
1404 uint32_t crc;
1405
1406 /* Locate the inode node LEB number and offset */
1407 ino_key_init(c, &key, e->inum);
1408 err = ubifs_tnc_locate(c, &key, ino, &lnum, &offs);
1409 if (err)
1410 goto out;
1411 /*
1412 * If the size recorded on the inode node is greater than the size that
1413 * was calculated from nodes in the journal then don't change the inode.
1414 */
1415 i_size = le64_to_cpu(ino->size);
1416 if (i_size >= e->d_size)
1417 return 0;
1418 /* Read the LEB */
1419 err = ubi_read(c->ubi, lnum, c->sbuf, 0, c->leb_size);
1420 if (err)
1421 goto out;
1422 /* Change the size field and recalculate the CRC */
1423 ino = c->sbuf + offs;
1424 ino->size = cpu_to_le64(e->d_size);
1425 len = le32_to_cpu(ino->ch.len);
1426 crc = crc32(UBIFS_CRC32_INIT, (void *)ino + 8, len - 8);
1427 ino->ch.crc = cpu_to_le32(crc);
1428 /* Work out where data in the LEB ends and free space begins */
1429 p = c->sbuf;
1430 len = c->leb_size - 1;
1431 while (p[len] == 0xff)
1432 len -= 1;
1433 len = ALIGN(len + 1, c->min_io_size);
1434 /* Atomically write the fixed LEB back again */
1435 err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN);
1436 if (err)
1437 goto out;
1438 dbg_rcvry("inode %lu at %d:%d size %lld -> %lld ", e->inum, lnum, offs,
1439 i_size, e->d_size);
1440 return 0;
1441
1442out:
1443 ubifs_warn("inode %lu failed to fix size %lld -> %lld error %d",
1444 e->inum, e->i_size, e->d_size, err);
1445 return err;
1446}
1447
1448/**
1449 * ubifs_recover_size - recover inode size.
1450 * @c: UBIFS file-system description object
1451 *
1452 * This function attempts to fix inode size discrepancies identified by the
1453 * 'ubifs_recover_size_accum()' function.
1454 *
1455 * This functions returns %0 on success and a negative error code on failure.
1456 */
1457int ubifs_recover_size(struct ubifs_info *c)
1458{
1459 struct rb_node *this = rb_first(&c->size_tree);
1460
1461 while (this) {
1462 struct size_entry *e;
1463 int err;
1464
1465 e = rb_entry(this, struct size_entry, rb);
1466 if (!e->exists) {
1467 union ubifs_key key;
1468
1469 ino_key_init(c, &key, e->inum);
1470 err = ubifs_tnc_lookup(c, &key, c->sbuf);
1471 if (err && err != -ENOENT)
1472 return err;
1473 if (err == -ENOENT) {
1474 /* Remove data nodes that have no inode */
1475 dbg_rcvry("removing ino %lu", e->inum);
1476 err = ubifs_tnc_remove_ino(c, e->inum);
1477 if (err)
1478 return err;
1479 } else {
1480 struct ubifs_ino_node *ino = c->sbuf;
1481
1482 e->exists = 1;
1483 e->i_size = le64_to_cpu(ino->size);
1484 }
1485 }
1486 if (e->exists && e->i_size < e->d_size) {
1487 if (!e->inode && (c->vfs_sb->s_flags & MS_RDONLY)) {
1488 /* Fix the inode size and pin it in memory */
1489 struct inode *inode;
1490
1491 inode = ubifs_iget(c->vfs_sb, e->inum);
1492 if (IS_ERR(inode))
1493 return PTR_ERR(inode);
1494 if (inode->i_size < e->d_size) {
1495 dbg_rcvry("ino %lu size %lld -> %lld",
1496 e->inum, e->d_size,
1497 inode->i_size);
1498 inode->i_size = e->d_size;
1499 ubifs_inode(inode)->ui_size = e->d_size;
1500 e->inode = inode;
1501 this = rb_next(this);
1502 continue;
1503 }
1504 iput(inode);
1505 } else {
1506 /* Fix the size in place */
1507 err = fix_size_in_place(c, e);
1508 if (err)
1509 return err;
1510 if (e->inode)
1511 iput(e->inode);
1512 }
1513 }
1514 this = rb_next(this);
1515 rb_erase(&e->rb, &c->size_tree);
1516 kfree(e);
1517 }
1518 return 0;
1519}
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
new file mode 100644
index 000000000000..7399692af859
--- /dev/null
+++ b/fs/ubifs/replay.c
@@ -0,0 +1,1075 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Adrian Hunter
20 * Artem Bityutskiy (Битюцкий Артём)
21 */
22
23/*
24 * This file contains journal replay code. It runs when the file-system is being
25 * mounted and requires no locking.
26 *
27 * The larger the journal, the longer it takes to scan it, and so the longer it
28 * takes to mount UBIFS. This is why the journal has a limited size, which may
29 * be changed depending on the system requirements. But a larger journal gives
30 * faster I/O speed because the index is written less frequently. So this is a
31 * trade-off. Also, the journal is indexed by the in-memory index (TNC), so the
32 * larger the journal, the more memory its index may consume.
33 */
34
35#include "ubifs.h"
36
/*
 * Replay flags, kept as bits in the 'flags' field of a replay entry.
 *
 * REPLAY_DELETION: the node represents a deletion
 * REPLAY_REF: the entry stands for a reference node
 */
enum {
	REPLAY_DELETION = 1 << 0,
	REPLAY_REF      = 1 << 1,
};
47
48/**
49 * struct replay_entry - replay tree entry.
50 * @lnum: logical eraseblock number of the node
51 * @offs: node offset
52 * @len: node length
53 * @sqnum: node sequence number
54 * @flags: replay flags
55 * @rb: links the replay tree
56 * @key: node key
57 * @nm: directory entry name
58 * @old_size: truncation old size
59 * @new_size: truncation new size
60 * @free: amount of free space in a bud
61 * @dirty: amount of dirty space in a bud from padding and deletion nodes
62 *
63 * UBIFS journal replay must compare node sequence numbers, which means it must
64 * build a tree of node information to insert into the TNC.
65 */
66struct replay_entry {
67 int lnum;
68 int offs;
69 int len;
70 unsigned long long sqnum;
71 int flags;
72 struct rb_node rb;
73 union ubifs_key key;
74 union {
75 struct qstr nm;
76 struct {
77 loff_t old_size;
78 loff_t new_size;
79 };
80 struct {
81 int free;
82 int dirty;
83 };
84 };
85};
86
87/**
88 * struct bud_entry - entry in the list of buds to replay.
89 * @list: next bud in the list
90 * @bud: bud description object
91 * @free: free bytes in the bud
92 * @sqnum: reference node sequence number
93 */
94struct bud_entry {
95 struct list_head list;
96 struct ubifs_bud *bud;
97 int free;
98 unsigned long long sqnum;
99};
100
101/**
102 * set_bud_lprops - set free and dirty space used by a bud.
103 * @c: UBIFS file-system description object
104 * @r: replay entry of bud
105 */
106static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
107{
108 const struct ubifs_lprops *lp;
109 int err = 0, dirty;
110
111 ubifs_get_lprops(c);
112
113 lp = ubifs_lpt_lookup_dirty(c, r->lnum);
114 if (IS_ERR(lp)) {
115 err = PTR_ERR(lp);
116 goto out;
117 }
118
119 dirty = lp->dirty;
120 if (r->offs == 0 && (lp->free != c->leb_size || lp->dirty != 0)) {
121 /*
122 * The LEB was added to the journal with a starting offset of
123 * zero which means the LEB must have been empty. The LEB
124 * property values should be lp->free == c->leb_size and
125 * lp->dirty == 0, but that is not the case. The reason is that
126 * the LEB was garbage collected. The garbage collector resets
127 * the free and dirty space without recording it anywhere except
128 * lprops, so if there is not a commit then lprops does not have
129 * that information next time the file system is mounted.
130 *
131 * We do not need to adjust free space because the scan has told
132 * us the exact value which is recorded in the replay entry as
133 * r->free.
134 *
135 * However we do need to subtract from the dirty space the
136 * amount of space that the garbage collector reclaimed, which
137 * is the whole LEB minus the amount of space that was free.
138 */
139 dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum,
140 lp->free, lp->dirty);
141 dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum,
142 lp->free, lp->dirty);
143 dirty -= c->leb_size - lp->free;
144 /*
145 * If the replay order was perfect the dirty space would now be
146 * zero. The order is not perfect because the the journal heads
147 * race with eachother. This is not a problem but is does mean
148 * that the dirty space may temporarily exceed c->leb_size
149 * during the replay.
150 */
151 if (dirty != 0)
152 dbg_msg("LEB %d lp: %d free %d dirty "
153 "replay: %d free %d dirty", r->lnum, lp->free,
154 lp->dirty, r->free, r->dirty);
155 }
156 lp = ubifs_change_lp(c, lp, r->free, dirty + r->dirty,
157 lp->flags | LPROPS_TAKEN, 0);
158 if (IS_ERR(lp)) {
159 err = PTR_ERR(lp);
160 goto out;
161 }
162out:
163 ubifs_release_lprops(c);
164 return err;
165}
166
167/**
168 * trun_remove_range - apply a replay entry for a truncation to the TNC.
169 * @c: UBIFS file-system description object
170 * @r: replay entry of truncation
171 */
172static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r)
173{
174 unsigned min_blk, max_blk;
175 union ubifs_key min_key, max_key;
176 ino_t ino;
177
178 min_blk = r->new_size / UBIFS_BLOCK_SIZE;
179 if (r->new_size & (UBIFS_BLOCK_SIZE - 1))
180 min_blk += 1;
181
182 max_blk = r->old_size / UBIFS_BLOCK_SIZE;
183 if ((r->old_size & (UBIFS_BLOCK_SIZE - 1)) == 0)
184 max_blk -= 1;
185
186 ino = key_inum(c, &r->key);
187
188 data_key_init(c, &min_key, ino, min_blk);
189 data_key_init(c, &max_key, ino, max_blk);
190
191 return ubifs_tnc_remove_range(c, &min_key, &max_key);
192}
193
194/**
195 * apply_replay_entry - apply a replay entry to the TNC.
196 * @c: UBIFS file-system description object
197 * @r: replay entry to apply
198 *
199 * Apply a replay entry to the TNC.
200 */
201static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
202{
203 int err, deletion = ((r->flags & REPLAY_DELETION) != 0);
204
205 dbg_mnt("LEB %d:%d len %d flgs %d sqnum %llu %s", r->lnum,
206 r->offs, r->len, r->flags, r->sqnum, DBGKEY(&r->key));
207
208 /* Set c->replay_sqnum to help deal with dangling branches. */
209 c->replay_sqnum = r->sqnum;
210
211 if (r->flags & REPLAY_REF)
212 err = set_bud_lprops(c, r);
213 else if (is_hash_key(c, &r->key)) {
214 if (deletion)
215 err = ubifs_tnc_remove_nm(c, &r->key, &r->nm);
216 else
217 err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs,
218 r->len, &r->nm);
219 } else {
220 if (deletion)
221 switch (key_type(c, &r->key)) {
222 case UBIFS_INO_KEY:
223 {
224 ino_t inum = key_inum(c, &r->key);
225
226 err = ubifs_tnc_remove_ino(c, inum);
227 break;
228 }
229 case UBIFS_TRUN_KEY:
230 err = trun_remove_range(c, r);
231 break;
232 default:
233 err = ubifs_tnc_remove(c, &r->key);
234 break;
235 }
236 else
237 err = ubifs_tnc_add(c, &r->key, r->lnum, r->offs,
238 r->len);
239 if (err)
240 return err;
241
242 if (c->need_recovery)
243 err = ubifs_recover_size_accum(c, &r->key, deletion,
244 r->new_size);
245 }
246
247 return err;
248}
249
250/**
251 * destroy_replay_tree - destroy the replay.
252 * @c: UBIFS file-system description object
253 *
254 * Destroy the replay tree.
255 */
256static void destroy_replay_tree(struct ubifs_info *c)
257{
258 struct rb_node *this = c->replay_tree.rb_node;
259 struct replay_entry *r;
260
261 while (this) {
262 if (this->rb_left) {
263 this = this->rb_left;
264 continue;
265 } else if (this->rb_right) {
266 this = this->rb_right;
267 continue;
268 }
269 r = rb_entry(this, struct replay_entry, rb);
270 this = rb_parent(this);
271 if (this) {
272 if (this->rb_left == &r->rb)
273 this->rb_left = NULL;
274 else
275 this->rb_right = NULL;
276 }
277 if (is_hash_key(c, &r->key))
278 kfree(r->nm.name);
279 kfree(r);
280 }
281 c->replay_tree = RB_ROOT;
282}
283
284/**
285 * apply_replay_tree - apply the replay tree to the TNC.
286 * @c: UBIFS file-system description object
287 *
288 * Apply the replay tree.
289 * Returns zero in case of success and a negative error code in case of
290 * failure.
291 */
292static int apply_replay_tree(struct ubifs_info *c)
293{
294 struct rb_node *this = rb_first(&c->replay_tree);
295
296 while (this) {
297 struct replay_entry *r;
298 int err;
299
300 cond_resched();
301
302 r = rb_entry(this, struct replay_entry, rb);
303 err = apply_replay_entry(c, r);
304 if (err)
305 return err;
306 this = rb_next(this);
307 }
308 return 0;
309}
310
311/**
312 * insert_node - insert a node to the replay tree.
313 * @c: UBIFS file-system description object
314 * @lnum: node logical eraseblock number
315 * @offs: node offset
316 * @len: node length
317 * @key: node key
318 * @sqnum: sequence number
319 * @deletion: non-zero if this is a deletion
320 * @used: number of bytes in use in a LEB
321 * @old_size: truncation old size
322 * @new_size: truncation new size
323 *
324 * This function inserts a scanned non-direntry node to the replay tree. The
325 * replay tree is an RB-tree containing @struct replay_entry elements which are
326 * indexed by the sequence number. The replay tree is applied at the very end
327 * of the replay process. Since the tree is sorted in sequence number order,
328 * the older modifications are applied first. This function returns zero in
329 * case of success and a negative error code in case of failure.
330 */
331static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
332 union ubifs_key *key, unsigned long long sqnum,
333 int deletion, int *used, loff_t old_size,
334 loff_t new_size)
335{
336 struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
337 struct replay_entry *r;
338
339 if (key_inum(c, key) >= c->highest_inum)
340 c->highest_inum = key_inum(c, key);
341
342 dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
343 while (*p) {
344 parent = *p;
345 r = rb_entry(parent, struct replay_entry, rb);
346 if (sqnum < r->sqnum) {
347 p = &(*p)->rb_left;
348 continue;
349 } else if (sqnum > r->sqnum) {
350 p = &(*p)->rb_right;
351 continue;
352 }
353 ubifs_err("duplicate sqnum in replay");
354 return -EINVAL;
355 }
356
357 r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
358 if (!r)
359 return -ENOMEM;
360
361 if (!deletion)
362 *used += ALIGN(len, 8);
363 r->lnum = lnum;
364 r->offs = offs;
365 r->len = len;
366 r->sqnum = sqnum;
367 r->flags = (deletion ? REPLAY_DELETION : 0);
368 r->old_size = old_size;
369 r->new_size = new_size;
370 key_copy(c, key, &r->key);
371
372 rb_link_node(&r->rb, parent, p);
373 rb_insert_color(&r->rb, &c->replay_tree);
374 return 0;
375}
376
377/**
378 * insert_dent - insert a directory entry node into the replay tree.
379 * @c: UBIFS file-system description object
380 * @lnum: node logical eraseblock number
381 * @offs: node offset
382 * @len: node length
383 * @key: node key
384 * @name: directory entry name
385 * @nlen: directory entry name length
386 * @sqnum: sequence number
387 * @deletion: non-zero if this is a deletion
388 * @used: number of bytes in use in a LEB
389 *
390 * This function inserts a scanned directory entry node to the replay tree.
391 * Returns zero in case of success and a negative error code in case of
392 * failure.
393 *
394 * This function is also used for extended attribute entries because they are
395 * implemented as directory entry nodes.
396 */
397static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
398 union ubifs_key *key, const char *name, int nlen,
399 unsigned long long sqnum, int deletion, int *used)
400{
401 struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
402 struct replay_entry *r;
403 char *nbuf;
404
405 if (key_inum(c, key) >= c->highest_inum)
406 c->highest_inum = key_inum(c, key);
407
408 dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
409 while (*p) {
410 parent = *p;
411 r = rb_entry(parent, struct replay_entry, rb);
412 if (sqnum < r->sqnum) {
413 p = &(*p)->rb_left;
414 continue;
415 }
416 if (sqnum > r->sqnum) {
417 p = &(*p)->rb_right;
418 continue;
419 }
420 ubifs_err("duplicate sqnum in replay");
421 return -EINVAL;
422 }
423
424 r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
425 if (!r)
426 return -ENOMEM;
427 nbuf = kmalloc(nlen + 1, GFP_KERNEL);
428 if (!nbuf) {
429 kfree(r);
430 return -ENOMEM;
431 }
432
433 if (!deletion)
434 *used += ALIGN(len, 8);
435 r->lnum = lnum;
436 r->offs = offs;
437 r->len = len;
438 r->sqnum = sqnum;
439 r->nm.len = nlen;
440 memcpy(nbuf, name, nlen);
441 nbuf[nlen] = '\0';
442 r->nm.name = nbuf;
443 r->flags = (deletion ? REPLAY_DELETION : 0);
444 key_copy(c, key, &r->key);
445
446 ubifs_assert(!*p);
447 rb_link_node(&r->rb, parent, p);
448 rb_insert_color(&r->rb, &c->replay_tree);
449 return 0;
450}
451
452/**
453 * ubifs_validate_entry - validate directory or extended attribute entry node.
454 * @c: UBIFS file-system description object
455 * @dent: the node to validate
456 *
457 * This function validates directory or extended attribute entry node @dent.
458 * Returns zero if the node is all right and a %-EINVAL if not.
459 */
460int ubifs_validate_entry(struct ubifs_info *c,
461 const struct ubifs_dent_node *dent)
462{
463 int key_type = key_type_flash(c, dent->key);
464 int nlen = le16_to_cpu(dent->nlen);
465
466 if (le32_to_cpu(dent->ch.len) != nlen + UBIFS_DENT_NODE_SZ + 1 ||
467 dent->type >= UBIFS_ITYPES_CNT ||
468 nlen > UBIFS_MAX_NLEN || dent->name[nlen] != 0 ||
469 strnlen(dent->name, nlen) != nlen ||
470 le64_to_cpu(dent->inum) > MAX_INUM) {
471 ubifs_err("bad %s node", key_type == UBIFS_DENT_KEY ?
472 "directory entry" : "extended attribute entry");
473 return -EINVAL;
474 }
475
476 if (key_type != UBIFS_DENT_KEY && key_type != UBIFS_XENT_KEY) {
477 ubifs_err("bad key type %d", key_type);
478 return -EINVAL;
479 }
480
481 return 0;
482}
483
484/**
485 * replay_bud - replay a bud logical eraseblock.
486 * @c: UBIFS file-system description object
487 * @lnum: bud logical eraseblock number to replay
488 * @offs: bud start offset
489 * @jhead: journal head to which this bud belongs
490 * @free: amount of free space in the bud is returned here
491 * @dirty: amount of dirty space from padding and deletion nodes is returned
492 * here
493 *
494 * This function returns zero in case of success and a negative error code in
495 * case of failure.
496 */
497static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
498 int *free, int *dirty)
499{
500 int err = 0, used = 0;
501 struct ubifs_scan_leb *sleb;
502 struct ubifs_scan_node *snod;
503 struct ubifs_bud *bud;
504
505 dbg_mnt("replay bud LEB %d, head %d", lnum, jhead);
506 if (c->need_recovery)
507 sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD);
508 else
509 sleb = ubifs_scan(c, lnum, offs, c->sbuf);
510 if (IS_ERR(sleb))
511 return PTR_ERR(sleb);
512
513 /*
514 * The bud does not have to start from offset zero - the beginning of
515 * the 'lnum' LEB may contain previously committed data. One of the
516 * things we have to do in replay is to correctly update lprops with
517 * newer information about this LEB.
518 *
519 * At this point lprops thinks that this LEB has 'c->leb_size - offs'
520 * bytes of free space because it only contain information about
521 * committed data.
522 *
523 * But we know that real amount of free space is 'c->leb_size -
524 * sleb->endpt', and the space in the 'lnum' LEB between 'offs' and
525 * 'sleb->endpt' is used by bud data. We have to correctly calculate
526 * how much of these data are dirty and update lprops with this
527 * information.
528 *
529 * The dirt in that LEB region is comprised of padding nodes, deletion
530 * nodes, truncation nodes and nodes which are obsoleted by subsequent
531 * nodes in this LEB. So instead of calculating clean space, we
532 * calculate used space ('used' variable).
533 */
534
535 list_for_each_entry(snod, &sleb->nodes, list) {
536 int deletion = 0;
537
538 cond_resched();
539
540 if (snod->sqnum >= SQNUM_WATERMARK) {
541 ubifs_err("file system's life ended");
542 goto out_dump;
543 }
544
545 if (snod->sqnum > c->max_sqnum)
546 c->max_sqnum = snod->sqnum;
547
548 switch (snod->type) {
549 case UBIFS_INO_NODE:
550 {
551 struct ubifs_ino_node *ino = snod->node;
552 loff_t new_size = le64_to_cpu(ino->size);
553
554 if (le32_to_cpu(ino->nlink) == 0)
555 deletion = 1;
556 err = insert_node(c, lnum, snod->offs, snod->len,
557 &snod->key, snod->sqnum, deletion,
558 &used, 0, new_size);
559 break;
560 }
561 case UBIFS_DATA_NODE:
562 {
563 struct ubifs_data_node *dn = snod->node;
564 loff_t new_size = le32_to_cpu(dn->size) +
565 key_block(c, &snod->key) *
566 UBIFS_BLOCK_SIZE;
567
568 err = insert_node(c, lnum, snod->offs, snod->len,
569 &snod->key, snod->sqnum, deletion,
570 &used, 0, new_size);
571 break;
572 }
573 case UBIFS_DENT_NODE:
574 case UBIFS_XENT_NODE:
575 {
576 struct ubifs_dent_node *dent = snod->node;
577
578 err = ubifs_validate_entry(c, dent);
579 if (err)
580 goto out_dump;
581
582 err = insert_dent(c, lnum, snod->offs, snod->len,
583 &snod->key, dent->name,
584 le16_to_cpu(dent->nlen), snod->sqnum,
585 !le64_to_cpu(dent->inum), &used);
586 break;
587 }
588 case UBIFS_TRUN_NODE:
589 {
590 struct ubifs_trun_node *trun = snod->node;
591 loff_t old_size = le64_to_cpu(trun->old_size);
592 loff_t new_size = le64_to_cpu(trun->new_size);
593 union ubifs_key key;
594
595 /* Validate truncation node */
596 if (old_size < 0 || old_size > c->max_inode_sz ||
597 new_size < 0 || new_size > c->max_inode_sz ||
598 old_size <= new_size) {
599 ubifs_err("bad truncation node");
600 goto out_dump;
601 }
602
603 /*
604 * Create a fake truncation key just to use the same
605 * functions which expect nodes to have keys.
606 */
607 trun_key_init(c, &key, le32_to_cpu(trun->inum));
608 err = insert_node(c, lnum, snod->offs, snod->len,
609 &key, snod->sqnum, 1, &used,
610 old_size, new_size);
611 break;
612 }
613 default:
614 ubifs_err("unexpected node type %d in bud LEB %d:%d",
615 snod->type, lnum, snod->offs);
616 err = -EINVAL;
617 goto out_dump;
618 }
619 if (err)
620 goto out;
621 }
622
623 bud = ubifs_search_bud(c, lnum);
624 if (!bud)
625 BUG();
626
627 ubifs_assert(sleb->endpt - offs >= used);
628 ubifs_assert(sleb->endpt % c->min_io_size == 0);
629
630 if (sleb->endpt + c->min_io_size <= c->leb_size &&
631 !(c->vfs_sb->s_flags & MS_RDONLY))
632 err = ubifs_wbuf_seek_nolock(&c->jheads[jhead].wbuf, lnum,
633 sleb->endpt, UBI_SHORTTERM);
634
635 *dirty = sleb->endpt - offs - used;
636 *free = c->leb_size - sleb->endpt;
637
638out:
639 ubifs_scan_destroy(sleb);
640 return err;
641
642out_dump:
643 ubifs_err("bad node is at LEB %d:%d", lnum, snod->offs);
644 dbg_dump_node(c, snod->node);
645 ubifs_scan_destroy(sleb);
646 return -EINVAL;
647}
648
649/**
650 * insert_ref_node - insert a reference node to the replay tree.
651 * @c: UBIFS file-system description object
652 * @lnum: node logical eraseblock number
653 * @offs: node offset
654 * @sqnum: sequence number
655 * @free: amount of free space in bud
656 * @dirty: amount of dirty space from padding and deletion nodes
657 *
658 * This function inserts a reference node to the replay tree and returns zero
659 * in case of success ort a negative error code in case of failure.
660 */
661static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
662 unsigned long long sqnum, int free, int dirty)
663{
664 struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
665 struct replay_entry *r;
666
667 dbg_mnt("add ref LEB %d:%d", lnum, offs);
668 while (*p) {
669 parent = *p;
670 r = rb_entry(parent, struct replay_entry, rb);
671 if (sqnum < r->sqnum) {
672 p = &(*p)->rb_left;
673 continue;
674 } else if (sqnum > r->sqnum) {
675 p = &(*p)->rb_right;
676 continue;
677 }
678 ubifs_err("duplicate sqnum in replay tree");
679 return -EINVAL;
680 }
681
682 r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
683 if (!r)
684 return -ENOMEM;
685
686 r->lnum = lnum;
687 r->offs = offs;
688 r->sqnum = sqnum;
689 r->flags = REPLAY_REF;
690 r->free = free;
691 r->dirty = dirty;
692
693 rb_link_node(&r->rb, parent, p);
694 rb_insert_color(&r->rb, &c->replay_tree);
695 return 0;
696}
697
698/**
699 * replay_buds - replay all buds.
700 * @c: UBIFS file-system description object
701 *
702 * This function returns zero in case of success and a negative error code in
703 * case of failure.
704 */
705static int replay_buds(struct ubifs_info *c)
706{
707 struct bud_entry *b;
708 int err, uninitialized_var(free), uninitialized_var(dirty);
709
710 list_for_each_entry(b, &c->replay_buds, list) {
711 err = replay_bud(c, b->bud->lnum, b->bud->start, b->bud->jhead,
712 &free, &dirty);
713 if (err)
714 return err;
715 err = insert_ref_node(c, b->bud->lnum, b->bud->start, b->sqnum,
716 free, dirty);
717 if (err)
718 return err;
719 }
720
721 return 0;
722}
723
724/**
725 * destroy_bud_list - destroy the list of buds to replay.
726 * @c: UBIFS file-system description object
727 */
728static void destroy_bud_list(struct ubifs_info *c)
729{
730 struct bud_entry *b;
731
732 while (!list_empty(&c->replay_buds)) {
733 b = list_entry(c->replay_buds.next, struct bud_entry, list);
734 list_del(&b->list);
735 kfree(b);
736 }
737}
738
739/**
740 * add_replay_bud - add a bud to the list of buds to replay.
741 * @c: UBIFS file-system description object
742 * @lnum: bud logical eraseblock number to replay
743 * @offs: bud start offset
744 * @jhead: journal head to which this bud belongs
745 * @sqnum: reference node sequence number
746 *
747 * This function returns zero in case of success and a negative error code in
748 * case of failure.
749 */
750static int add_replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
751 unsigned long long sqnum)
752{
753 struct ubifs_bud *bud;
754 struct bud_entry *b;
755
756 dbg_mnt("add replay bud LEB %d:%d, head %d", lnum, offs, jhead);
757
758 bud = kmalloc(sizeof(struct ubifs_bud), GFP_KERNEL);
759 if (!bud)
760 return -ENOMEM;
761
762 b = kmalloc(sizeof(struct bud_entry), GFP_KERNEL);
763 if (!b) {
764 kfree(bud);
765 return -ENOMEM;
766 }
767
768 bud->lnum = lnum;
769 bud->start = offs;
770 bud->jhead = jhead;
771 ubifs_add_bud(c, bud);
772
773 b->bud = bud;
774 b->sqnum = sqnum;
775 list_add_tail(&b->list, &c->replay_buds);
776
777 return 0;
778}
779
780/**
781 * validate_ref - validate a reference node.
782 * @c: UBIFS file-system description object
783 * @ref: the reference node to validate
784 * @ref_lnum: LEB number of the reference node
785 * @ref_offs: reference node offset
786 *
787 * This function returns %1 if a bud reference already exists for the LEB. %0 is
788 * returned if the reference node is new, otherwise %-EINVAL is returned if
789 * validation failed.
790 */
791static int validate_ref(struct ubifs_info *c, const struct ubifs_ref_node *ref)
792{
793 struct ubifs_bud *bud;
794 int lnum = le32_to_cpu(ref->lnum);
795 unsigned int offs = le32_to_cpu(ref->offs);
796 unsigned int jhead = le32_to_cpu(ref->jhead);
797
798 /*
799 * ref->offs may point to the end of LEB when the journal head points
800 * to the end of LEB and we write reference node for it during commit.
801 * So this is why we require 'offs > c->leb_size'.
802 */
803 if (jhead >= c->jhead_cnt || lnum >= c->leb_cnt ||
804 lnum < c->main_first || offs > c->leb_size ||
805 offs & (c->min_io_size - 1))
806 return -EINVAL;
807
808 /* Make sure we have not already looked at this bud */
809 bud = ubifs_search_bud(c, lnum);
810 if (bud) {
811 if (bud->jhead == jhead && bud->start <= offs)
812 return 1;
813 ubifs_err("bud at LEB %d:%d was already referred", lnum, offs);
814 return -EINVAL;
815 }
816
817 return 0;
818}
819
820/**
821 * replay_log_leb - replay a log logical eraseblock.
822 * @c: UBIFS file-system description object
823 * @lnum: log logical eraseblock to replay
824 * @offs: offset to start replaying from
825 * @sbuf: scan buffer
826 *
827 * This function replays a log LEB and returns zero in case of success, %1 if
828 * this is the last LEB in the log, and a negative error code in case of
829 * failure.
830 */
831static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
832{
833 int err;
834 struct ubifs_scan_leb *sleb;
835 struct ubifs_scan_node *snod;
836 const struct ubifs_cs_node *node;
837
838 dbg_mnt("replay log LEB %d:%d", lnum, offs);
839 sleb = ubifs_scan(c, lnum, offs, sbuf);
840 if (IS_ERR(sleb)) {
841 if (c->need_recovery)
842 sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
843 if (IS_ERR(sleb))
844 return PTR_ERR(sleb);
845 }
846
847 if (sleb->nodes_cnt == 0) {
848 err = 1;
849 goto out;
850 }
851
852 node = sleb->buf;
853
854 snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);
855 if (c->cs_sqnum == 0) {
856 /*
857 * This is the first log LEB we are looking at, make sure that
858 * the first node is a commit start node. Also record its
859 * sequence number so that UBIFS can determine where the log
860 * ends, because all nodes which were have higher sequence
861 * numbers.
862 */
863 if (snod->type != UBIFS_CS_NODE) {
864 dbg_err("first log node at LEB %d:%d is not CS node",
865 lnum, offs);
866 goto out_dump;
867 }
868 if (le64_to_cpu(node->cmt_no) != c->cmt_no) {
869 dbg_err("first CS node at LEB %d:%d has wrong "
870 "commit number %llu expected %llu",
871 lnum, offs,
872 (unsigned long long)le64_to_cpu(node->cmt_no),
873 c->cmt_no);
874 goto out_dump;
875 }
876
877 c->cs_sqnum = le64_to_cpu(node->ch.sqnum);
878 dbg_mnt("commit start sqnum %llu", c->cs_sqnum);
879 }
880
881 if (snod->sqnum < c->cs_sqnum) {
882 /*
883 * This means that we reached end of log and now
884 * look to the older log data, which was already
885 * committed but the eraseblock was not erased (UBIFS
886 * only unmaps it). So this basically means we have to
887 * exit with "end of log" code.
888 */
889 err = 1;
890 goto out;
891 }
892
893 /* Make sure the first node sits at offset zero of the LEB */
894 if (snod->offs != 0) {
895 dbg_err("first node is not at zero offset");
896 goto out_dump;
897 }
898
899 list_for_each_entry(snod, &sleb->nodes, list) {
900
901 cond_resched();
902
903 if (snod->sqnum >= SQNUM_WATERMARK) {
904 ubifs_err("file system's life ended");
905 goto out_dump;
906 }
907
908 if (snod->sqnum < c->cs_sqnum) {
909 dbg_err("bad sqnum %llu, commit sqnum %llu",
910 snod->sqnum, c->cs_sqnum);
911 goto out_dump;
912 }
913
914 if (snod->sqnum > c->max_sqnum)
915 c->max_sqnum = snod->sqnum;
916
917 switch (snod->type) {
918 case UBIFS_REF_NODE: {
919 const struct ubifs_ref_node *ref = snod->node;
920
921 err = validate_ref(c, ref);
922 if (err == 1)
923 break; /* Already have this bud */
924 if (err)
925 goto out_dump;
926
927 err = add_replay_bud(c, le32_to_cpu(ref->lnum),
928 le32_to_cpu(ref->offs),
929 le32_to_cpu(ref->jhead),
930 snod->sqnum);
931 if (err)
932 goto out;
933
934 break;
935 }
936 case UBIFS_CS_NODE:
937 /* Make sure it sits at the beginning of LEB */
938 if (snod->offs != 0) {
939 ubifs_err("unexpected node in log");
940 goto out_dump;
941 }
942 break;
943 default:
944 ubifs_err("unexpected node in log");
945 goto out_dump;
946 }
947 }
948
949 if (sleb->endpt || c->lhead_offs >= c->leb_size) {
950 c->lhead_lnum = lnum;
951 c->lhead_offs = sleb->endpt;
952 }
953
954 err = !sleb->endpt;
955out:
956 ubifs_scan_destroy(sleb);
957 return err;
958
959out_dump:
960 ubifs_err("log error detected while replying the log at LEB %d:%d",
961 lnum, offs + snod->offs);
962 dbg_dump_node(c, snod->node);
963 ubifs_scan_destroy(sleb);
964 return -EINVAL;
965}
966
967/**
968 * take_ihead - update the status of the index head in lprops to 'taken'.
969 * @c: UBIFS file-system description object
970 *
971 * This function returns the amount of free space in the index head LEB or a
972 * negative error code.
973 */
974static int take_ihead(struct ubifs_info *c)
975{
976 const struct ubifs_lprops *lp;
977 int err, free;
978
979 ubifs_get_lprops(c);
980
981 lp = ubifs_lpt_lookup_dirty(c, c->ihead_lnum);
982 if (IS_ERR(lp)) {
983 err = PTR_ERR(lp);
984 goto out;
985 }
986
987 free = lp->free;
988
989 lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC,
990 lp->flags | LPROPS_TAKEN, 0);
991 if (IS_ERR(lp)) {
992 err = PTR_ERR(lp);
993 goto out;
994 }
995
996 err = free;
997out:
998 ubifs_release_lprops(c);
999 return err;
1000}
1001
1002/**
1003 * ubifs_replay_journal - replay journal.
1004 * @c: UBIFS file-system description object
1005 *
1006 * This function scans the journal, replays and cleans it up. It makes sure all
1007 * memory data structures related to uncommitted journal are built (dirty TNC
1008 * tree, tree of buds, modified lprops, etc).
1009 */
1010int ubifs_replay_journal(struct ubifs_info *c)
1011{
1012 int err, i, lnum, offs, free;
1013 void *sbuf = NULL;
1014
1015 BUILD_BUG_ON(UBIFS_TRUN_KEY > 5);
1016
1017 /* Update the status of the index head in lprops to 'taken' */
1018 free = take_ihead(c);
1019 if (free < 0)
1020 return free; /* Error code */
1021
1022 if (c->ihead_offs != c->leb_size - free) {
1023 ubifs_err("bad index head LEB %d:%d", c->ihead_lnum,
1024 c->ihead_offs);
1025 return -EINVAL;
1026 }
1027
1028 sbuf = vmalloc(c->leb_size);
1029 if (!sbuf)
1030 return -ENOMEM;
1031
1032 dbg_mnt("start replaying the journal");
1033
1034 c->replaying = 1;
1035
1036 lnum = c->ltail_lnum = c->lhead_lnum;
1037 offs = c->lhead_offs;
1038
1039 for (i = 0; i < c->log_lebs; i++, lnum++) {
1040 if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) {
1041 /*
1042 * The log is logically circular, we reached the last
1043 * LEB, switch to the first one.
1044 */
1045 lnum = UBIFS_LOG_LNUM;
1046 offs = 0;
1047 }
1048 err = replay_log_leb(c, lnum, offs, sbuf);
1049 if (err == 1)
1050 /* We hit the end of the log */
1051 break;
1052 if (err)
1053 goto out;
1054 offs = 0;
1055 }
1056
1057 err = replay_buds(c);
1058 if (err)
1059 goto out;
1060
1061 err = apply_replay_tree(c);
1062 if (err)
1063 goto out;
1064
1065 ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery);
1066 dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, "
1067 "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum,
1068 c->highest_inum);
1069out:
1070 destroy_replay_tree(c);
1071 destroy_bud_list(c);
1072 vfree(sbuf);
1073 c->replaying = 0;
1074 return err;
1075}
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
new file mode 100644
index 000000000000..2bf753b38889
--- /dev/null
+++ b/fs/ubifs/sb.c
@@ -0,0 +1,629 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/*
24 * This file implements UBIFS superblock. The superblock is stored at the first
25 * LEB of the volume and is never changed by UBIFS. Only user-space tools may
26 * change it. The superblock node mostly contains geometry information.
27 */
28
29#include "ubifs.h"
30#include <linux/random.h>
31
/*
 * Size of the default journal in logical eraseblocks, expressed as a
 * percentage of the total flash size.
 */
#define DEFAULT_JNL_PERCENT 5

/* Upper limit for the size of the default journal, in bytes */
#define DEFAULT_MAX_JNL (32 * 1024 * 1024)

/* Fanout of the indexing tree used by default */
#define DEFAULT_FANOUT 8

/* Number of data journal heads used by default */
#define DEFAULT_JHEADS_CNT 1

/* Where the different LEBs are placed in the main area by default */
#define DEFAULT_IDX_LEB 0
#define DEFAULT_DATA_LEB 1
#define DEFAULT_GC_LEB 2

/* Number of LEB numbers kept in LPT's save table by default */
#define DEFAULT_LSAVE_CNT 256

/* Size of the default reserved pool as a percentage of maximum free space */
#define DEFAULT_RP_PERCENT 5

/* Upper limit for the size of the default reserved pool, in bytes */
#define DEFAULT_MAX_RP_SIZE (5 * 1024 * 1024)

/* Time granularity used by default, in nanoseconds */
#define DEFAULT_TIME_GRAN 1000000000
63
64/**
65 * create_default_filesystem - format empty UBI volume.
66 * @c: UBIFS file-system description object
67 *
68 * This function creates default empty file-system. Returns zero in case of
69 * success and a negative error code in case of failure.
70 */
71static int create_default_filesystem(struct ubifs_info *c)
72{
73 struct ubifs_sb_node *sup;
74 struct ubifs_mst_node *mst;
75 struct ubifs_idx_node *idx;
76 struct ubifs_branch *br;
77 struct ubifs_ino_node *ino;
78 struct ubifs_cs_node *cs;
79 union ubifs_key key;
80 int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first;
81 int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0;
82 int min_leb_cnt = UBIFS_MIN_LEB_CNT;
83 uint64_t tmp64, main_bytes;
84
85 /* Some functions called from here depend on the @c->key_len filed */
86 c->key_len = UBIFS_SK_LEN;
87
88 /*
89 * First of all, we have to calculate default file-system geometry -
90 * log size, journal size, etc.
91 */
92 if (c->leb_cnt < 0x7FFFFFFF / DEFAULT_JNL_PERCENT)
93 /* We can first multiply then divide and have no overflow */
94 jnl_lebs = c->leb_cnt * DEFAULT_JNL_PERCENT / 100;
95 else
96 jnl_lebs = (c->leb_cnt / 100) * DEFAULT_JNL_PERCENT;
97
98 if (jnl_lebs < UBIFS_MIN_JNL_LEBS)
99 jnl_lebs = UBIFS_MIN_JNL_LEBS;
100 if (jnl_lebs * c->leb_size > DEFAULT_MAX_JNL)
101 jnl_lebs = DEFAULT_MAX_JNL / c->leb_size;
102
103 /*
104 * The log should be large enough to fit reference nodes for all bud
105 * LEBs. Because buds do not have to start from the beginning of LEBs
106 * (half of the LEB may contain committed data), the log should
107 * generally be larger, make it twice as large.
108 */
109 tmp = 2 * (c->ref_node_alsz * jnl_lebs) + c->leb_size - 1;
110 log_lebs = tmp / c->leb_size;
111 /* Plus one LEB reserved for commit */
112 log_lebs += 1;
113 if (c->leb_cnt - min_leb_cnt > 8) {
114 /* And some extra space to allow writes while committing */
115 log_lebs += 1;
116 min_leb_cnt += 1;
117 }
118
119 max_buds = jnl_lebs - log_lebs;
120 if (max_buds < UBIFS_MIN_BUD_LEBS)
121 max_buds = UBIFS_MIN_BUD_LEBS;
122
123 /*
124 * Orphan nodes are stored in a separate area. One node can store a lot
125 * of orphan inode numbers, but when new orphan comes we just add a new
126 * orphan node. At some point the nodes are consolidated into one
127 * orphan node.
128 */
129 orph_lebs = UBIFS_MIN_ORPH_LEBS;
130#ifdef CONFIG_UBIFS_FS_DEBUG
131 if (c->leb_cnt - min_leb_cnt > 1)
132 /*
133 * For debugging purposes it is better to have at least 2
134 * orphan LEBs, because the orphan subsystem would need to do
135 * consolidations and would be stressed more.
136 */
137 orph_lebs += 1;
138#endif
139
140 main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS - log_lebs;
141 main_lebs -= orph_lebs;
142
143 lpt_first = UBIFS_LOG_LNUM + log_lebs;
144 c->lsave_cnt = DEFAULT_LSAVE_CNT;
145 c->max_leb_cnt = c->leb_cnt;
146 err = ubifs_create_dflt_lpt(c, &main_lebs, lpt_first, &lpt_lebs,
147 &big_lpt);
148 if (err)
149 return err;
150
151 dbg_gen("LEB Properties Tree created (LEBs %d-%d)", lpt_first,
152 lpt_first + lpt_lebs - 1);
153
154 main_first = c->leb_cnt - main_lebs;
155
156 /* Create default superblock */
157 tmp = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size);
158 sup = kzalloc(tmp, GFP_KERNEL);
159 if (!sup)
160 return -ENOMEM;
161
162 tmp64 = (uint64_t)max_buds * c->leb_size;
163 if (big_lpt)
164 sup_flags |= UBIFS_FLG_BIGLPT;
165
166 sup->ch.node_type = UBIFS_SB_NODE;
167 sup->key_hash = UBIFS_KEY_HASH_R5;
168 sup->flags = cpu_to_le32(sup_flags);
169 sup->min_io_size = cpu_to_le32(c->min_io_size);
170 sup->leb_size = cpu_to_le32(c->leb_size);
171 sup->leb_cnt = cpu_to_le32(c->leb_cnt);
172 sup->max_leb_cnt = cpu_to_le32(c->max_leb_cnt);
173 sup->max_bud_bytes = cpu_to_le64(tmp64);
174 sup->log_lebs = cpu_to_le32(log_lebs);
175 sup->lpt_lebs = cpu_to_le32(lpt_lebs);
176 sup->orph_lebs = cpu_to_le32(orph_lebs);
177 sup->jhead_cnt = cpu_to_le32(DEFAULT_JHEADS_CNT);
178 sup->fanout = cpu_to_le32(DEFAULT_FANOUT);
179 sup->lsave_cnt = cpu_to_le32(c->lsave_cnt);
180 sup->fmt_version = cpu_to_le32(UBIFS_FORMAT_VERSION);
181 sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO);
182 sup->time_gran = cpu_to_le32(DEFAULT_TIME_GRAN);
183
184 generate_random_uuid(sup->uuid);
185
186 main_bytes = (uint64_t)main_lebs * c->leb_size;
187 tmp64 = main_bytes * DEFAULT_RP_PERCENT;
188 do_div(tmp64, 100);
189 if (tmp64 > DEFAULT_MAX_RP_SIZE)
190 tmp64 = DEFAULT_MAX_RP_SIZE;
191 sup->rp_size = cpu_to_le64(tmp64);
192
193 err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM);
194 kfree(sup);
195 if (err)
196 return err;
197
198 dbg_gen("default superblock created at LEB 0:0");
199
200 /* Create default master node */
201 mst = kzalloc(c->mst_node_alsz, GFP_KERNEL);
202 if (!mst)
203 return -ENOMEM;
204
205 mst->ch.node_type = UBIFS_MST_NODE;
206 mst->log_lnum = cpu_to_le32(UBIFS_LOG_LNUM);
207 mst->highest_inum = cpu_to_le64(UBIFS_FIRST_INO);
208 mst->cmt_no = 0;
209 mst->root_lnum = cpu_to_le32(main_first + DEFAULT_IDX_LEB);
210 mst->root_offs = 0;
211 tmp = ubifs_idx_node_sz(c, 1);
212 mst->root_len = cpu_to_le32(tmp);
213 mst->gc_lnum = cpu_to_le32(main_first + DEFAULT_GC_LEB);
214 mst->ihead_lnum = cpu_to_le32(main_first + DEFAULT_IDX_LEB);
215 mst->ihead_offs = cpu_to_le32(ALIGN(tmp, c->min_io_size));
216 mst->index_size = cpu_to_le64(ALIGN(tmp, 8));
217 mst->lpt_lnum = cpu_to_le32(c->lpt_lnum);
218 mst->lpt_offs = cpu_to_le32(c->lpt_offs);
219 mst->nhead_lnum = cpu_to_le32(c->nhead_lnum);
220 mst->nhead_offs = cpu_to_le32(c->nhead_offs);
221 mst->ltab_lnum = cpu_to_le32(c->ltab_lnum);
222 mst->ltab_offs = cpu_to_le32(c->ltab_offs);
223 mst->lsave_lnum = cpu_to_le32(c->lsave_lnum);
224 mst->lsave_offs = cpu_to_le32(c->lsave_offs);
225 mst->lscan_lnum = cpu_to_le32(main_first);
226 mst->empty_lebs = cpu_to_le32(main_lebs - 2);
227 mst->idx_lebs = cpu_to_le32(1);
228 mst->leb_cnt = cpu_to_le32(c->leb_cnt);
229
230 /* Calculate lprops statistics */
231 tmp64 = main_bytes;
232 tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size);
233 tmp64 -= ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size);
234 mst->total_free = cpu_to_le64(tmp64);
235
236 tmp64 = ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size);
237 ino_waste = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size) -
238 UBIFS_INO_NODE_SZ;
239 tmp64 += ino_waste;
240 tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), 8);
241 mst->total_dirty = cpu_to_le64(tmp64);
242
243 /* The indexing LEB does not contribute to dark space */
244 tmp64 = (c->main_lebs - 1) * c->dark_wm;
245 mst->total_dark = cpu_to_le64(tmp64);
246
247 mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ);
248
249 err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0,
250 UBI_UNKNOWN);
251 if (err) {
252 kfree(mst);
253 return err;
254 }
255 err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1, 0,
256 UBI_UNKNOWN);
257 kfree(mst);
258 if (err)
259 return err;
260
261 dbg_gen("default master node created at LEB %d:0", UBIFS_MST_LNUM);
262
263 /* Create the root indexing node */
264 tmp = ubifs_idx_node_sz(c, 1);
265 idx = kzalloc(ALIGN(tmp, c->min_io_size), GFP_KERNEL);
266 if (!idx)
267 return -ENOMEM;
268
269 c->key_fmt = UBIFS_SIMPLE_KEY_FMT;
270 c->key_hash = key_r5_hash;
271
272 idx->ch.node_type = UBIFS_IDX_NODE;
273 idx->child_cnt = cpu_to_le16(1);
274 ino_key_init(c, &key, UBIFS_ROOT_INO);
275 br = ubifs_idx_branch(c, idx, 0);
276 key_write_idx(c, &key, &br->key);
277 br->lnum = cpu_to_le32(main_first + DEFAULT_DATA_LEB);
278 br->len = cpu_to_le32(UBIFS_INO_NODE_SZ);
279 err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0,
280 UBI_UNKNOWN);
281 kfree(idx);
282 if (err)
283 return err;
284
285 dbg_gen("default root indexing node created LEB %d:0",
286 main_first + DEFAULT_IDX_LEB);
287
288 /* Create default root inode */
289 tmp = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size);
290 ino = kzalloc(tmp, GFP_KERNEL);
291 if (!ino)
292 return -ENOMEM;
293
294 ino_key_init_flash(c, &ino->key, UBIFS_ROOT_INO);
295 ino->ch.node_type = UBIFS_INO_NODE;
296 ino->creat_sqnum = cpu_to_le64(++c->max_sqnum);
297 ino->nlink = cpu_to_le32(2);
298 tmp = cpu_to_le64(CURRENT_TIME_SEC.tv_sec);
299 ino->atime_sec = tmp;
300 ino->ctime_sec = tmp;
301 ino->mtime_sec = tmp;
302 ino->atime_nsec = 0;
303 ino->ctime_nsec = 0;
304 ino->mtime_nsec = 0;
305 ino->mode = cpu_to_le32(S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO);
306 ino->size = cpu_to_le64(UBIFS_INO_NODE_SZ);
307
308 /* Set compression enabled by default */
309 ino->flags = cpu_to_le32(UBIFS_COMPR_FL);
310
311 err = ubifs_write_node(c, ino, UBIFS_INO_NODE_SZ,
312 main_first + DEFAULT_DATA_LEB, 0,
313 UBI_UNKNOWN);
314 kfree(ino);
315 if (err)
316 return err;
317
318 dbg_gen("root inode created at LEB %d:0",
319 main_first + DEFAULT_DATA_LEB);
320
321 /*
322 * The first node in the log has to be the commit start node. This is
323 * always the case during normal file-system operation. Write a fake
324 * commit start node to the log.
325 */
326 tmp = ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size);
327 cs = kzalloc(tmp, GFP_KERNEL);
328 if (!cs)
329 return -ENOMEM;
330
331 cs->ch.node_type = UBIFS_CS_NODE;
332 err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM,
333 0, UBI_UNKNOWN);
334 kfree(cs);
335
336 ubifs_msg("default file-system created");
337 return 0;
338}
339
340/**
341 * validate_sb - validate superblock node.
342 * @c: UBIFS file-system description object
343 * @sup: superblock node
344 *
345 * This function validates superblock node @sup. Since most of data was read
346 * from the superblock and stored in @c, the function validates fields in @c
347 * instead. Returns zero in case of success and %-EINVAL in case of validation
348 * failure.
349 */
350static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup)
351{
352 long long max_bytes;
353 int err = 1, min_leb_cnt;
354
355 if (!c->key_hash) {
356 err = 2;
357 goto failed;
358 }
359
360 if (sup->key_fmt != UBIFS_SIMPLE_KEY_FMT) {
361 err = 3;
362 goto failed;
363 }
364
365 if (le32_to_cpu(sup->min_io_size) != c->min_io_size) {
366 ubifs_err("min. I/O unit mismatch: %d in superblock, %d real",
367 le32_to_cpu(sup->min_io_size), c->min_io_size);
368 goto failed;
369 }
370
371 if (le32_to_cpu(sup->leb_size) != c->leb_size) {
372 ubifs_err("LEB size mismatch: %d in superblock, %d real",
373 le32_to_cpu(sup->leb_size), c->leb_size);
374 goto failed;
375 }
376
377 if (c->log_lebs < UBIFS_MIN_LOG_LEBS ||
378 c->lpt_lebs < UBIFS_MIN_LPT_LEBS ||
379 c->orph_lebs < UBIFS_MIN_ORPH_LEBS ||
380 c->main_lebs < UBIFS_MIN_MAIN_LEBS) {
381 err = 4;
382 goto failed;
383 }
384
385 /*
386 * Calculate minimum allowed amount of main area LEBs. This is very
387 * similar to %UBIFS_MIN_LEB_CNT, but we take into account real what we
388 * have just read from the superblock.
389 */
390 min_leb_cnt = UBIFS_SB_LEBS + UBIFS_MST_LEBS + c->log_lebs;
391 min_leb_cnt += c->lpt_lebs + c->orph_lebs + c->jhead_cnt + 6;
392
393 if (c->leb_cnt < min_leb_cnt || c->leb_cnt > c->vi.size) {
394 ubifs_err("bad LEB count: %d in superblock, %d on UBI volume, "
395 "%d minimum required", c->leb_cnt, c->vi.size,
396 min_leb_cnt);
397 goto failed;
398 }
399
400 if (c->max_leb_cnt < c->leb_cnt) {
401 ubifs_err("max. LEB count %d less than LEB count %d",
402 c->max_leb_cnt, c->leb_cnt);
403 goto failed;
404 }
405
406 if (c->main_lebs < UBIFS_MIN_MAIN_LEBS) {
407 err = 7;
408 goto failed;
409 }
410
411 if (c->max_bud_bytes < (long long)c->leb_size * UBIFS_MIN_BUD_LEBS ||
412 c->max_bud_bytes > (long long)c->leb_size * c->main_lebs) {
413 err = 8;
414 goto failed;
415 }
416
417 if (c->jhead_cnt < NONDATA_JHEADS_CNT + 1 ||
418 c->jhead_cnt > NONDATA_JHEADS_CNT + UBIFS_MAX_JHEADS) {
419 err = 9;
420 goto failed;
421 }
422
423 if (c->fanout < UBIFS_MIN_FANOUT ||
424 ubifs_idx_node_sz(c, c->fanout) > c->leb_size) {
425 err = 10;
426 goto failed;
427 }
428
429 if (c->lsave_cnt < 0 || (c->lsave_cnt > DEFAULT_LSAVE_CNT &&
430 c->lsave_cnt > c->max_leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS -
431 c->log_lebs - c->lpt_lebs - c->orph_lebs)) {
432 err = 11;
433 goto failed;
434 }
435
436 if (UBIFS_SB_LEBS + UBIFS_MST_LEBS + c->log_lebs + c->lpt_lebs +
437 c->orph_lebs + c->main_lebs != c->leb_cnt) {
438 err = 12;
439 goto failed;
440 }
441
442 if (c->default_compr < 0 || c->default_compr >= UBIFS_COMPR_TYPES_CNT) {
443 err = 13;
444 goto failed;
445 }
446
447 max_bytes = c->main_lebs * (long long)c->leb_size;
448 if (c->rp_size < 0 || max_bytes < c->rp_size) {
449 err = 14;
450 goto failed;
451 }
452
453 if (le32_to_cpu(sup->time_gran) > 1000000000 ||
454 le32_to_cpu(sup->time_gran) < 1) {
455 err = 15;
456 goto failed;
457 }
458
459 return 0;
460
461failed:
462 ubifs_err("bad superblock, error %d", err);
463 dbg_dump_node(c, sup);
464 return -EINVAL;
465}
466
467/**
468 * ubifs_read_sb_node - read superblock node.
469 * @c: UBIFS file-system description object
470 *
471 * This function returns a pointer to the superblock node or a negative error
472 * code.
473 */
474struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c)
475{
476 struct ubifs_sb_node *sup;
477 int err;
478
479 sup = kmalloc(ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size), GFP_NOFS);
480 if (!sup)
481 return ERR_PTR(-ENOMEM);
482
483 err = ubifs_read_node(c, sup, UBIFS_SB_NODE, UBIFS_SB_NODE_SZ,
484 UBIFS_SB_LNUM, 0);
485 if (err) {
486 kfree(sup);
487 return ERR_PTR(err);
488 }
489
490 return sup;
491}
492
493/**
494 * ubifs_write_sb_node - write superblock node.
495 * @c: UBIFS file-system description object
496 * @sup: superblock node read with 'ubifs_read_sb_node()'
497 *
498 * This function returns %0 on success and a negative error code on failure.
499 */
500int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup)
501{
502 int len = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size);
503
504 ubifs_prepare_node(c, sup, UBIFS_SB_NODE_SZ, 1);
505 return ubifs_leb_change(c, UBIFS_SB_LNUM, sup, len, UBI_LONGTERM);
506}
507
508/**
509 * ubifs_read_superblock - read superblock.
510 * @c: UBIFS file-system description object
511 *
512 * This function finds, reads and checks the superblock. If an empty UBI volume
513 * is being mounted, this function creates default superblock. Returns zero in
514 * case of success, and a negative error code in case of failure.
515 */
516int ubifs_read_superblock(struct ubifs_info *c)
517{
518 int err, sup_flags;
519 struct ubifs_sb_node *sup;
520
521 if (c->empty) {
522 err = create_default_filesystem(c);
523 if (err)
524 return err;
525 }
526
527 sup = ubifs_read_sb_node(c);
528 if (IS_ERR(sup))
529 return PTR_ERR(sup);
530
531 /*
532 * The software supports all previous versions but not future versions,
533 * due to the unavailability of time-travelling equipment.
534 */
535 c->fmt_version = le32_to_cpu(sup->fmt_version);
536 if (c->fmt_version > UBIFS_FORMAT_VERSION) {
537 ubifs_err("on-flash format version is %d, but software only "
538 "supports up to version %d", c->fmt_version,
539 UBIFS_FORMAT_VERSION);
540 err = -EINVAL;
541 goto out;
542 }
543
544 if (c->fmt_version < 3) {
545 ubifs_err("on-flash format version %d is not supported",
546 c->fmt_version);
547 err = -EINVAL;
548 goto out;
549 }
550
551 switch (sup->key_hash) {
552 case UBIFS_KEY_HASH_R5:
553 c->key_hash = key_r5_hash;
554 c->key_hash_type = UBIFS_KEY_HASH_R5;
555 break;
556
557 case UBIFS_KEY_HASH_TEST:
558 c->key_hash = key_test_hash;
559 c->key_hash_type = UBIFS_KEY_HASH_TEST;
560 break;
561 };
562
563 c->key_fmt = sup->key_fmt;
564
565 switch (c->key_fmt) {
566 case UBIFS_SIMPLE_KEY_FMT:
567 c->key_len = UBIFS_SK_LEN;
568 break;
569 default:
570 ubifs_err("unsupported key format");
571 err = -EINVAL;
572 goto out;
573 }
574
575 c->leb_cnt = le32_to_cpu(sup->leb_cnt);
576 c->max_leb_cnt = le32_to_cpu(sup->max_leb_cnt);
577 c->max_bud_bytes = le64_to_cpu(sup->max_bud_bytes);
578 c->log_lebs = le32_to_cpu(sup->log_lebs);
579 c->lpt_lebs = le32_to_cpu(sup->lpt_lebs);
580 c->orph_lebs = le32_to_cpu(sup->orph_lebs);
581 c->jhead_cnt = le32_to_cpu(sup->jhead_cnt) + NONDATA_JHEADS_CNT;
582 c->fanout = le32_to_cpu(sup->fanout);
583 c->lsave_cnt = le32_to_cpu(sup->lsave_cnt);
584 c->default_compr = le16_to_cpu(sup->default_compr);
585 c->rp_size = le64_to_cpu(sup->rp_size);
586 c->rp_uid = le32_to_cpu(sup->rp_uid);
587 c->rp_gid = le32_to_cpu(sup->rp_gid);
588 sup_flags = le32_to_cpu(sup->flags);
589
590 c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran);
591
592 memcpy(&c->uuid, &sup->uuid, 16);
593
594 c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT);
595
596 /* Automatically increase file system size to the maximum size */
597 c->old_leb_cnt = c->leb_cnt;
598 if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) {
599 c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size);
600 if (c->vfs_sb->s_flags & MS_RDONLY)
601 dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs",
602 c->old_leb_cnt, c->leb_cnt);
603 else {
604 dbg_mnt("Auto resizing (sb) from %d LEBs to %d LEBs",
605 c->old_leb_cnt, c->leb_cnt);
606 sup->leb_cnt = cpu_to_le32(c->leb_cnt);
607 err = ubifs_write_sb_node(c, sup);
608 if (err)
609 goto out;
610 c->old_leb_cnt = c->leb_cnt;
611 }
612 }
613
614 c->log_bytes = (long long)c->log_lebs * c->leb_size;
615 c->log_last = UBIFS_LOG_LNUM + c->log_lebs - 1;
616 c->lpt_first = UBIFS_LOG_LNUM + c->log_lebs;
617 c->lpt_last = c->lpt_first + c->lpt_lebs - 1;
618 c->orph_first = c->lpt_last + 1;
619 c->orph_last = c->orph_first + c->orph_lebs - 1;
620 c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS;
621 c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs;
622 c->main_first = c->leb_cnt - c->main_lebs;
623 c->report_rp_size = ubifs_reported_space(c, c->rp_size);
624
625 err = validate_sb(c, sup);
626out:
627 kfree(sup);
628 return err;
629}
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
new file mode 100644
index 000000000000..acf5c5fffc60
--- /dev/null
+++ b/fs/ubifs/scan.c
@@ -0,0 +1,362 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Adrian Hunter
20 * Artem Bityutskiy (Битюцкий Артём)
21 */
22
23/*
24 * This file implements the scan which is a general-purpose function for
25 * determining what nodes are in an eraseblock. The scan is used to replay the
26 * journal, to do garbage collection, for the TNC in-the-gaps method, and by
27 * debugging functions.
28 */
29
30#include "ubifs.h"
31
32/**
33 * scan_padding_bytes - scan for padding bytes.
34 * @buf: buffer to scan
35 * @len: length of buffer
36 *
37 * This function returns the number of padding bytes on success and
38 * %SCANNED_GARBAGE on failure.
39 */
40static int scan_padding_bytes(void *buf, int len)
41{
42 int pad_len = 0, max_pad_len = min_t(int, UBIFS_PAD_NODE_SZ, len);
43 uint8_t *p = buf;
44
45 dbg_scan("not a node");
46
47 while (pad_len < max_pad_len && *p++ == UBIFS_PADDING_BYTE)
48 pad_len += 1;
49
50 if (!pad_len || (pad_len & 7))
51 return SCANNED_GARBAGE;
52
53 dbg_scan("%d padding bytes", pad_len);
54
55 return pad_len;
56}
57
58/**
59 * ubifs_scan_a_node - scan for a node or padding.
60 * @c: UBIFS file-system description object
61 * @buf: buffer to scan
62 * @len: length of buffer
63 * @lnum: logical eraseblock number
64 * @offs: offset within the logical eraseblock
65 * @quiet: print no messages
66 *
67 * This function returns a scanning code to indicate what was scanned.
68 */
69int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
70 int offs, int quiet)
71{
72 struct ubifs_ch *ch = buf;
73 uint32_t magic;
74
75 magic = le32_to_cpu(ch->magic);
76
77 if (magic == 0xFFFFFFFF) {
78 dbg_scan("hit empty space");
79 return SCANNED_EMPTY_SPACE;
80 }
81
82 if (magic != UBIFS_NODE_MAGIC)
83 return scan_padding_bytes(buf, len);
84
85 if (len < UBIFS_CH_SZ)
86 return SCANNED_GARBAGE;
87
88 dbg_scan("scanning %s", dbg_ntype(ch->node_type));
89
90 if (ubifs_check_node(c, buf, lnum, offs, quiet))
91 return SCANNED_A_CORRUPT_NODE;
92
93 if (ch->node_type == UBIFS_PAD_NODE) {
94 struct ubifs_pad_node *pad = buf;
95 int pad_len = le32_to_cpu(pad->pad_len);
96 int node_len = le32_to_cpu(ch->len);
97
98 /* Validate the padding node */
99 if (pad_len < 0 ||
100 offs + node_len + pad_len > c->leb_size) {
101 if (!quiet) {
102 ubifs_err("bad pad node at LEB %d:%d",
103 lnum, offs);
104 dbg_dump_node(c, pad);
105 }
106 return SCANNED_A_BAD_PAD_NODE;
107 }
108
109 /* Make the node pads to 8-byte boundary */
110 if ((node_len + pad_len) & 7) {
111 if (!quiet) {
112 dbg_err("bad padding length %d - %d",
113 offs, offs + node_len + pad_len);
114 }
115 return SCANNED_A_BAD_PAD_NODE;
116 }
117
118 dbg_scan("%d bytes padded, offset now %d",
119 pad_len, ALIGN(offs + node_len + pad_len, 8));
120
121 return node_len + pad_len;
122 }
123
124 return SCANNED_A_NODE;
125}
126
127/**
128 * ubifs_start_scan - create LEB scanning information at start of scan.
129 * @c: UBIFS file-system description object
130 * @lnum: logical eraseblock number
131 * @offs: offset to start at (usually zero)
132 * @sbuf: scan buffer (must be c->leb_size)
133 *
134 * This function returns %0 on success and a negative error code on failure.
135 */
136struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
137 int offs, void *sbuf)
138{
139 struct ubifs_scan_leb *sleb;
140 int err;
141
142 dbg_scan("scan LEB %d:%d", lnum, offs);
143
144 sleb = kzalloc(sizeof(struct ubifs_scan_leb), GFP_NOFS);
145 if (!sleb)
146 return ERR_PTR(-ENOMEM);
147
148 sleb->lnum = lnum;
149 INIT_LIST_HEAD(&sleb->nodes);
150 sleb->buf = sbuf;
151
152 err = ubi_read(c->ubi, lnum, sbuf + offs, offs, c->leb_size - offs);
153 if (err && err != -EBADMSG) {
154 ubifs_err("cannot read %d bytes from LEB %d:%d,"
155 " error %d", c->leb_size - offs, lnum, offs, err);
156 kfree(sleb);
157 return ERR_PTR(err);
158 }
159
160 if (err == -EBADMSG)
161 sleb->ecc = 1;
162
163 return sleb;
164}
165
166/**
167 * ubifs_end_scan - update LEB scanning information at end of scan.
168 * @c: UBIFS file-system description object
169 * @sleb: scanning information
170 * @lnum: logical eraseblock number
171 * @offs: offset to start at (usually zero)
172 *
173 * This function returns %0 on success and a negative error code on failure.
174 */
175void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
176 int lnum, int offs)
177{
178 lnum = lnum;
179 dbg_scan("stop scanning LEB %d at offset %d", lnum, offs);
180 ubifs_assert(offs % c->min_io_size == 0);
181
182 sleb->endpt = ALIGN(offs, c->min_io_size);
183}
184
185/**
186 * ubifs_add_snod - add a scanned node to LEB scanning information.
187 * @c: UBIFS file-system description object
188 * @sleb: scanning information
189 * @buf: buffer containing node
190 * @offs: offset of node on flash
191 *
192 * This function returns %0 on success and a negative error code on failure.
193 */
194int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
195 void *buf, int offs)
196{
197 struct ubifs_ch *ch = buf;
198 struct ubifs_ino_node *ino = buf;
199 struct ubifs_scan_node *snod;
200
201 snod = kzalloc(sizeof(struct ubifs_scan_node), GFP_NOFS);
202 if (!snod)
203 return -ENOMEM;
204
205 snod->sqnum = le64_to_cpu(ch->sqnum);
206 snod->type = ch->node_type;
207 snod->offs = offs;
208 snod->len = le32_to_cpu(ch->len);
209 snod->node = buf;
210
211 switch (ch->node_type) {
212 case UBIFS_INO_NODE:
213 case UBIFS_DENT_NODE:
214 case UBIFS_XENT_NODE:
215 case UBIFS_DATA_NODE:
216 case UBIFS_TRUN_NODE:
217 /*
218 * The key is in the same place in all keyed
219 * nodes.
220 */
221 key_read(c, &ino->key, &snod->key);
222 break;
223 }
224 list_add_tail(&snod->list, &sleb->nodes);
225 sleb->nodes_cnt += 1;
226 return 0;
227}
228
229/**
230 * ubifs_scanned_corruption - print information after UBIFS scanned corruption.
231 * @c: UBIFS file-system description object
232 * @lnum: LEB number of corruption
233 * @offs: offset of corruption
234 * @buf: buffer containing corruption
235 */
236void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
237 void *buf)
238{
239 int len;
240
241 ubifs_err("corrupted data at LEB %d:%d", lnum, offs);
242 if (dbg_failure_mode)
243 return;
244 len = c->leb_size - offs;
245 if (len > 4096)
246 len = 4096;
247 dbg_err("first %d bytes from LEB %d:%d", len, lnum, offs);
248 print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 4, buf, len, 1);
249}
250
251/**
252 * ubifs_scan - scan a logical eraseblock.
253 * @c: UBIFS file-system description object
254 * @lnum: logical eraseblock number
255 * @offs: offset to start at (usually zero)
256 * @sbuf: scan buffer (must be c->leb_size)
257 *
258 * This function scans LEB number @lnum and returns complete information about
259 * its contents. Returns an error code in case of failure.
260 */
261struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
262 int offs, void *sbuf)
263{
264 void *buf = sbuf + offs;
265 int err, len = c->leb_size - offs;
266 struct ubifs_scan_leb *sleb;
267
268 sleb = ubifs_start_scan(c, lnum, offs, sbuf);
269 if (IS_ERR(sleb))
270 return sleb;
271
272 while (len >= 8) {
273 struct ubifs_ch *ch = buf;
274 int node_len, ret;
275
276 dbg_scan("look at LEB %d:%d (%d bytes left)",
277 lnum, offs, len);
278
279 cond_resched();
280
281 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
282
283 if (ret > 0) {
284 /* Padding bytes or a valid padding node */
285 offs += ret;
286 buf += ret;
287 len -= ret;
288 continue;
289 }
290
291 if (ret == SCANNED_EMPTY_SPACE)
292 /* Empty space is checked later */
293 break;
294
295 switch (ret) {
296 case SCANNED_GARBAGE:
297 dbg_err("garbage");
298 goto corrupted;
299 case SCANNED_A_NODE:
300 break;
301 case SCANNED_A_CORRUPT_NODE:
302 case SCANNED_A_BAD_PAD_NODE:
303 dbg_err("bad node");
304 goto corrupted;
305 default:
306 dbg_err("unknown");
307 goto corrupted;
308 }
309
310 err = ubifs_add_snod(c, sleb, buf, offs);
311 if (err)
312 goto error;
313
314 node_len = ALIGN(le32_to_cpu(ch->len), 8);
315 offs += node_len;
316 buf += node_len;
317 len -= node_len;
318 }
319
320 if (offs % c->min_io_size)
321 goto corrupted;
322
323 ubifs_end_scan(c, sleb, lnum, offs);
324
325 for (; len > 4; offs += 4, buf = buf + 4, len -= 4)
326 if (*(uint32_t *)buf != 0xffffffff)
327 break;
328 for (; len; offs++, buf++, len--)
329 if (*(uint8_t *)buf != 0xff) {
330 ubifs_err("corrupt empty space at LEB %d:%d",
331 lnum, offs);
332 goto corrupted;
333 }
334
335 return sleb;
336
337corrupted:
338 ubifs_scanned_corruption(c, lnum, offs, buf);
339 err = -EUCLEAN;
340error:
341 ubifs_err("LEB %d scanning failed", lnum);
342 ubifs_scan_destroy(sleb);
343 return ERR_PTR(err);
344}
345
346/**
347 * ubifs_scan_destroy - destroy LEB scanning information.
348 * @sleb: scanning information to free
349 */
350void ubifs_scan_destroy(struct ubifs_scan_leb *sleb)
351{
352 struct ubifs_scan_node *node;
353 struct list_head *head;
354
355 head = &sleb->nodes;
356 while (!list_empty(head)) {
357 node = list_entry(head->next, struct ubifs_scan_node, list);
358 list_del(&node->list);
359 kfree(node);
360 }
361 kfree(sleb);
362}
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
new file mode 100644
index 000000000000..f248533841a2
--- /dev/null
+++ b/fs/ubifs/shrinker.c
@@ -0,0 +1,322 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/*
24 * This file implements UBIFS shrinker which evicts clean znodes from the TNC
25 * tree when Linux VM needs more RAM.
26 *
27 * We do not implement any LRU lists to find oldest znodes to free because it
28 * would add additional overhead to the file system fast paths. So the shrinker
29 * just walks the TNC tree when searching for znodes to free.
30 *
31 * If the root of a TNC sub-tree is clean and old enough, then the children are
32 * also clean and old enough. So the shrinker walks the TNC in level order and
33 * dumps entire sub-trees.
34 *
35 * The age of znodes is just the time-stamp when they were last looked at.
36 * The current shrinker first tries to evict old znodes, then young ones.
37 *
38 * Since the shrinker is global, it has to protect against races with FS
39 * un-mounts, which is done by the 'ubifs_infos_lock' and 'c->umount_mutex'.
40 */
41
42#include "ubifs.h"
43
44/* List of all UBIFS file-system instances */
45LIST_HEAD(ubifs_infos);
46
47/*
48 * We number each shrinker run and record the number on the ubifs_info structure
49 * so that we can easily work out which ubifs_info structures have already been
50 * done by the current run.
51 */
52static unsigned int shrinker_run_no;
53
54/* Protects 'ubifs_infos' list */
55DEFINE_SPINLOCK(ubifs_infos_lock);
56
57/* Global clean znode counter (for all mounted UBIFS instances) */
58atomic_long_t ubifs_clean_zn_cnt;
59
60/**
61 * shrink_tnc - shrink TNC tree.
62 * @c: UBIFS file-system description object
63 * @nr: number of znodes to free
64 * @age: the age of znodes to free
65 * @contention: if any contention, this is set to %1
66 *
67 * This function traverses TNC tree and frees clean znodes. It does not free
68 * clean znodes which younger then @age. Returns number of freed znodes.
69 */
70static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention)
71{
72 int total_freed = 0;
73 struct ubifs_znode *znode, *zprev;
74 int time = get_seconds();
75
76 ubifs_assert(mutex_is_locked(&c->umount_mutex));
77 ubifs_assert(mutex_is_locked(&c->tnc_mutex));
78
79 if (!c->zroot.znode || atomic_long_read(&c->clean_zn_cnt) == 0)
80 return 0;
81
82 /*
83 * Traverse the TNC tree in levelorder manner, so that it is possible
84 * to destroy large sub-trees. Indeed, if a znode is old, then all its
85 * children are older or of the same age.
86 *
87 * Note, we are holding 'c->tnc_mutex', so we do not have to lock the
88 * 'c->space_lock' when _reading_ 'c->clean_zn_cnt', because it is
89 * changed only when the 'c->tnc_mutex' is held.
90 */
91 zprev = NULL;
92 znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
93 while (znode && total_freed < nr &&
94 atomic_long_read(&c->clean_zn_cnt) > 0) {
95 int freed;
96
97 /*
98 * If the znode is clean, but it is in the 'c->cnext' list, this
99 * means that this znode has just been written to flash as a
100 * part of commit and was marked clean. They will be removed
101 * from the list at end commit. We cannot change the list,
102 * because it is not protected by any mutex (design decision to
103 * make commit really independent and parallel to main I/O). So
104 * we just skip these znodes.
105 *
106 * Note, the 'clean_zn_cnt' counters are not updated until
107 * after the commit, so the UBIFS shrinker does not report
108 * the znodes which are in the 'c->cnext' list as freeable.
109 *
110 * Also note, if the root of a sub-tree is not in 'c->cnext',
111 * then the whole sub-tree is not in 'c->cnext' as well, so it
112 * is safe to dump whole sub-tree.
113 */
114
115 if (znode->cnext) {
116 /*
117 * Very soon these znodes will be removed from the list
118 * and become freeable.
119 */
120 *contention = 1;
121 } else if (!ubifs_zn_dirty(znode) &&
122 abs(time - znode->time) >= age) {
123 if (znode->parent)
124 znode->parent->zbranch[znode->iip].znode = NULL;
125 else
126 c->zroot.znode = NULL;
127
128 freed = ubifs_destroy_tnc_subtree(znode);
129 atomic_long_sub(freed, &ubifs_clean_zn_cnt);
130 atomic_long_sub(freed, &c->clean_zn_cnt);
131 ubifs_assert(atomic_long_read(&c->clean_zn_cnt) >= 0);
132 total_freed += freed;
133 znode = zprev;
134 }
135
136 if (unlikely(!c->zroot.znode))
137 break;
138
139 zprev = znode;
140 znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);
141 cond_resched();
142 }
143
144 return total_freed;
145}
146
147/**
148 * shrink_tnc_trees - shrink UBIFS TNC trees.
149 * @nr: number of znodes to free
150 * @age: the age of znodes to free
151 * @contention: if any contention, this is set to %1
152 *
153 * This function walks the list of mounted UBIFS file-systems and frees clean
154 * znodes which are older then @age, until at least @nr znodes are freed.
155 * Returns the number of freed znodes.
156 */
157static int shrink_tnc_trees(int nr, int age, int *contention)
158{
159 struct ubifs_info *c;
160 struct list_head *p;
161 unsigned int run_no;
162 int freed = 0;
163
164 spin_lock(&ubifs_infos_lock);
165 do {
166 run_no = ++shrinker_run_no;
167 } while (run_no == 0);
168 /* Iterate over all mounted UBIFS file-systems and try to shrink them */
169 p = ubifs_infos.next;
170 while (p != &ubifs_infos) {
171 c = list_entry(p, struct ubifs_info, infos_list);
172 /*
173 * We move the ones we do to the end of the list, so we stop
174 * when we see one we have already done.
175 */
176 if (c->shrinker_run_no == run_no)
177 break;
178 if (!mutex_trylock(&c->umount_mutex)) {
179 /* Some un-mount is in progress, try next FS */
180 *contention = 1;
181 p = p->next;
182 continue;
183 }
184 /*
185 * We're holding 'c->umount_mutex', so the file-system won't go
186 * away.
187 */
188 if (!mutex_trylock(&c->tnc_mutex)) {
189 mutex_unlock(&c->umount_mutex);
190 *contention = 1;
191 p = p->next;
192 continue;
193 }
194 spin_unlock(&ubifs_infos_lock);
195 /*
196 * OK, now we have TNC locked, the file-system cannot go away -
197 * it is safe to reap the cache.
198 */
199 c->shrinker_run_no = run_no;
200 freed += shrink_tnc(c, nr, age, contention);
201 mutex_unlock(&c->tnc_mutex);
202 spin_lock(&ubifs_infos_lock);
203 /* Get the next list element before we move this one */
204 p = p->next;
205 /*
206 * Move this one to the end of the list to provide some
207 * fairness.
208 */
209 list_del(&c->infos_list);
210 list_add_tail(&c->infos_list, &ubifs_infos);
211 mutex_unlock(&c->umount_mutex);
212 if (freed >= nr)
213 break;
214 }
215 spin_unlock(&ubifs_infos_lock);
216 return freed;
217}
218
219/**
220 * kick_a_thread - kick a background thread to start commit.
221 *
222 * This function kicks a background thread to start background commit. Returns
223 * %-1 if a thread was kicked or there is another reason to assume the memory
224 * will soon be freed or become freeable. If there are no dirty znodes, returns
225 * %0.
226 */
227static int kick_a_thread(void)
228{
229 int i;
230 struct ubifs_info *c;
231
232 /*
233 * Iterate over all mounted UBIFS file-systems and find out if there is
234 * already an ongoing commit operation there. If no, then iterate for
235 * the second time and initiate background commit.
236 */
237 spin_lock(&ubifs_infos_lock);
238 for (i = 0; i < 2; i++) {
239 list_for_each_entry(c, &ubifs_infos, infos_list) {
240 long dirty_zn_cnt;
241
242 if (!mutex_trylock(&c->umount_mutex)) {
243 /*
244 * Some un-mount is in progress, it will
245 * certainly free memory, so just return.
246 */
247 spin_unlock(&ubifs_infos_lock);
248 return -1;
249 }
250
251 dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt);
252
253 if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN ||
254 c->ro_media) {
255 mutex_unlock(&c->umount_mutex);
256 continue;
257 }
258
259 if (c->cmt_state != COMMIT_RESTING) {
260 spin_unlock(&ubifs_infos_lock);
261 mutex_unlock(&c->umount_mutex);
262 return -1;
263 }
264
265 if (i == 1) {
266 list_del(&c->infos_list);
267 list_add_tail(&c->infos_list, &ubifs_infos);
268 spin_unlock(&ubifs_infos_lock);
269
270 ubifs_request_bg_commit(c);
271 mutex_unlock(&c->umount_mutex);
272 return -1;
273 }
274 mutex_unlock(&c->umount_mutex);
275 }
276 }
277 spin_unlock(&ubifs_infos_lock);
278
279 return 0;
280}
281
282int ubifs_shrinker(int nr, gfp_t gfp_mask)
283{
284 int freed, contention = 0;
285 long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
286
287 if (nr == 0)
288 return clean_zn_cnt;
289
290 if (!clean_zn_cnt) {
291 /*
292 * No clean znodes, nothing to reap. All we can do in this case
293 * is to kick background threads to start commit, which will
294 * probably make clean znodes which, in turn, will be freeable.
295 * And we return -1 which means will make VM call us again
296 * later.
297 */
298 dbg_tnc("no clean znodes, kick a thread");
299 return kick_a_thread();
300 }
301
302 freed = shrink_tnc_trees(nr, OLD_ZNODE_AGE, &contention);
303 if (freed >= nr)
304 goto out;
305
306 dbg_tnc("not enough old znodes, try to free young ones");
307 freed += shrink_tnc_trees(nr - freed, YOUNG_ZNODE_AGE, &contention);
308 if (freed >= nr)
309 goto out;
310
311 dbg_tnc("not enough young znodes, free all");
312 freed += shrink_tnc_trees(nr - freed, 0, &contention);
313
314 if (!freed && contention) {
315 dbg_tnc("freed nothing, but contention");
316 return -1;
317 }
318
319out:
320 dbg_tnc("%d znodes were freed, requested %d", freed, nr);
321 return freed;
322}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
new file mode 100644
index 000000000000..00eb9c68ad03
--- /dev/null
+++ b/fs/ubifs/super.c
@@ -0,0 +1,1951 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/*
24 * This file implements UBIFS initialization and VFS superblock operations. Some
25 * initialization stuff which is rather large and complex is placed at
26 * corresponding subsystems, but most of it is here.
27 */
28
29#include <linux/init.h>
30#include <linux/slab.h>
31#include <linux/module.h>
32#include <linux/ctype.h>
33#include <linux/random.h>
34#include <linux/kthread.h>
35#include <linux/parser.h>
36#include <linux/seq_file.h>
37#include <linux/mount.h>
38#include "ubifs.h"
39
40/* Slab cache for UBIFS inodes */
41struct kmem_cache *ubifs_inode_slab;
42
43/* UBIFS TNC shrinker description */
44static struct shrinker ubifs_shrinker_info = {
45 .shrink = ubifs_shrinker,
46 .seeks = DEFAULT_SEEKS,
47};
48
49/**
50 * validate_inode - validate inode.
51 * @c: UBIFS file-system description object
52 * @inode: the inode to validate
53 *
54 * This is a helper function for 'ubifs_iget()' which validates various fields
55 * of a newly built inode to make sure they contain sane values and prevent
56 * possible vulnerabilities. Returns zero if the inode is all right and
57 * a non-zero error code if not.
58 */
59static int validate_inode(struct ubifs_info *c, const struct inode *inode)
60{
61 int err;
62 const struct ubifs_inode *ui = ubifs_inode(inode);
63
64 if (inode->i_size > c->max_inode_sz) {
65 ubifs_err("inode is too large (%lld)",
66 (long long)inode->i_size);
67 return 1;
68 }
69
70 if (ui->compr_type < 0 || ui->compr_type >= UBIFS_COMPR_TYPES_CNT) {
71 ubifs_err("unknown compression type %d", ui->compr_type);
72 return 2;
73 }
74
75 if (ui->xattr_names + ui->xattr_cnt > XATTR_LIST_MAX)
76 return 3;
77
78 if (ui->data_len < 0 || ui->data_len > UBIFS_MAX_INO_DATA)
79 return 4;
80
81 if (ui->xattr && (inode->i_mode & S_IFMT) != S_IFREG)
82 return 5;
83
84 if (!ubifs_compr_present(ui->compr_type)) {
85 ubifs_warn("inode %lu uses '%s' compression, but it was not "
86 "compiled in", inode->i_ino,
87 ubifs_compr_name(ui->compr_type));
88 }
89
90 err = dbg_check_dir_size(c, inode);
91 return err;
92}
93
94struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
95{
96 int err;
97 union ubifs_key key;
98 struct ubifs_ino_node *ino;
99 struct ubifs_info *c = sb->s_fs_info;
100 struct inode *inode;
101 struct ubifs_inode *ui;
102
103 dbg_gen("inode %lu", inum);
104
105 inode = iget_locked(sb, inum);
106 if (!inode)
107 return ERR_PTR(-ENOMEM);
108 if (!(inode->i_state & I_NEW))
109 return inode;
110 ui = ubifs_inode(inode);
111
112 ino = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS);
113 if (!ino) {
114 err = -ENOMEM;
115 goto out;
116 }
117
118 ino_key_init(c, &key, inode->i_ino);
119
120 err = ubifs_tnc_lookup(c, &key, ino);
121 if (err)
122 goto out_ino;
123
124 inode->i_flags |= (S_NOCMTIME | S_NOATIME);
125 inode->i_nlink = le32_to_cpu(ino->nlink);
126 inode->i_uid = le32_to_cpu(ino->uid);
127 inode->i_gid = le32_to_cpu(ino->gid);
128 inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec);
129 inode->i_atime.tv_nsec = le32_to_cpu(ino->atime_nsec);
130 inode->i_mtime.tv_sec = (int64_t)le64_to_cpu(ino->mtime_sec);
131 inode->i_mtime.tv_nsec = le32_to_cpu(ino->mtime_nsec);
132 inode->i_ctime.tv_sec = (int64_t)le64_to_cpu(ino->ctime_sec);
133 inode->i_ctime.tv_nsec = le32_to_cpu(ino->ctime_nsec);
134 inode->i_mode = le32_to_cpu(ino->mode);
135 inode->i_size = le64_to_cpu(ino->size);
136
137 ui->data_len = le32_to_cpu(ino->data_len);
138 ui->flags = le32_to_cpu(ino->flags);
139 ui->compr_type = le16_to_cpu(ino->compr_type);
140 ui->creat_sqnum = le64_to_cpu(ino->creat_sqnum);
141 ui->xattr_cnt = le32_to_cpu(ino->xattr_cnt);
142 ui->xattr_size = le32_to_cpu(ino->xattr_size);
143 ui->xattr_names = le32_to_cpu(ino->xattr_names);
144 ui->synced_i_size = ui->ui_size = inode->i_size;
145
146 ui->xattr = (ui->flags & UBIFS_XATTR_FL) ? 1 : 0;
147
148 err = validate_inode(c, inode);
149 if (err)
150 goto out_invalid;
151
152 /* Disable readahead */
153 inode->i_mapping->backing_dev_info = &c->bdi;
154
155 switch (inode->i_mode & S_IFMT) {
156 case S_IFREG:
157 inode->i_mapping->a_ops = &ubifs_file_address_operations;
158 inode->i_op = &ubifs_file_inode_operations;
159 inode->i_fop = &ubifs_file_operations;
160 if (ui->xattr) {
161 ui->data = kmalloc(ui->data_len + 1, GFP_NOFS);
162 if (!ui->data) {
163 err = -ENOMEM;
164 goto out_ino;
165 }
166 memcpy(ui->data, ino->data, ui->data_len);
167 ((char *)ui->data)[ui->data_len] = '\0';
168 } else if (ui->data_len != 0) {
169 err = 10;
170 goto out_invalid;
171 }
172 break;
173 case S_IFDIR:
174 inode->i_op = &ubifs_dir_inode_operations;
175 inode->i_fop = &ubifs_dir_operations;
176 if (ui->data_len != 0) {
177 err = 11;
178 goto out_invalid;
179 }
180 break;
181 case S_IFLNK:
182 inode->i_op = &ubifs_symlink_inode_operations;
183 if (ui->data_len <= 0 || ui->data_len > UBIFS_MAX_INO_DATA) {
184 err = 12;
185 goto out_invalid;
186 }
187 ui->data = kmalloc(ui->data_len + 1, GFP_NOFS);
188 if (!ui->data) {
189 err = -ENOMEM;
190 goto out_ino;
191 }
192 memcpy(ui->data, ino->data, ui->data_len);
193 ((char *)ui->data)[ui->data_len] = '\0';
194 break;
195 case S_IFBLK:
196 case S_IFCHR:
197 {
198 dev_t rdev;
199 union ubifs_dev_desc *dev;
200
201 ui->data = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS);
202 if (!ui->data) {
203 err = -ENOMEM;
204 goto out_ino;
205 }
206
207 dev = (union ubifs_dev_desc *)ino->data;
208 if (ui->data_len == sizeof(dev->new))
209 rdev = new_decode_dev(le32_to_cpu(dev->new));
210 else if (ui->data_len == sizeof(dev->huge))
211 rdev = huge_decode_dev(le64_to_cpu(dev->huge));
212 else {
213 err = 13;
214 goto out_invalid;
215 }
216 memcpy(ui->data, ino->data, ui->data_len);
217 inode->i_op = &ubifs_file_inode_operations;
218 init_special_inode(inode, inode->i_mode, rdev);
219 break;
220 }
221 case S_IFSOCK:
222 case S_IFIFO:
223 inode->i_op = &ubifs_file_inode_operations;
224 init_special_inode(inode, inode->i_mode, 0);
225 if (ui->data_len != 0) {
226 err = 14;
227 goto out_invalid;
228 }
229 break;
230 default:
231 err = 15;
232 goto out_invalid;
233 }
234
235 kfree(ino);
236 ubifs_set_inode_flags(inode);
237 unlock_new_inode(inode);
238 return inode;
239
240out_invalid:
241 ubifs_err("inode %lu validation failed, error %d", inode->i_ino, err);
242 dbg_dump_node(c, ino);
243 dbg_dump_inode(c, inode);
244 err = -EINVAL;
245out_ino:
246 kfree(ino);
247out:
248 ubifs_err("failed to read inode %lu, error %d", inode->i_ino, err);
249 iget_failed(inode);
250 return ERR_PTR(err);
251}
252
253static struct inode *ubifs_alloc_inode(struct super_block *sb)
254{
255 struct ubifs_inode *ui;
256
257 ui = kmem_cache_alloc(ubifs_inode_slab, GFP_NOFS);
258 if (!ui)
259 return NULL;
260
261 memset((void *)ui + sizeof(struct inode), 0,
262 sizeof(struct ubifs_inode) - sizeof(struct inode));
263 mutex_init(&ui->ui_mutex);
264 spin_lock_init(&ui->ui_lock);
265 return &ui->vfs_inode;
266};
267
268static void ubifs_destroy_inode(struct inode *inode)
269{
270 struct ubifs_inode *ui = ubifs_inode(inode);
271
272 kfree(ui->data);
273 kmem_cache_free(ubifs_inode_slab, inode);
274}
275
276/*
277 * Note, Linux write-back code calls this without 'i_mutex'.
278 */
279static int ubifs_write_inode(struct inode *inode, int wait)
280{
281 int err;
282 struct ubifs_info *c = inode->i_sb->s_fs_info;
283 struct ubifs_inode *ui = ubifs_inode(inode);
284
285 ubifs_assert(!ui->xattr);
286 if (is_bad_inode(inode))
287 return 0;
288
289 mutex_lock(&ui->ui_mutex);
290 /*
291 * Due to races between write-back forced by budgeting
292 * (see 'sync_some_inodes()') and pdflush write-back, the inode may
293 * have already been synchronized, do not do this again. This might
294 * also happen if it was synchronized in an VFS operation, e.g.
295 * 'ubifs_link()'.
296 */
297 if (!ui->dirty) {
298 mutex_unlock(&ui->ui_mutex);
299 return 0;
300 }
301
302 dbg_gen("inode %lu", inode->i_ino);
303 err = ubifs_jnl_write_inode(c, inode, 0);
304 if (err)
305 ubifs_err("can't write inode %lu, error %d", inode->i_ino, err);
306
307 ui->dirty = 0;
308 mutex_unlock(&ui->ui_mutex);
309 ubifs_release_dirty_inode_budget(c, ui);
310 return err;
311}
312
313static void ubifs_delete_inode(struct inode *inode)
314{
315 int err;
316 struct ubifs_info *c = inode->i_sb->s_fs_info;
317
318 if (ubifs_inode(inode)->xattr)
319 /*
320 * Extended attribute inode deletions are fully handled in
321 * 'ubifs_removexattr()'. These inodes are special and have
322 * limited usage, so there is nothing to do here.
323 */
324 goto out;
325
326 dbg_gen("inode %lu", inode->i_ino);
327 ubifs_assert(!atomic_read(&inode->i_count));
328 ubifs_assert(inode->i_nlink == 0);
329
330 truncate_inode_pages(&inode->i_data, 0);
331 if (is_bad_inode(inode))
332 goto out;
333
334 ubifs_inode(inode)->ui_size = inode->i_size = 0;
335 err = ubifs_jnl_write_inode(c, inode, 1);
336 if (err)
337 /*
338 * Worst case we have a lost orphan inode wasting space, so a
339 * simple error message is ok here.
340 */
341 ubifs_err("can't write inode %lu, error %d", inode->i_ino, err);
342out:
343 clear_inode(inode);
344}
345
346static void ubifs_dirty_inode(struct inode *inode)
347{
348 struct ubifs_inode *ui = ubifs_inode(inode);
349
350 ubifs_assert(mutex_is_locked(&ui->ui_mutex));
351 if (!ui->dirty) {
352 ui->dirty = 1;
353 dbg_gen("inode %lu", inode->i_ino);
354 }
355}
356
357static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
358{
359 struct ubifs_info *c = dentry->d_sb->s_fs_info;
360 unsigned long long free;
361
362 free = ubifs_budg_get_free_space(c);
363 dbg_gen("free space %lld bytes (%lld blocks)",
364 free, free >> UBIFS_BLOCK_SHIFT);
365
366 buf->f_type = UBIFS_SUPER_MAGIC;
367 buf->f_bsize = UBIFS_BLOCK_SIZE;
368 buf->f_blocks = c->block_cnt;
369 buf->f_bfree = free >> UBIFS_BLOCK_SHIFT;
370 if (free > c->report_rp_size)
371 buf->f_bavail = (free - c->report_rp_size) >> UBIFS_BLOCK_SHIFT;
372 else
373 buf->f_bavail = 0;
374 buf->f_files = 0;
375 buf->f_ffree = 0;
376 buf->f_namelen = UBIFS_MAX_NLEN;
377
378 return 0;
379}
380
381static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
382{
383 struct ubifs_info *c = mnt->mnt_sb->s_fs_info;
384
385 if (c->mount_opts.unmount_mode == 2)
386 seq_printf(s, ",fast_unmount");
387 else if (c->mount_opts.unmount_mode == 1)
388 seq_printf(s, ",norm_unmount");
389
390 return 0;
391}
392
393static int ubifs_sync_fs(struct super_block *sb, int wait)
394{
395 struct ubifs_info *c = sb->s_fs_info;
396 int i, ret = 0, err;
397
398 if (c->jheads)
399 for (i = 0; i < c->jhead_cnt; i++) {
400 err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
401 if (err && !ret)
402 ret = err;
403 }
404 /*
405 * We ought to call sync for c->ubi but it does not have one. If it had
406 * it would in turn call mtd->sync, however mtd operations are
407 * synchronous anyway, so we don't lose any sleep here.
408 */
409 return ret;
410}
411
412/**
413 * init_constants_early - initialize UBIFS constants.
414 * @c: UBIFS file-system description object
415 *
416 * This function initialize UBIFS constants which do not need the superblock to
417 * be read. It also checks that the UBI volume satisfies basic UBIFS
418 * requirements. Returns zero in case of success and a negative error code in
419 * case of failure.
420 */
421static int init_constants_early(struct ubifs_info *c)
422{
423 if (c->vi.corrupted) {
424 ubifs_warn("UBI volume is corrupted - read-only mode");
425 c->ro_media = 1;
426 }
427
428 if (c->di.ro_mode) {
429 ubifs_msg("read-only UBI device");
430 c->ro_media = 1;
431 }
432
433 if (c->vi.vol_type == UBI_STATIC_VOLUME) {
434 ubifs_msg("static UBI volume - read-only mode");
435 c->ro_media = 1;
436 }
437
438 c->leb_cnt = c->vi.size;
439 c->leb_size = c->vi.usable_leb_size;
440 c->half_leb_size = c->leb_size / 2;
441 c->min_io_size = c->di.min_io_size;
442 c->min_io_shift = fls(c->min_io_size) - 1;
443
444 if (c->leb_size < UBIFS_MIN_LEB_SZ) {
445 ubifs_err("too small LEBs (%d bytes), min. is %d bytes",
446 c->leb_size, UBIFS_MIN_LEB_SZ);
447 return -EINVAL;
448 }
449
450 if (c->leb_cnt < UBIFS_MIN_LEB_CNT) {
451 ubifs_err("too few LEBs (%d), min. is %d",
452 c->leb_cnt, UBIFS_MIN_LEB_CNT);
453 return -EINVAL;
454 }
455
456 if (!is_power_of_2(c->min_io_size)) {
457 ubifs_err("bad min. I/O size %d", c->min_io_size);
458 return -EINVAL;
459 }
460
461 /*
462 * UBIFS aligns all node to 8-byte boundary, so to make function in
463 * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is
464 * less than 8.
465 */
466 if (c->min_io_size < 8) {
467 c->min_io_size = 8;
468 c->min_io_shift = 3;
469 }
470
471 c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size);
472 c->mst_node_alsz = ALIGN(UBIFS_MST_NODE_SZ, c->min_io_size);
473
474 /*
475 * Initialize node length ranges which are mostly needed for node
476 * length validation.
477 */
478 c->ranges[UBIFS_PAD_NODE].len = UBIFS_PAD_NODE_SZ;
479 c->ranges[UBIFS_SB_NODE].len = UBIFS_SB_NODE_SZ;
480 c->ranges[UBIFS_MST_NODE].len = UBIFS_MST_NODE_SZ;
481 c->ranges[UBIFS_REF_NODE].len = UBIFS_REF_NODE_SZ;
482 c->ranges[UBIFS_TRUN_NODE].len = UBIFS_TRUN_NODE_SZ;
483 c->ranges[UBIFS_CS_NODE].len = UBIFS_CS_NODE_SZ;
484
485 c->ranges[UBIFS_INO_NODE].min_len = UBIFS_INO_NODE_SZ;
486 c->ranges[UBIFS_INO_NODE].max_len = UBIFS_MAX_INO_NODE_SZ;
487 c->ranges[UBIFS_ORPH_NODE].min_len =
488 UBIFS_ORPH_NODE_SZ + sizeof(__le64);
489 c->ranges[UBIFS_ORPH_NODE].max_len = c->leb_size;
490 c->ranges[UBIFS_DENT_NODE].min_len = UBIFS_DENT_NODE_SZ;
491 c->ranges[UBIFS_DENT_NODE].max_len = UBIFS_MAX_DENT_NODE_SZ;
492 c->ranges[UBIFS_XENT_NODE].min_len = UBIFS_XENT_NODE_SZ;
493 c->ranges[UBIFS_XENT_NODE].max_len = UBIFS_MAX_XENT_NODE_SZ;
494 c->ranges[UBIFS_DATA_NODE].min_len = UBIFS_DATA_NODE_SZ;
495 c->ranges[UBIFS_DATA_NODE].max_len = UBIFS_MAX_DATA_NODE_SZ;
496 /*
497 * Minimum indexing node size is amended later when superblock is
498 * read and the key length is known.
499 */
500 c->ranges[UBIFS_IDX_NODE].min_len = UBIFS_IDX_NODE_SZ + UBIFS_BRANCH_SZ;
501 /*
502 * Maximum indexing node size is amended later when superblock is
503 * read and the fanout is known.
504 */
505 c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX;
506
507 /*
508 * Initialize dead and dark LEB space watermarks.
509 *
510 * Dead space is the space which cannot be used. Its watermark is
511 * equivalent to min. I/O unit or minimum node size if it is greater
512 * then min. I/O unit.
513 *
514 * Dark space is the space which might be used, or might not, depending
515 * on which node should be written to the LEB. Its watermark is
516 * equivalent to maximum UBIFS node size.
517 */
518 c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size);
519 c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size);
520
521 return 0;
522}
523
/**
 * bud_wbuf_callback - bud LEB write-buffer synchronization call-back.
 * @c: UBIFS file-system description object
 * @lnum: LEB the write-buffer was synchronized to
 * @free: how many free bytes left in this LEB
 * @pad: how many bytes were padded
 *
 * The I/O unit invokes this call-back whenever a write-buffer has been
 * synchronized, which lets the space accounting of bud logical eraseblocks
 * stay correct. Returns zero in case of success and a negative error code in
 * case of failure.
 *
 * Logically this function is part of the journal; it lives here only so that
 * it can remain static.
 */
static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad)
{
	int err;

	/* Forward the new free and padding figures to the LEB properties */
	err = ubifs_update_one_lp(c, lnum, free, pad, 0, 0);
	return err;
}
543
544/*
545 * init_constants_late - initialize UBIFS constants.
546 * @c: UBIFS file-system description object
547 *
548 * This is a helper function which initializes various UBIFS constants after
549 * the superblock has been read. It also checks various UBIFS parameters and
550 * makes sure they are all right. Returns zero in case of success and a
551 * negative error code in case of failure.
552 */
553static int init_constants_late(struct ubifs_info *c)
554{
555 int tmp, err;
556 uint64_t tmp64;
557
558 c->main_bytes = (long long)c->main_lebs * c->leb_size;
559 c->max_znode_sz = sizeof(struct ubifs_znode) +
560 c->fanout * sizeof(struct ubifs_zbranch);
561
562 tmp = ubifs_idx_node_sz(c, 1);
563 c->ranges[UBIFS_IDX_NODE].min_len = tmp;
564 c->min_idx_node_sz = ALIGN(tmp, 8);
565
566 tmp = ubifs_idx_node_sz(c, c->fanout);
567 c->ranges[UBIFS_IDX_NODE].max_len = tmp;
568 c->max_idx_node_sz = ALIGN(tmp, 8);
569
570 /* Make sure LEB size is large enough to fit full commit */
571 tmp = UBIFS_CS_NODE_SZ + UBIFS_REF_NODE_SZ * c->jhead_cnt;
572 tmp = ALIGN(tmp, c->min_io_size);
573 if (tmp > c->leb_size) {
574 dbg_err("too small LEB size %d, at least %d needed",
575 c->leb_size, tmp);
576 return -EINVAL;
577 }
578
579 /*
580 * Make sure that the log is large enough to fit reference nodes for
581 * all buds plus one reserved LEB.
582 */
583 tmp64 = c->max_bud_bytes;
584 tmp = do_div(tmp64, c->leb_size);
585 c->max_bud_cnt = tmp64 + !!tmp;
586 tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1);
587 tmp /= c->leb_size;
588 tmp += 1;
589 if (c->log_lebs < tmp) {
590 dbg_err("too small log %d LEBs, required min. %d LEBs",
591 c->log_lebs, tmp);
592 return -EINVAL;
593 }
594
595 /*
596 * When budgeting we assume worst-case scenarios when the pages are not
597 * be compressed and direntries are of the maximum size.
598 *
599 * Note, data, which may be stored in inodes is budgeted separately, so
600 * it is not included into 'c->inode_budget'.
601 */
602 c->page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE;
603 c->inode_budget = UBIFS_INO_NODE_SZ;
604 c->dent_budget = UBIFS_MAX_DENT_NODE_SZ;
605
606 /*
607 * When the amount of flash space used by buds becomes
608 * 'c->max_bud_bytes', UBIFS just blocks all writers and starts commit.
609 * The writers are unblocked when the commit is finished. To avoid
610 * writers to be blocked UBIFS initiates background commit in advance,
611 * when number of bud bytes becomes above the limit defined below.
612 */
613 c->bg_bud_bytes = (c->max_bud_bytes * 13) >> 4;
614
615 /*
616 * Ensure minimum journal size. All the bytes in the journal heads are
617 * considered to be used, when calculating the current journal usage.
618 * Consequently, if the journal is too small, UBIFS will treat it as
619 * always full.
620 */
621 tmp64 = (uint64_t)(c->jhead_cnt + 1) * c->leb_size + 1;
622 if (c->bg_bud_bytes < tmp64)
623 c->bg_bud_bytes = tmp64;
624 if (c->max_bud_bytes < tmp64 + c->leb_size)
625 c->max_bud_bytes = tmp64 + c->leb_size;
626
627 err = ubifs_calc_lpt_geom(c);
628 if (err)
629 return err;
630
631 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
632
633 /*
634 * Calculate total amount of FS blocks. This number is not used
635 * internally because it does not make much sense for UBIFS, but it is
636 * necessary to report something for the 'statfs()' call.
637 *
638 * Subtract the LEB reserved for GC and the LEB which is reserved for
639 * deletions.
640 *
641 * Review 'ubifs_calc_available()' if changing this calculation.
642 */
643 tmp64 = c->main_lebs - 2;
644 tmp64 *= (uint64_t)c->leb_size - c->dark_wm;
645 tmp64 = ubifs_reported_space(c, tmp64);
646 c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT;
647
648 return 0;
649}
650
651/**
652 * take_gc_lnum - reserve GC LEB.
653 * @c: UBIFS file-system description object
654 *
655 * This function ensures that the LEB reserved for garbage collection is
656 * unmapped and is marked as "taken" in lprops. We also have to set free space
657 * to LEB size and dirty space to zero, because lprops may contain out-of-date
658 * information if the file-system was un-mounted before it has been committed.
659 * This function returns zero in case of success and a negative error code in
660 * case of failure.
661 */
662static int take_gc_lnum(struct ubifs_info *c)
663{
664 int err;
665
666 if (c->gc_lnum == -1) {
667 ubifs_err("no LEB for GC");
668 return -EINVAL;
669 }
670
671 err = ubifs_leb_unmap(c, c->gc_lnum);
672 if (err)
673 return err;
674
675 /* And we have to tell lprops that this LEB is taken */
676 err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0,
677 LPROPS_TAKEN, 0, 0);
678 return err;
679}
680
681/**
682 * alloc_wbufs - allocate write-buffers.
683 * @c: UBIFS file-system description object
684 *
685 * This helper function allocates and initializes UBIFS write-buffers. Returns
686 * zero in case of success and %-ENOMEM in case of failure.
687 */
688static int alloc_wbufs(struct ubifs_info *c)
689{
690 int i, err;
691
692 c->jheads = kzalloc(c->jhead_cnt * sizeof(struct ubifs_jhead),
693 GFP_KERNEL);
694 if (!c->jheads)
695 return -ENOMEM;
696
697 /* Initialize journal heads */
698 for (i = 0; i < c->jhead_cnt; i++) {
699 INIT_LIST_HEAD(&c->jheads[i].buds_list);
700 err = ubifs_wbuf_init(c, &c->jheads[i].wbuf);
701 if (err)
702 return err;
703
704 c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback;
705 c->jheads[i].wbuf.jhead = i;
706 }
707
708 c->jheads[BASEHD].wbuf.dtype = UBI_SHORTTERM;
709 /*
710 * Garbage Collector head likely contains long-term data and
711 * does not need to be synchronized by timer.
712 */
713 c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM;
714 c->jheads[GCHD].wbuf.timeout = 0;
715
716 return 0;
717}
718
719/**
720 * free_wbufs - free write-buffers.
721 * @c: UBIFS file-system description object
722 */
723static void free_wbufs(struct ubifs_info *c)
724{
725 int i;
726
727 if (c->jheads) {
728 for (i = 0; i < c->jhead_cnt; i++) {
729 kfree(c->jheads[i].wbuf.buf);
730 kfree(c->jheads[i].wbuf.inodes);
731 }
732 kfree(c->jheads);
733 c->jheads = NULL;
734 }
735}
736
737/**
738 * free_orphans - free orphans.
739 * @c: UBIFS file-system description object
740 */
741static void free_orphans(struct ubifs_info *c)
742{
743 struct ubifs_orphan *orph;
744
745 while (c->orph_dnext) {
746 orph = c->orph_dnext;
747 c->orph_dnext = orph->dnext;
748 list_del(&orph->list);
749 kfree(orph);
750 }
751
752 while (!list_empty(&c->orph_list)) {
753 orph = list_entry(c->orph_list.next, struct ubifs_orphan, list);
754 list_del(&orph->list);
755 kfree(orph);
756 dbg_err("orphan list not empty at unmount");
757 }
758
759 vfree(c->orph_buf);
760 c->orph_buf = NULL;
761}
762
763/**
764 * free_buds - free per-bud objects.
765 * @c: UBIFS file-system description object
766 */
767static void free_buds(struct ubifs_info *c)
768{
769 struct rb_node *this = c->buds.rb_node;
770 struct ubifs_bud *bud;
771
772 while (this) {
773 if (this->rb_left)
774 this = this->rb_left;
775 else if (this->rb_right)
776 this = this->rb_right;
777 else {
778 bud = rb_entry(this, struct ubifs_bud, rb);
779 this = rb_parent(this);
780 if (this) {
781 if (this->rb_left == &bud->rb)
782 this->rb_left = NULL;
783 else
784 this->rb_right = NULL;
785 }
786 kfree(bud);
787 }
788 }
789}
790
791/**
792 * check_volume_empty - check if the UBI volume is empty.
793 * @c: UBIFS file-system description object
794 *
795 * This function checks if the UBIFS volume is empty by looking if its LEBs are
796 * mapped or not. The result of checking is stored in the @c->empty variable.
797 * Returns zero in case of success and a negative error code in case of
798 * failure.
799 */
800static int check_volume_empty(struct ubifs_info *c)
801{
802 int lnum, err;
803
804 c->empty = 1;
805 for (lnum = 0; lnum < c->leb_cnt; lnum++) {
806 err = ubi_is_mapped(c->ubi, lnum);
807 if (unlikely(err < 0))
808 return err;
809 if (err == 1) {
810 c->empty = 0;
811 break;
812 }
813
814 cond_resched();
815 }
816
817 return 0;
818}
819
820/*
821 * UBIFS mount options.
822 *
823 * Opt_fast_unmount: do not run a journal commit before un-mounting
824 * Opt_norm_unmount: run a journal commit before un-mounting
825 * Opt_err: just end of array marker
826 */
827enum {
828 Opt_fast_unmount,
829 Opt_norm_unmount,
830 Opt_err,
831};
832
833static match_table_t tokens = {
834 {Opt_fast_unmount, "fast_unmount"},
835 {Opt_norm_unmount, "norm_unmount"},
836 {Opt_err, NULL},
837};
838
839/**
840 * ubifs_parse_options - parse mount parameters.
841 * @c: UBIFS file-system description object
842 * @options: parameters to parse
843 * @is_remount: non-zero if this is FS re-mount
844 *
845 * This function parses UBIFS mount options and returns zero in case success
846 * and a negative error code in case of failure.
847 */
848static int ubifs_parse_options(struct ubifs_info *c, char *options,
849 int is_remount)
850{
851 char *p;
852 substring_t args[MAX_OPT_ARGS];
853
854 if (!options)
855 return 0;
856
857 while ((p = strsep(&options, ","))) {
858 int token;
859
860 if (!*p)
861 continue;
862
863 token = match_token(p, tokens, args);
864 switch (token) {
865 case Opt_fast_unmount:
866 c->mount_opts.unmount_mode = 2;
867 c->fast_unmount = 1;
868 break;
869 case Opt_norm_unmount:
870 c->mount_opts.unmount_mode = 1;
871 c->fast_unmount = 0;
872 break;
873 default:
874 ubifs_err("unrecognized mount option \"%s\" "
875 "or missing value", p);
876 return -EINVAL;
877 }
878 }
879
880 return 0;
881}
882
883/**
884 * destroy_journal - destroy journal data structures.
885 * @c: UBIFS file-system description object
886 *
887 * This function destroys journal data structures including those that may have
888 * been created by recovery functions.
889 */
890static void destroy_journal(struct ubifs_info *c)
891{
892 while (!list_empty(&c->unclean_leb_list)) {
893 struct ubifs_unclean_leb *ucleb;
894
895 ucleb = list_entry(c->unclean_leb_list.next,
896 struct ubifs_unclean_leb, list);
897 list_del(&ucleb->list);
898 kfree(ucleb);
899 }
900 while (!list_empty(&c->old_buds)) {
901 struct ubifs_bud *bud;
902
903 bud = list_entry(c->old_buds.next, struct ubifs_bud, list);
904 list_del(&bud->list);
905 kfree(bud);
906 }
907 ubifs_destroy_idx_gc(c);
908 ubifs_destroy_size_tree(c);
909 ubifs_tnc_close(c);
910 free_buds(c);
911}
912
913/**
914 * mount_ubifs - mount UBIFS file-system.
915 * @c: UBIFS file-system description object
916 *
917 * This function mounts UBIFS file system. Returns zero in case of success and
918 * a negative error code in case of failure.
919 *
920 * Note, the function does not de-allocate resources it it fails half way
921 * through, and the caller has to do this instead.
922 */
923static int mount_ubifs(struct ubifs_info *c)
924{
925 struct super_block *sb = c->vfs_sb;
926 int err, mounted_read_only = (sb->s_flags & MS_RDONLY);
927 long long x;
928 size_t sz;
929
930 err = init_constants_early(c);
931 if (err)
932 return err;
933
934#ifdef CONFIG_UBIFS_FS_DEBUG
935 c->dbg_buf = vmalloc(c->leb_size);
936 if (!c->dbg_buf)
937 return -ENOMEM;
938#endif
939
940 err = check_volume_empty(c);
941 if (err)
942 goto out_free;
943
944 if (c->empty && (mounted_read_only || c->ro_media)) {
945 /*
946 * This UBI volume is empty, and read-only, or the file system
947 * is mounted read-only - we cannot format it.
948 */
949 ubifs_err("can't format empty UBI volume: read-only %s",
950 c->ro_media ? "UBI volume" : "mount");
951 err = -EROFS;
952 goto out_free;
953 }
954
955 if (c->ro_media && !mounted_read_only) {
956 ubifs_err("cannot mount read-write - read-only media");
957 err = -EROFS;
958 goto out_free;
959 }
960
961 /*
962 * The requirement for the buffer is that it should fit indexing B-tree
963 * height amount of integers. We assume the height if the TNC tree will
964 * never exceed 64.
965 */
966 err = -ENOMEM;
967 c->bottom_up_buf = kmalloc(BOTTOM_UP_HEIGHT * sizeof(int), GFP_KERNEL);
968 if (!c->bottom_up_buf)
969 goto out_free;
970
971 c->sbuf = vmalloc(c->leb_size);
972 if (!c->sbuf)
973 goto out_free;
974
975 if (!mounted_read_only) {
976 c->ileb_buf = vmalloc(c->leb_size);
977 if (!c->ileb_buf)
978 goto out_free;
979 }
980
981 err = ubifs_read_superblock(c);
982 if (err)
983 goto out_free;
984
985 /*
986 * Make sure the compressor which is set as the default on in the
987 * superblock was actually compiled in.
988 */
989 if (!ubifs_compr_present(c->default_compr)) {
990 ubifs_warn("'%s' compressor is set by superblock, but not "
991 "compiled in", ubifs_compr_name(c->default_compr));
992 c->default_compr = UBIFS_COMPR_NONE;
993 }
994
995 dbg_failure_mode_registration(c);
996
997 err = init_constants_late(c);
998 if (err)
999 goto out_dereg;
1000
1001 sz = ALIGN(c->max_idx_node_sz, c->min_io_size);
1002 sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size);
1003 c->cbuf = kmalloc(sz, GFP_NOFS);
1004 if (!c->cbuf) {
1005 err = -ENOMEM;
1006 goto out_dereg;
1007 }
1008
1009 if (!mounted_read_only) {
1010 err = alloc_wbufs(c);
1011 if (err)
1012 goto out_cbuf;
1013
1014 /* Create background thread */
1015 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num,
1016 c->vi.vol_id);
1017 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);
1018 if (!c->bgt)
1019 c->bgt = ERR_PTR(-EINVAL);
1020 if (IS_ERR(c->bgt)) {
1021 err = PTR_ERR(c->bgt);
1022 c->bgt = NULL;
1023 ubifs_err("cannot spawn \"%s\", error %d",
1024 c->bgt_name, err);
1025 goto out_wbufs;
1026 }
1027 wake_up_process(c->bgt);
1028 }
1029
1030 err = ubifs_read_master(c);
1031 if (err)
1032 goto out_master;
1033
1034 if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
1035 ubifs_msg("recovery needed");
1036 c->need_recovery = 1;
1037 if (!mounted_read_only) {
1038 err = ubifs_recover_inl_heads(c, c->sbuf);
1039 if (err)
1040 goto out_master;
1041 }
1042 } else if (!mounted_read_only) {
1043 /*
1044 * Set the "dirty" flag so that if we reboot uncleanly we
1045 * will notice this immediately on the next mount.
1046 */
1047 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
1048 err = ubifs_write_master(c);
1049 if (err)
1050 goto out_master;
1051 }
1052
1053 err = ubifs_lpt_init(c, 1, !mounted_read_only);
1054 if (err)
1055 goto out_lpt;
1056
1057 err = dbg_check_idx_size(c, c->old_idx_sz);
1058 if (err)
1059 goto out_lpt;
1060
1061 err = ubifs_replay_journal(c);
1062 if (err)
1063 goto out_journal;
1064
1065 err = ubifs_mount_orphans(c, c->need_recovery, mounted_read_only);
1066 if (err)
1067 goto out_orphans;
1068
1069 if (!mounted_read_only) {
1070 int lnum;
1071
1072 /* Check for enough free space */
1073 if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) {
1074 ubifs_err("insufficient available space");
1075 err = -EINVAL;
1076 goto out_orphans;
1077 }
1078
1079 /* Check for enough log space */
1080 lnum = c->lhead_lnum + 1;
1081 if (lnum >= UBIFS_LOG_LNUM + c->log_lebs)
1082 lnum = UBIFS_LOG_LNUM;
1083 if (lnum == c->ltail_lnum) {
1084 err = ubifs_consolidate_log(c);
1085 if (err)
1086 goto out_orphans;
1087 }
1088
1089 if (c->need_recovery) {
1090 err = ubifs_recover_size(c);
1091 if (err)
1092 goto out_orphans;
1093 err = ubifs_rcvry_gc_commit(c);
1094 } else
1095 err = take_gc_lnum(c);
1096 if (err)
1097 goto out_orphans;
1098
1099 err = dbg_check_lprops(c);
1100 if (err)
1101 goto out_orphans;
1102 } else if (c->need_recovery) {
1103 err = ubifs_recover_size(c);
1104 if (err)
1105 goto out_orphans;
1106 }
1107
1108 spin_lock(&ubifs_infos_lock);
1109 list_add_tail(&c->infos_list, &ubifs_infos);
1110 spin_unlock(&ubifs_infos_lock);
1111
1112 if (c->need_recovery) {
1113 if (mounted_read_only)
1114 ubifs_msg("recovery deferred");
1115 else {
1116 c->need_recovery = 0;
1117 ubifs_msg("recovery completed");
1118 }
1119 }
1120
1121 err = dbg_check_filesystem(c);
1122 if (err)
1123 goto out_infos;
1124
1125 ubifs_msg("mounted UBI device %d, volume %d", c->vi.ubi_num,
1126 c->vi.vol_id);
1127 if (mounted_read_only)
1128 ubifs_msg("mounted read-only");
1129 x = (long long)c->main_lebs * c->leb_size;
1130 ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)",
1131 x, x >> 10, x >> 20, c->main_lebs);
1132 x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
1133 ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)",
1134 x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt);
1135 ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr));
1136 ubifs_msg("media format %d, latest format %d",
1137 c->fmt_version, UBIFS_FORMAT_VERSION);
1138
1139 dbg_msg("compiled on: " __DATE__ " at " __TIME__);
1140 dbg_msg("min. I/O unit size: %d bytes", c->min_io_size);
1141 dbg_msg("LEB size: %d bytes (%d KiB)",
1142 c->leb_size, c->leb_size / 1024);
1143 dbg_msg("data journal heads: %d",
1144 c->jhead_cnt - NONDATA_JHEADS_CNT);
1145 dbg_msg("UUID: %02X%02X%02X%02X-%02X%02X"
1146 "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X",
1147 c->uuid[0], c->uuid[1], c->uuid[2], c->uuid[3],
1148 c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7],
1149 c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11],
1150 c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]);
1151 dbg_msg("fast unmount: %d", c->fast_unmount);
1152 dbg_msg("big_lpt %d", c->big_lpt);
1153 dbg_msg("log LEBs: %d (%d - %d)",
1154 c->log_lebs, UBIFS_LOG_LNUM, c->log_last);
1155 dbg_msg("LPT area LEBs: %d (%d - %d)",
1156 c->lpt_lebs, c->lpt_first, c->lpt_last);
1157 dbg_msg("orphan area LEBs: %d (%d - %d)",
1158 c->orph_lebs, c->orph_first, c->orph_last);
1159 dbg_msg("main area LEBs: %d (%d - %d)",
1160 c->main_lebs, c->main_first, c->leb_cnt - 1);
1161 dbg_msg("index LEBs: %d", c->lst.idx_lebs);
1162 dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)",
1163 c->old_idx_sz, c->old_idx_sz >> 10, c->old_idx_sz >> 20);
1164 dbg_msg("key hash type: %d", c->key_hash_type);
1165 dbg_msg("tree fanout: %d", c->fanout);
1166 dbg_msg("reserved GC LEB: %d", c->gc_lnum);
1167 dbg_msg("first main LEB: %d", c->main_first);
1168 dbg_msg("dead watermark: %d", c->dead_wm);
1169 dbg_msg("dark watermark: %d", c->dark_wm);
1170 x = (long long)c->main_lebs * c->dark_wm;
1171 dbg_msg("max. dark space: %lld (%lld KiB, %lld MiB)",
1172 x, x >> 10, x >> 20);
1173 dbg_msg("maximum bud bytes: %lld (%lld KiB, %lld MiB)",
1174 c->max_bud_bytes, c->max_bud_bytes >> 10,
1175 c->max_bud_bytes >> 20);
1176 dbg_msg("BG commit bud bytes: %lld (%lld KiB, %lld MiB)",
1177 c->bg_bud_bytes, c->bg_bud_bytes >> 10,
1178 c->bg_bud_bytes >> 20);
1179 dbg_msg("current bud bytes %lld (%lld KiB, %lld MiB)",
1180 c->bud_bytes, c->bud_bytes >> 10, c->bud_bytes >> 20);
1181 dbg_msg("max. seq. number: %llu", c->max_sqnum);
1182 dbg_msg("commit number: %llu", c->cmt_no);
1183
1184 return 0;
1185
1186out_infos:
1187 spin_lock(&ubifs_infos_lock);
1188 list_del(&c->infos_list);
1189 spin_unlock(&ubifs_infos_lock);
1190out_orphans:
1191 free_orphans(c);
1192out_journal:
1193 destroy_journal(c);
1194out_lpt:
1195 ubifs_lpt_free(c, 0);
1196out_master:
1197 kfree(c->mst_node);
1198 kfree(c->rcvrd_mst_node);
1199 if (c->bgt)
1200 kthread_stop(c->bgt);
1201out_wbufs:
1202 free_wbufs(c);
1203out_cbuf:
1204 kfree(c->cbuf);
1205out_dereg:
1206 dbg_failure_mode_deregistration(c);
1207out_free:
1208 vfree(c->ileb_buf);
1209 vfree(c->sbuf);
1210 kfree(c->bottom_up_buf);
1211 UBIFS_DBG(vfree(c->dbg_buf));
1212 return err;
1213}
1214
1215/**
1216 * ubifs_umount - un-mount UBIFS file-system.
1217 * @c: UBIFS file-system description object
1218 *
1219 * Note, this function is called to free allocated resourced when un-mounting,
1220 * as well as free resources when an error occurred while we were half way
1221 * through mounting (error path cleanup function). So it has to make sure the
1222 * resource was actually allocated before freeing it.
1223 */
1224static void ubifs_umount(struct ubifs_info *c)
1225{
1226 dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num,
1227 c->vi.vol_id);
1228
1229 spin_lock(&ubifs_infos_lock);
1230 list_del(&c->infos_list);
1231 spin_unlock(&ubifs_infos_lock);
1232
1233 if (c->bgt)
1234 kthread_stop(c->bgt);
1235
1236 destroy_journal(c);
1237 free_wbufs(c);
1238 free_orphans(c);
1239 ubifs_lpt_free(c, 0);
1240
1241 kfree(c->cbuf);
1242 kfree(c->rcvrd_mst_node);
1243 kfree(c->mst_node);
1244 vfree(c->sbuf);
1245 kfree(c->bottom_up_buf);
1246 UBIFS_DBG(vfree(c->dbg_buf));
1247 vfree(c->ileb_buf);
1248 dbg_failure_mode_deregistration(c);
1249}
1250
1251/**
1252 * ubifs_remount_rw - re-mount in read-write mode.
1253 * @c: UBIFS file-system description object
1254 *
1255 * UBIFS avoids allocating many unnecessary resources when mounted in read-only
1256 * mode. This function allocates the needed resources and re-mounts UBIFS in
1257 * read-write mode.
1258 */
1259static int ubifs_remount_rw(struct ubifs_info *c)
1260{
1261 int err, lnum;
1262
1263 if (c->ro_media)
1264 return -EINVAL;
1265
1266 mutex_lock(&c->umount_mutex);
1267 c->remounting_rw = 1;
1268
1269 /* Check for enough free space */
1270 if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) {
1271 ubifs_err("insufficient available space");
1272 err = -EINVAL;
1273 goto out;
1274 }
1275
1276 if (c->old_leb_cnt != c->leb_cnt) {
1277 struct ubifs_sb_node *sup;
1278
1279 sup = ubifs_read_sb_node(c);
1280 if (IS_ERR(sup)) {
1281 err = PTR_ERR(sup);
1282 goto out;
1283 }
1284 sup->leb_cnt = cpu_to_le32(c->leb_cnt);
1285 err = ubifs_write_sb_node(c, sup);
1286 if (err)
1287 goto out;
1288 }
1289
1290 if (c->need_recovery) {
1291 ubifs_msg("completing deferred recovery");
1292 err = ubifs_write_rcvrd_mst_node(c);
1293 if (err)
1294 goto out;
1295 err = ubifs_recover_size(c);
1296 if (err)
1297 goto out;
1298 err = ubifs_clean_lebs(c, c->sbuf);
1299 if (err)
1300 goto out;
1301 err = ubifs_recover_inl_heads(c, c->sbuf);
1302 if (err)
1303 goto out;
1304 }
1305
1306 if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) {
1307 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
1308 err = ubifs_write_master(c);
1309 if (err)
1310 goto out;
1311 }
1312
1313 c->ileb_buf = vmalloc(c->leb_size);
1314 if (!c->ileb_buf) {
1315 err = -ENOMEM;
1316 goto out;
1317 }
1318
1319 err = ubifs_lpt_init(c, 0, 1);
1320 if (err)
1321 goto out;
1322
1323 err = alloc_wbufs(c);
1324 if (err)
1325 goto out;
1326
1327 ubifs_create_buds_lists(c);
1328
1329 /* Create background thread */
1330 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);
1331 if (!c->bgt)
1332 c->bgt = ERR_PTR(-EINVAL);
1333 if (IS_ERR(c->bgt)) {
1334 err = PTR_ERR(c->bgt);
1335 c->bgt = NULL;
1336 ubifs_err("cannot spawn \"%s\", error %d",
1337 c->bgt_name, err);
1338 return err;
1339 }
1340 wake_up_process(c->bgt);
1341
1342 c->orph_buf = vmalloc(c->leb_size);
1343 if (!c->orph_buf)
1344 return -ENOMEM;
1345
1346 /* Check for enough log space */
1347 lnum = c->lhead_lnum + 1;
1348 if (lnum >= UBIFS_LOG_LNUM + c->log_lebs)
1349 lnum = UBIFS_LOG_LNUM;
1350 if (lnum == c->ltail_lnum) {
1351 err = ubifs_consolidate_log(c);
1352 if (err)
1353 goto out;
1354 }
1355
1356 if (c->need_recovery)
1357 err = ubifs_rcvry_gc_commit(c);
1358 else
1359 err = take_gc_lnum(c);
1360 if (err)
1361 goto out;
1362
1363 if (c->need_recovery) {
1364 c->need_recovery = 0;
1365 ubifs_msg("deferred recovery completed");
1366 }
1367
1368 dbg_gen("re-mounted read-write");
1369 c->vfs_sb->s_flags &= ~MS_RDONLY;
1370 c->remounting_rw = 0;
1371 mutex_unlock(&c->umount_mutex);
1372 return 0;
1373
1374out:
1375 vfree(c->orph_buf);
1376 c->orph_buf = NULL;
1377 if (c->bgt) {
1378 kthread_stop(c->bgt);
1379 c->bgt = NULL;
1380 }
1381 free_wbufs(c);
1382 vfree(c->ileb_buf);
1383 c->ileb_buf = NULL;
1384 ubifs_lpt_free(c, 1);
1385 c->remounting_rw = 0;
1386 mutex_unlock(&c->umount_mutex);
1387 return err;
1388}
1389
1390/**
1391 * commit_on_unmount - commit the journal when un-mounting.
1392 * @c: UBIFS file-system description object
1393 *
1394 * This function is called during un-mounting and it commits the journal unless
1395 * the "fast unmount" mode is enabled. It also avoids committing the journal if
1396 * it contains too few data.
1397 *
1398 * Sometimes recovery requires the journal to be committed at least once, and
1399 * this function takes care about this.
1400 */
1401static void commit_on_unmount(struct ubifs_info *c)
1402{
1403 if (!c->fast_unmount) {
1404 long long bud_bytes;
1405
1406 spin_lock(&c->buds_lock);
1407 bud_bytes = c->bud_bytes;
1408 spin_unlock(&c->buds_lock);
1409 if (bud_bytes > c->leb_size)
1410 ubifs_run_commit(c);
1411 }
1412}
1413
1414/**
1415 * ubifs_remount_ro - re-mount in read-only mode.
1416 * @c: UBIFS file-system description object
1417 *
1418 * We rely on VFS to have stopped writing. Possibly the background thread could
1419 * be running a commit, however kthread_stop will wait in that case.
1420 */
1421static void ubifs_remount_ro(struct ubifs_info *c)
1422{
1423 int i, err;
1424
1425 ubifs_assert(!c->need_recovery);
1426 commit_on_unmount(c);
1427
1428 mutex_lock(&c->umount_mutex);
1429 if (c->bgt) {
1430 kthread_stop(c->bgt);
1431 c->bgt = NULL;
1432 }
1433
1434 for (i = 0; i < c->jhead_cnt; i++) {
1435 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1436 del_timer_sync(&c->jheads[i].wbuf.timer);
1437 }
1438
1439 if (!c->ro_media) {
1440 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
1441 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
1442 c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
1443 err = ubifs_write_master(c);
1444 if (err)
1445 ubifs_ro_mode(c, err);
1446 }
1447
1448 ubifs_destroy_idx_gc(c);
1449 free_wbufs(c);
1450 vfree(c->orph_buf);
1451 c->orph_buf = NULL;
1452 vfree(c->ileb_buf);
1453 c->ileb_buf = NULL;
1454 ubifs_lpt_free(c, 1);
1455 mutex_unlock(&c->umount_mutex);
1456}
1457
1458static void ubifs_put_super(struct super_block *sb)
1459{
1460 int i;
1461 struct ubifs_info *c = sb->s_fs_info;
1462
1463 ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num,
1464 c->vi.vol_id);
1465 /*
1466 * The following asserts are only valid if there has not been a failure
1467 * of the media. For example, there will be dirty inodes if we failed
1468 * to write them back because of I/O errors.
1469 */
1470 ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0);
1471 ubifs_assert(c->budg_idx_growth == 0);
1472 ubifs_assert(c->budg_data_growth == 0);
1473
1474 /*
1475 * The 'c->umount_lock' prevents races between UBIFS memory shrinker
1476 * and file system un-mount. Namely, it prevents the shrinker from
1477 * picking this superblock for shrinking - it will be just skipped if
1478 * the mutex is locked.
1479 */
1480 mutex_lock(&c->umount_mutex);
1481 if (!(c->vfs_sb->s_flags & MS_RDONLY)) {
1482 /*
1483 * First of all kill the background thread to make sure it does
1484 * not interfere with un-mounting and freeing resources.
1485 */
1486 if (c->bgt) {
1487 kthread_stop(c->bgt);
1488 c->bgt = NULL;
1489 }
1490
1491 /* Synchronize write-buffers */
1492 if (c->jheads)
1493 for (i = 0; i < c->jhead_cnt; i++) {
1494 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1495 del_timer_sync(&c->jheads[i].wbuf.timer);
1496 }
1497
1498 /*
1499 * On fatal errors c->ro_media is set to 1, in which case we do
1500 * not write the master node.
1501 */
1502 if (!c->ro_media) {
1503 /*
1504 * We are being cleanly unmounted which means the
1505 * orphans were killed - indicate this in the master
1506 * node. Also save the reserved GC LEB number.
1507 */
1508 int err;
1509
1510 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
1511 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
1512 c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
1513 err = ubifs_write_master(c);
1514 if (err)
1515 /*
1516 * Recovery will attempt to fix the master area
1517 * next mount, so we just print a message and
1518 * continue to unmount normally.
1519 */
1520 ubifs_err("failed to write master node, "
1521 "error %d", err);
1522 }
1523 }
1524
1525 ubifs_umount(c);
1526 bdi_destroy(&c->bdi);
1527 ubi_close_volume(c->ubi);
1528 mutex_unlock(&c->umount_mutex);
1529 kfree(c);
1530}
1531
1532static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1533{
1534 int err;
1535 struct ubifs_info *c = sb->s_fs_info;
1536
1537 dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags);
1538
1539 err = ubifs_parse_options(c, data, 1);
1540 if (err) {
1541 ubifs_err("invalid or unknown remount parameter");
1542 return err;
1543 }
1544 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
1545 err = ubifs_remount_rw(c);
1546 if (err)
1547 return err;
1548 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY))
1549 ubifs_remount_ro(c);
1550
1551 return 0;
1552}
1553
1554struct super_operations ubifs_super_operations = {
1555 .alloc_inode = ubifs_alloc_inode,
1556 .destroy_inode = ubifs_destroy_inode,
1557 .put_super = ubifs_put_super,
1558 .write_inode = ubifs_write_inode,
1559 .delete_inode = ubifs_delete_inode,
1560 .statfs = ubifs_statfs,
1561 .dirty_inode = ubifs_dirty_inode,
1562 .remount_fs = ubifs_remount_fs,
1563 .show_options = ubifs_show_options,
1564 .sync_fs = ubifs_sync_fs,
1565};
1566
1567/**
1568 * open_ubi - parse UBI device name string and open the UBI device.
1569 * @name: UBI volume name
1570 * @mode: UBI volume open mode
1571 *
1572 * There are several ways to specify UBI volumes when mounting UBIFS:
1573 * o ubiX_Y - UBI device number X, volume Y;
1574 * o ubiY - UBI device number 0, volume Y;
1575 * o ubiX:NAME - mount UBI device X, volume with name NAME;
1576 * o ubi:NAME - mount UBI device 0, volume with name NAME.
1577 *
1578 * Alternative '!' separator may be used instead of ':' (because some shells
1579 * like busybox may interpret ':' as an NFS host name separator). This function
1580 * returns ubi volume object in case of success and a negative error code in
1581 * case of failure.
1582 */
1583static struct ubi_volume_desc *open_ubi(const char *name, int mode)
1584{
1585 int dev, vol;
1586 char *endptr;
1587
1588 if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i')
1589 return ERR_PTR(-EINVAL);
1590
1591 /* ubi:NAME method */
1592 if ((name[3] == ':' || name[3] == '!') && name[4] != '\0')
1593 return ubi_open_volume_nm(0, name + 4, mode);
1594
1595 if (!isdigit(name[3]))
1596 return ERR_PTR(-EINVAL);
1597
1598 dev = simple_strtoul(name + 3, &endptr, 0);
1599
1600 /* ubiY method */
1601 if (*endptr == '\0')
1602 return ubi_open_volume(0, dev, mode);
1603
1604 /* ubiX_Y method */
1605 if (*endptr == '_' && isdigit(endptr[1])) {
1606 vol = simple_strtoul(endptr + 1, &endptr, 0);
1607 if (*endptr != '\0')
1608 return ERR_PTR(-EINVAL);
1609 return ubi_open_volume(dev, vol, mode);
1610 }
1611
1612 /* ubiX:NAME method */
1613 if ((*endptr == ':' || *endptr == '!') && endptr[1] != '\0')
1614 return ubi_open_volume_nm(dev, ++endptr, mode);
1615
1616 return ERR_PTR(-EINVAL);
1617}
1618
1619static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1620{
1621 struct ubi_volume_desc *ubi = sb->s_fs_info;
1622 struct ubifs_info *c;
1623 struct inode *root;
1624 int err;
1625
1626 c = kzalloc(sizeof(struct ubifs_info), GFP_KERNEL);
1627 if (!c)
1628 return -ENOMEM;
1629
1630 spin_lock_init(&c->cnt_lock);
1631 spin_lock_init(&c->cs_lock);
1632 spin_lock_init(&c->buds_lock);
1633 spin_lock_init(&c->space_lock);
1634 spin_lock_init(&c->orphan_lock);
1635 init_rwsem(&c->commit_sem);
1636 mutex_init(&c->lp_mutex);
1637 mutex_init(&c->tnc_mutex);
1638 mutex_init(&c->log_mutex);
1639 mutex_init(&c->mst_mutex);
1640 mutex_init(&c->umount_mutex);
1641 init_waitqueue_head(&c->cmt_wq);
1642 c->buds = RB_ROOT;
1643 c->old_idx = RB_ROOT;
1644 c->size_tree = RB_ROOT;
1645 c->orph_tree = RB_ROOT;
1646 INIT_LIST_HEAD(&c->infos_list);
1647 INIT_LIST_HEAD(&c->idx_gc);
1648 INIT_LIST_HEAD(&c->replay_list);
1649 INIT_LIST_HEAD(&c->replay_buds);
1650 INIT_LIST_HEAD(&c->uncat_list);
1651 INIT_LIST_HEAD(&c->empty_list);
1652 INIT_LIST_HEAD(&c->freeable_list);
1653 INIT_LIST_HEAD(&c->frdi_idx_list);
1654 INIT_LIST_HEAD(&c->unclean_leb_list);
1655 INIT_LIST_HEAD(&c->old_buds);
1656 INIT_LIST_HEAD(&c->orph_list);
1657 INIT_LIST_HEAD(&c->orph_new);
1658
1659 c->highest_inum = UBIFS_FIRST_INO;
1660 get_random_bytes(&c->vfs_gen, sizeof(int));
1661 c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM;
1662
1663 ubi_get_volume_info(ubi, &c->vi);
1664 ubi_get_device_info(c->vi.ubi_num, &c->di);
1665
1666 /* Re-open the UBI device in read-write mode */
1667 c->ubi = ubi_open_volume(c->vi.ubi_num, c->vi.vol_id, UBI_READWRITE);
1668 if (IS_ERR(c->ubi)) {
1669 err = PTR_ERR(c->ubi);
1670 goto out_free;
1671 }
1672
1673 /*
1674 * UBIFS provids 'backing_dev_info' in order to disable readahead. For
1675 * UBIFS, I/O is not deferred, it is done immediately in readpage,
1676 * which means the user would have to wait not just for their own I/O
1677 * but the readahead I/O as well i.e. completely pointless.
1678 *
1679 * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
1680 */
1681 c->bdi.capabilities = BDI_CAP_MAP_COPY;
1682 c->bdi.unplug_io_fn = default_unplug_io_fn;
1683 err = bdi_init(&c->bdi);
1684 if (err)
1685 goto out_close;
1686
1687 err = ubifs_parse_options(c, data, 0);
1688 if (err)
1689 goto out_bdi;
1690
1691 c->vfs_sb = sb;
1692
1693 sb->s_fs_info = c;
1694 sb->s_magic = UBIFS_SUPER_MAGIC;
1695 sb->s_blocksize = UBIFS_BLOCK_SIZE;
1696 sb->s_blocksize_bits = UBIFS_BLOCK_SHIFT;
1697 sb->s_dev = c->vi.cdev;
1698 sb->s_maxbytes = c->max_inode_sz = key_max_inode_size(c);
1699 if (c->max_inode_sz > MAX_LFS_FILESIZE)
1700 sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE;
1701 sb->s_op = &ubifs_super_operations;
1702
1703 mutex_lock(&c->umount_mutex);
1704 err = mount_ubifs(c);
1705 if (err) {
1706 ubifs_assert(err < 0);
1707 goto out_unlock;
1708 }
1709
1710 /* Read the root inode */
1711 root = ubifs_iget(sb, UBIFS_ROOT_INO);
1712 if (IS_ERR(root)) {
1713 err = PTR_ERR(root);
1714 goto out_umount;
1715 }
1716
1717 sb->s_root = d_alloc_root(root);
1718 if (!sb->s_root)
1719 goto out_iput;
1720
1721 mutex_unlock(&c->umount_mutex);
1722
1723 return 0;
1724
1725out_iput:
1726 iput(root);
1727out_umount:
1728 ubifs_umount(c);
1729out_unlock:
1730 mutex_unlock(&c->umount_mutex);
1731out_bdi:
1732 bdi_destroy(&c->bdi);
1733out_close:
1734 ubi_close_volume(c->ubi);
1735out_free:
1736 kfree(c);
1737 return err;
1738}
1739
1740static int sb_test(struct super_block *sb, void *data)
1741{
1742 dev_t *dev = data;
1743
1744 return sb->s_dev == *dev;
1745}
1746
1747static int sb_set(struct super_block *sb, void *data)
1748{
1749 dev_t *dev = data;
1750
1751 sb->s_dev = *dev;
1752 return 0;
1753}
1754
1755static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
1756 const char *name, void *data, struct vfsmount *mnt)
1757{
1758 struct ubi_volume_desc *ubi;
1759 struct ubi_volume_info vi;
1760 struct super_block *sb;
1761 int err;
1762
1763 dbg_gen("name %s, flags %#x", name, flags);
1764
1765 /*
1766 * Get UBI device number and volume ID. Mount it read-only so far
1767 * because this might be a new mount point, and UBI allows only one
1768 * read-write user at a time.
1769 */
1770 ubi = open_ubi(name, UBI_READONLY);
1771 if (IS_ERR(ubi)) {
1772 ubifs_err("cannot open \"%s\", error %d",
1773 name, (int)PTR_ERR(ubi));
1774 return PTR_ERR(ubi);
1775 }
1776 ubi_get_volume_info(ubi, &vi);
1777
1778 dbg_gen("opened ubi%d_%d", vi.ubi_num, vi.vol_id);
1779
1780 sb = sget(fs_type, &sb_test, &sb_set, &vi.cdev);
1781 if (IS_ERR(sb)) {
1782 err = PTR_ERR(sb);
1783 goto out_close;
1784 }
1785
1786 if (sb->s_root) {
1787 /* A new mount point for already mounted UBIFS */
1788 dbg_gen("this ubi volume is already mounted");
1789 if ((flags ^ sb->s_flags) & MS_RDONLY) {
1790 err = -EBUSY;
1791 goto out_deact;
1792 }
1793 } else {
1794 sb->s_flags = flags;
1795 /*
1796 * Pass 'ubi' to 'fill_super()' in sb->s_fs_info where it is
1797 * replaced by 'c'.
1798 */
1799 sb->s_fs_info = ubi;
1800 err = ubifs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
1801 if (err)
1802 goto out_deact;
1803 /* We do not support atime */
1804 sb->s_flags |= MS_ACTIVE | MS_NOATIME;
1805 }
1806
1807 /* 'fill_super()' opens ubi again so we must close it here */
1808 ubi_close_volume(ubi);
1809
1810 return simple_set_mnt(mnt, sb);
1811
1812out_deact:
1813 up_write(&sb->s_umount);
1814 deactivate_super(sb);
1815out_close:
1816 ubi_close_volume(ubi);
1817 return err;
1818}
1819
1820static void ubifs_kill_sb(struct super_block *sb)
1821{
1822 struct ubifs_info *c = sb->s_fs_info;
1823
1824 /*
1825 * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()'
1826 * in order to be outside BKL.
1827 */
1828 if (sb->s_root && !(sb->s_flags & MS_RDONLY))
1829 commit_on_unmount(c);
1830 /* The un-mount routine is actually done in put_super() */
1831 generic_shutdown_super(sb);
1832}
1833
1834static struct file_system_type ubifs_fs_type = {
1835 .name = "ubifs",
1836 .owner = THIS_MODULE,
1837 .get_sb = ubifs_get_sb,
1838 .kill_sb = ubifs_kill_sb
1839};
1840
1841/*
1842 * Inode slab cache constructor.
1843 */
1844static void inode_slab_ctor(struct kmem_cache *cachep, void *obj)
1845{
1846 struct ubifs_inode *ui = obj;
1847 inode_init_once(&ui->vfs_inode);
1848}
1849
1850static int __init ubifs_init(void)
1851{
1852 int err;
1853
1854 BUILD_BUG_ON(sizeof(struct ubifs_ch) != 24);
1855
1856 /* Make sure node sizes are 8-byte aligned */
1857 BUILD_BUG_ON(UBIFS_CH_SZ & 7);
1858 BUILD_BUG_ON(UBIFS_INO_NODE_SZ & 7);
1859 BUILD_BUG_ON(UBIFS_DENT_NODE_SZ & 7);
1860 BUILD_BUG_ON(UBIFS_XENT_NODE_SZ & 7);
1861 BUILD_BUG_ON(UBIFS_DATA_NODE_SZ & 7);
1862 BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ & 7);
1863 BUILD_BUG_ON(UBIFS_SB_NODE_SZ & 7);
1864 BUILD_BUG_ON(UBIFS_MST_NODE_SZ & 7);
1865 BUILD_BUG_ON(UBIFS_REF_NODE_SZ & 7);
1866 BUILD_BUG_ON(UBIFS_CS_NODE_SZ & 7);
1867 BUILD_BUG_ON(UBIFS_ORPH_NODE_SZ & 7);
1868
1869 BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ & 7);
1870 BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ & 7);
1871 BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ & 7);
1872 BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ & 7);
1873 BUILD_BUG_ON(UBIFS_MAX_NODE_SZ & 7);
1874 BUILD_BUG_ON(MIN_WRITE_SZ & 7);
1875
1876 /* Check min. node size */
1877 BUILD_BUG_ON(UBIFS_INO_NODE_SZ < MIN_WRITE_SZ);
1878 BUILD_BUG_ON(UBIFS_DENT_NODE_SZ < MIN_WRITE_SZ);
1879 BUILD_BUG_ON(UBIFS_XENT_NODE_SZ < MIN_WRITE_SZ);
1880 BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ < MIN_WRITE_SZ);
1881
1882 BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ > UBIFS_MAX_NODE_SZ);
1883 BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ > UBIFS_MAX_NODE_SZ);
1884 BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ > UBIFS_MAX_NODE_SZ);
1885 BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ > UBIFS_MAX_NODE_SZ);
1886
1887 /* Defined node sizes */
1888 BUILD_BUG_ON(UBIFS_SB_NODE_SZ != 4096);
1889 BUILD_BUG_ON(UBIFS_MST_NODE_SZ != 512);
1890 BUILD_BUG_ON(UBIFS_INO_NODE_SZ != 160);
1891 BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64);
1892
1893 /*
1894 * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to
1895 * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2.
1896 */
1897 if (PAGE_CACHE_SIZE < UBIFS_BLOCK_SIZE) {
1898 ubifs_err("VFS page cache size is %u bytes, but UBIFS requires"
1899 " at least 4096 bytes",
1900 (unsigned int)PAGE_CACHE_SIZE);
1901 return -EINVAL;
1902 }
1903
1904 err = register_filesystem(&ubifs_fs_type);
1905 if (err) {
1906 ubifs_err("cannot register file system, error %d", err);
1907 return err;
1908 }
1909
1910 err = -ENOMEM;
1911 ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab",
1912 sizeof(struct ubifs_inode), 0,
1913 SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT,
1914 &inode_slab_ctor);
1915 if (!ubifs_inode_slab)
1916 goto out_reg;
1917
1918 register_shrinker(&ubifs_shrinker_info);
1919
1920 err = ubifs_compressors_init();
1921 if (err)
1922 goto out_compr;
1923
1924 return 0;
1925
1926out_compr:
1927 unregister_shrinker(&ubifs_shrinker_info);
1928 kmem_cache_destroy(ubifs_inode_slab);
1929out_reg:
1930 unregister_filesystem(&ubifs_fs_type);
1931 return err;
1932}
1933/* late_initcall to let compressors initialize first */
1934late_initcall(ubifs_init);
1935
1936static void __exit ubifs_exit(void)
1937{
1938 ubifs_assert(list_empty(&ubifs_infos));
1939 ubifs_assert(atomic_long_read(&ubifs_clean_zn_cnt) == 0);
1940
1941 ubifs_compressors_exit();
1942 unregister_shrinker(&ubifs_shrinker_info);
1943 kmem_cache_destroy(ubifs_inode_slab);
1944 unregister_filesystem(&ubifs_fs_type);
1945}
1946module_exit(ubifs_exit);
1947
1948MODULE_LICENSE("GPL");
1949MODULE_VERSION(__stringify(UBIFS_VERSION));
1950MODULE_AUTHOR("Artem Bityutskiy, Adrian Hunter");
1951MODULE_DESCRIPTION("UBIFS - UBI File System");
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
new file mode 100644
index 000000000000..e909f4a96443
--- /dev/null
+++ b/fs/ubifs/tnc.c
@@ -0,0 +1,2956 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Adrian Hunter
20 * Artem Bityutskiy (Битюцкий Артём)
21 */
22
23/*
24 * This file implements TNC (Tree Node Cache) which caches indexing nodes of
25 * the UBIFS B-tree.
26 *
27 * At the moment the locking rules of the TNC tree are quite simple and
28 * straightforward. We just have a mutex and lock it when we traverse the
29 * tree. If a znode is not in memory, we read it from flash while still having
30 * the mutex locked.
31 */
32
33#include <linux/crc32.h>
34#include "ubifs.h"
35
/*
 * Return codes of the 'matches_name()' and 'fallible_matches_name()'
 * functions.
 * @NAME_LESS: name referred to by the zbranch is less than the name looked up
 * @NAME_MATCHES: names match
 * @NAME_GREATER: name referred to by the zbranch is greater than the name
 *                looked up
 * @NOT_ON_MEDIA: node referred to by the zbranch does not exist on the media
 *
 * These constants were introduced to improve readability.
 */
enum {
	NAME_LESS,
	NAME_MATCHES,
	NAME_GREATER,
	NOT_ON_MEDIA,
};
52
53/**
54 * insert_old_idx - record an index node obsoleted since the last commit start.
55 * @c: UBIFS file-system description object
56 * @lnum: LEB number of obsoleted index node
57 * @offs: offset of obsoleted index node
58 *
59 * Returns %0 on success, and a negative error code on failure.
60 *
61 * For recovery, there must always be a complete intact version of the index on
62 * flash at all times. That is called the "old index". It is the index as at the
63 * time of the last successful commit. Many of the index nodes in the old index
64 * may be dirty, but they must not be erased until the next successful commit
65 * (at which point that index becomes the old index).
66 *
67 * That means that the garbage collection and the in-the-gaps method of
68 * committing must be able to determine if an index node is in the old index.
69 * Most of the old index nodes can be found by looking up the TNC using the
70 * 'lookup_znode()' function. However, some of the old index nodes may have
71 * been deleted from the current index or may have been changed so much that
72 * they cannot be easily found. In those cases, an entry is added to an RB-tree.
73 * That is what this function does. The RB-tree is ordered by LEB number and
74 * offset because they uniquely identify the old index node.
75 */
76static int insert_old_idx(struct ubifs_info *c, int lnum, int offs)
77{
78 struct ubifs_old_idx *old_idx, *o;
79 struct rb_node **p, *parent = NULL;
80
81 old_idx = kmalloc(sizeof(struct ubifs_old_idx), GFP_NOFS);
82 if (unlikely(!old_idx))
83 return -ENOMEM;
84 old_idx->lnum = lnum;
85 old_idx->offs = offs;
86
87 p = &c->old_idx.rb_node;
88 while (*p) {
89 parent = *p;
90 o = rb_entry(parent, struct ubifs_old_idx, rb);
91 if (lnum < o->lnum)
92 p = &(*p)->rb_left;
93 else if (lnum > o->lnum)
94 p = &(*p)->rb_right;
95 else if (offs < o->offs)
96 p = &(*p)->rb_left;
97 else if (offs > o->offs)
98 p = &(*p)->rb_right;
99 else {
100 ubifs_err("old idx added twice!");
101 kfree(old_idx);
102 return 0;
103 }
104 }
105 rb_link_node(&old_idx->rb, parent, p);
106 rb_insert_color(&old_idx->rb, &c->old_idx);
107 return 0;
108}
109
110/**
111 * insert_old_idx_znode - record a znode obsoleted since last commit start.
112 * @c: UBIFS file-system description object
113 * @znode: znode of obsoleted index node
114 *
115 * Returns %0 on success, and a negative error code on failure.
116 */
117int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode)
118{
119 if (znode->parent) {
120 struct ubifs_zbranch *zbr;
121
122 zbr = &znode->parent->zbranch[znode->iip];
123 if (zbr->len)
124 return insert_old_idx(c, zbr->lnum, zbr->offs);
125 } else
126 if (c->zroot.len)
127 return insert_old_idx(c, c->zroot.lnum,
128 c->zroot.offs);
129 return 0;
130}
131
132/**
133 * ins_clr_old_idx_znode - record a znode obsoleted since last commit start.
134 * @c: UBIFS file-system description object
135 * @znode: znode of obsoleted index node
136 *
137 * Returns %0 on success, and a negative error code on failure.
138 */
139static int ins_clr_old_idx_znode(struct ubifs_info *c,
140 struct ubifs_znode *znode)
141{
142 int err;
143
144 if (znode->parent) {
145 struct ubifs_zbranch *zbr;
146
147 zbr = &znode->parent->zbranch[znode->iip];
148 if (zbr->len) {
149 err = insert_old_idx(c, zbr->lnum, zbr->offs);
150 if (err)
151 return err;
152 zbr->lnum = 0;
153 zbr->offs = 0;
154 zbr->len = 0;
155 }
156 } else
157 if (c->zroot.len) {
158 err = insert_old_idx(c, c->zroot.lnum, c->zroot.offs);
159 if (err)
160 return err;
161 c->zroot.lnum = 0;
162 c->zroot.offs = 0;
163 c->zroot.len = 0;
164 }
165 return 0;
166}
167
168/**
169 * destroy_old_idx - destroy the old_idx RB-tree.
170 * @c: UBIFS file-system description object
171 *
172 * During start commit, the old_idx RB-tree is used to avoid overwriting index
173 * nodes that were in the index last commit but have since been deleted. This
174 * is necessary for recovery i.e. the old index must be kept intact until the
175 * new index is successfully written. The old-idx RB-tree is used for the
176 * in-the-gaps method of writing index nodes and is destroyed every commit.
177 */
178void destroy_old_idx(struct ubifs_info *c)
179{
180 struct rb_node *this = c->old_idx.rb_node;
181 struct ubifs_old_idx *old_idx;
182
183 while (this) {
184 if (this->rb_left) {
185 this = this->rb_left;
186 continue;
187 } else if (this->rb_right) {
188 this = this->rb_right;
189 continue;
190 }
191 old_idx = rb_entry(this, struct ubifs_old_idx, rb);
192 this = rb_parent(this);
193 if (this) {
194 if (this->rb_left == &old_idx->rb)
195 this->rb_left = NULL;
196 else
197 this->rb_right = NULL;
198 }
199 kfree(old_idx);
200 }
201 c->old_idx = RB_ROOT;
202}
203
204/**
205 * copy_znode - copy a dirty znode.
206 * @c: UBIFS file-system description object
207 * @znode: znode to copy
208 *
209 * A dirty znode being committed may not be changed, so it is copied.
210 */
211static struct ubifs_znode *copy_znode(struct ubifs_info *c,
212 struct ubifs_znode *znode)
213{
214 struct ubifs_znode *zn;
215
216 zn = kmalloc(c->max_znode_sz, GFP_NOFS);
217 if (unlikely(!zn))
218 return ERR_PTR(-ENOMEM);
219
220 memcpy(zn, znode, c->max_znode_sz);
221 zn->cnext = NULL;
222 __set_bit(DIRTY_ZNODE, &zn->flags);
223 __clear_bit(COW_ZNODE, &zn->flags);
224
225 ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags));
226 __set_bit(OBSOLETE_ZNODE, &znode->flags);
227
228 if (znode->level != 0) {
229 int i;
230 const int n = zn->child_cnt;
231
232 /* The children now have new parent */
233 for (i = 0; i < n; i++) {
234 struct ubifs_zbranch *zbr = &zn->zbranch[i];
235
236 if (zbr->znode)
237 zbr->znode->parent = zn;
238 }
239 }
240
241 atomic_long_inc(&c->dirty_zn_cnt);
242 return zn;
243}
244
245/**
246 * add_idx_dirt - add dirt due to a dirty znode.
247 * @c: UBIFS file-system description object
248 * @lnum: LEB number of index node
249 * @dirt: size of index node
250 *
251 * This function updates lprops dirty space and the new size of the index.
252 */
253static int add_idx_dirt(struct ubifs_info *c, int lnum, int dirt)
254{
255 c->calc_idx_sz -= ALIGN(dirt, 8);
256 return ubifs_add_dirt(c, lnum, dirt);
257}
258
259/**
260 * dirty_cow_znode - ensure a znode is not being committed.
261 * @c: UBIFS file-system description object
262 * @zbr: branch of znode to check
263 *
264 * Returns dirtied znode on success or negative error code on failure.
265 */
266static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c,
267 struct ubifs_zbranch *zbr)
268{
269 struct ubifs_znode *znode = zbr->znode;
270 struct ubifs_znode *zn;
271 int err;
272
273 if (!test_bit(COW_ZNODE, &znode->flags)) {
274 /* znode is not being committed */
275 if (!test_and_set_bit(DIRTY_ZNODE, &znode->flags)) {
276 atomic_long_inc(&c->dirty_zn_cnt);
277 atomic_long_dec(&c->clean_zn_cnt);
278 atomic_long_dec(&ubifs_clean_zn_cnt);
279 err = add_idx_dirt(c, zbr->lnum, zbr->len);
280 if (unlikely(err))
281 return ERR_PTR(err);
282 }
283 return znode;
284 }
285
286 zn = copy_znode(c, znode);
287 if (unlikely(IS_ERR(zn)))
288 return zn;
289
290 if (zbr->len) {
291 err = insert_old_idx(c, zbr->lnum, zbr->offs);
292 if (unlikely(err))
293 return ERR_PTR(err);
294 err = add_idx_dirt(c, zbr->lnum, zbr->len);
295 } else
296 err = 0;
297
298 zbr->znode = zn;
299 zbr->lnum = 0;
300 zbr->offs = 0;
301 zbr->len = 0;
302
303 if (unlikely(err))
304 return ERR_PTR(err);
305 return zn;
306}
307
308/**
309 * lnc_add - add a leaf node to the leaf node cache.
310 * @c: UBIFS file-system description object
311 * @zbr: zbranch of leaf node
312 * @node: leaf node
313 *
314 * Leaf nodes are non-index nodes directory entry nodes or data nodes. The
315 * purpose of the leaf node cache is to save re-reading the same leaf node over
316 * and over again. Most things are cached by VFS, however the file system must
317 * cache directory entries for readdir and for resolving hash collisions. The
318 * present implementation of the leaf node cache is extremely simple, and
319 * allows for error returns that are not used but that may be needed if a more
320 * complex implementation is created.
321 *
322 * Note, this function does not add the @node object to LNC directly, but
323 * allocates a copy of the object and adds the copy to LNC. The reason for this
324 * is that @node has been allocated outside of the TNC subsystem and will be
325 * used with @c->tnc_mutex unlock upon return from the TNC subsystem. But LNC
326 * may be changed at any time, e.g. freed by the shrinker.
327 */
328static int lnc_add(struct ubifs_info *c, struct ubifs_zbranch *zbr,
329 const void *node)
330{
331 int err;
332 void *lnc_node;
333 const struct ubifs_dent_node *dent = node;
334
335 ubifs_assert(!zbr->leaf);
336 ubifs_assert(zbr->len != 0);
337 ubifs_assert(is_hash_key(c, &zbr->key));
338
339 err = ubifs_validate_entry(c, dent);
340 if (err) {
341 dbg_dump_stack();
342 dbg_dump_node(c, dent);
343 return err;
344 }
345
346 lnc_node = kmalloc(zbr->len, GFP_NOFS);
347 if (!lnc_node)
348 /* We don't have to have the cache, so no error */
349 return 0;
350
351 memcpy(lnc_node, node, zbr->len);
352 zbr->leaf = lnc_node;
353 return 0;
354}
355
356 /**
357 * lnc_add_directly - add a leaf node to the leaf-node-cache.
358 * @c: UBIFS file-system description object
359 * @zbr: zbranch of leaf node
360 * @node: leaf node
361 *
362 * This function is similar to 'lnc_add()', but it does not create a copy of
363 * @node but inserts @node to TNC directly.
364 */
365static int lnc_add_directly(struct ubifs_info *c, struct ubifs_zbranch *zbr,
366 void *node)
367{
368 int err;
369
370 ubifs_assert(!zbr->leaf);
371 ubifs_assert(zbr->len != 0);
372
373 err = ubifs_validate_entry(c, node);
374 if (err) {
375 dbg_dump_stack();
376 dbg_dump_node(c, node);
377 return err;
378 }
379
380 zbr->leaf = node;
381 return 0;
382}
383
384/**
385 * lnc_free - remove a leaf node from the leaf node cache.
386 * @zbr: zbranch of leaf node
387 * @node: leaf node
388 */
389static void lnc_free(struct ubifs_zbranch *zbr)
390{
391 if (!zbr->leaf)
392 return;
393 kfree(zbr->leaf);
394 zbr->leaf = NULL;
395}
396
397/**
398 * tnc_read_node_nm - read a "hashed" leaf node.
399 * @c: UBIFS file-system description object
400 * @zbr: key and position of the node
401 * @node: node is returned here
402 *
403 * This function reads a "hashed" node defined by @zbr from the leaf node cache
404 * (in it is there) or from the hash media, in which case the node is also
405 * added to LNC. Returns zero in case of success or a negative negative error
406 * code in case of failure.
407 */
408static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr,
409 void *node)
410{
411 int err;
412
413 ubifs_assert(is_hash_key(c, &zbr->key));
414
415 if (zbr->leaf) {
416 /* Read from the leaf node cache */
417 ubifs_assert(zbr->len != 0);
418 memcpy(node, zbr->leaf, zbr->len);
419 return 0;
420 }
421
422 err = ubifs_tnc_read_node(c, zbr, node);
423 if (err)
424 return err;
425
426 /* Add the node to the leaf node cache */
427 err = lnc_add(c, zbr, node);
428 return err;
429}
430
431/**
432 * try_read_node - read a node if it is a node.
433 * @c: UBIFS file-system description object
434 * @buf: buffer to read to
435 * @type: node type
436 * @len: node length (not aligned)
437 * @lnum: LEB number of node to read
438 * @offs: offset of node to read
439 *
440 * This function tries to read a node of known type and length, checks it and
441 * stores it in @buf. This function returns %1 if a node is present and %0 if
442 * a node is not present. A negative error code is returned for I/O errors.
443 * This function performs that same function as ubifs_read_node except that
444 * it does not require that there is actually a node present and instead
445 * the return code indicates if a node was read.
446 */
447static int try_read_node(const struct ubifs_info *c, void *buf, int type,
448 int len, int lnum, int offs)
449{
450 int err, node_len;
451 struct ubifs_ch *ch = buf;
452 uint32_t crc, node_crc;
453
454 dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len);
455
456 err = ubi_read(c->ubi, lnum, buf, offs, len);
457 if (err) {
458 ubifs_err("cannot read node type %d from LEB %d:%d, error %d",
459 type, lnum, offs, err);
460 return err;
461 }
462
463 if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC)
464 return 0;
465
466 if (ch->node_type != type)
467 return 0;
468
469 node_len = le32_to_cpu(ch->len);
470 if (node_len != len)
471 return 0;
472
473 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
474 node_crc = le32_to_cpu(ch->crc);
475 if (crc != node_crc)
476 return 0;
477
478 return 1;
479}
480
481/**
482 * fallible_read_node - try to read a leaf node.
483 * @c: UBIFS file-system description object
484 * @key: key of node to read
485 * @zbr: position of node
486 * @node: node returned
487 *
488 * This function tries to read a node and returns %1 if the node is read, %0
489 * if the node is not present, and a negative error code in the case of error.
490 */
491static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
492 struct ubifs_zbranch *zbr, void *node)
493{
494 int ret;
495
496 dbg_tnc("LEB %d:%d, key %s", zbr->lnum, zbr->offs, DBGKEY(key));
497
498 ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum,
499 zbr->offs);
500 if (ret == 1) {
501 union ubifs_key node_key;
502 struct ubifs_dent_node *dent = node;
503
504 /* All nodes have key in the same place */
505 key_read(c, &dent->key, &node_key);
506 if (keys_cmp(c, key, &node_key) != 0)
507 ret = 0;
508 }
509 if (ret == 0)
510 dbg_mnt("dangling branch LEB %d:%d len %d, key %s",
511 zbr->lnum, zbr->offs, zbr->len, DBGKEY(key));
512 return ret;
513}
514
515/**
516 * matches_name - determine if a direntry or xattr entry matches a given name.
517 * @c: UBIFS file-system description object
518 * @zbr: zbranch of dent
519 * @nm: name to match
520 *
521 * This function checks if xentry/direntry referred by zbranch @zbr matches name
522 * @nm. Returns %NAME_MATCHES if it does, %NAME_LESS if the name referred by
523 * @zbr is less than @nm, and %NAME_GREATER if it is greater than @nm. In case
524 * of failure, a negative error code is returned.
525 */
526static int matches_name(struct ubifs_info *c, struct ubifs_zbranch *zbr,
527 const struct qstr *nm)
528{
529 struct ubifs_dent_node *dent;
530 int nlen, err;
531
532 /* If possible, match against the dent in the leaf node cache */
533 if (!zbr->leaf) {
534 dent = kmalloc(zbr->len, GFP_NOFS);
535 if (!dent)
536 return -ENOMEM;
537
538 err = ubifs_tnc_read_node(c, zbr, dent);
539 if (err)
540 goto out_free;
541
542 /* Add the node to the leaf node cache */
543 err = lnc_add_directly(c, zbr, dent);
544 if (err)
545 goto out_free;
546 } else
547 dent = zbr->leaf;
548
549 nlen = le16_to_cpu(dent->nlen);
550 err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len));
551 if (err == 0) {
552 if (nlen == nm->len)
553 return NAME_MATCHES;
554 else if (nlen < nm->len)
555 return NAME_LESS;
556 else
557 return NAME_GREATER;
558 } else if (err < 0)
559 return NAME_LESS;
560 else
561 return NAME_GREATER;
562
563out_free:
564 kfree(dent);
565 return err;
566}
567
568/**
569 * get_znode - get a TNC znode that may not be loaded yet.
570 * @c: UBIFS file-system description object
571 * @znode: parent znode
572 * @n: znode branch slot number
573 *
574 * This function returns the znode or a negative error code.
575 */
576static struct ubifs_znode *get_znode(struct ubifs_info *c,
577 struct ubifs_znode *znode, int n)
578{
579 struct ubifs_zbranch *zbr;
580
581 zbr = &znode->zbranch[n];
582 if (zbr->znode)
583 znode = zbr->znode;
584 else
585 znode = ubifs_load_znode(c, zbr, znode, n);
586 return znode;
587}
588
589/**
590 * tnc_next - find next TNC entry.
591 * @c: UBIFS file-system description object
592 * @zn: znode is passed and returned here
593 * @n: znode branch slot number is passed and returned here
594 *
595 * This function returns %0 if the next TNC entry is found, %-ENOENT if there is
596 * no next entry, or a negative error code otherwise.
597 */
598static int tnc_next(struct ubifs_info *c, struct ubifs_znode **zn, int *n)
599{
600 struct ubifs_znode *znode = *zn;
601 int nn = *n;
602
603 nn += 1;
604 if (nn < znode->child_cnt) {
605 *n = nn;
606 return 0;
607 }
608 while (1) {
609 struct ubifs_znode *zp;
610
611 zp = znode->parent;
612 if (!zp)
613 return -ENOENT;
614 nn = znode->iip + 1;
615 znode = zp;
616 if (nn < znode->child_cnt) {
617 znode = get_znode(c, znode, nn);
618 if (IS_ERR(znode))
619 return PTR_ERR(znode);
620 while (znode->level != 0) {
621 znode = get_znode(c, znode, 0);
622 if (IS_ERR(znode))
623 return PTR_ERR(znode);
624 }
625 nn = 0;
626 break;
627 }
628 }
629 *zn = znode;
630 *n = nn;
631 return 0;
632}
633
634/**
635 * tnc_prev - find previous TNC entry.
636 * @c: UBIFS file-system description object
637 * @zn: znode is returned here
638 * @n: znode branch slot number is passed and returned here
639 *
640 * This function returns %0 if the previous TNC entry is found, %-ENOENT if
641 * there is no next entry, or a negative error code otherwise.
642 */
643static int tnc_prev(struct ubifs_info *c, struct ubifs_znode **zn, int *n)
644{
645 struct ubifs_znode *znode = *zn;
646 int nn = *n;
647
648 if (nn > 0) {
649 *n = nn - 1;
650 return 0;
651 }
652 while (1) {
653 struct ubifs_znode *zp;
654
655 zp = znode->parent;
656 if (!zp)
657 return -ENOENT;
658 nn = znode->iip - 1;
659 znode = zp;
660 if (nn >= 0) {
661 znode = get_znode(c, znode, nn);
662 if (IS_ERR(znode))
663 return PTR_ERR(znode);
664 while (znode->level != 0) {
665 nn = znode->child_cnt - 1;
666 znode = get_znode(c, znode, nn);
667 if (IS_ERR(znode))
668 return PTR_ERR(znode);
669 }
670 nn = znode->child_cnt - 1;
671 break;
672 }
673 }
674 *zn = znode;
675 *n = nn;
676 return 0;
677}
678
679/**
680 * resolve_collision - resolve a collision.
681 * @c: UBIFS file-system description object
682 * @key: key of a directory or extended attribute entry
683 * @zn: znode is returned here
684 * @n: zbranch number is passed and returned here
685 * @nm: name of the entry
686 *
687 * This function is called for "hashed" keys to make sure that the found key
688 * really corresponds to the looked up node (directory or extended attribute
689 * entry). It returns %1 and sets @zn and @n if the collision is resolved.
690 * %0 is returned if @nm is not found and @zn and @n are set to the previous
691 * entry, i.e. to the entry after which @nm could follow if it were in TNC.
692 * This means that @n may be set to %-1 if the leftmost key in @zn is the
693 * previous one. A negative error code is returned on failures.
694 */
695static int resolve_collision(struct ubifs_info *c, const union ubifs_key *key,
696 struct ubifs_znode **zn, int *n,
697 const struct qstr *nm)
698{
699 int err;
700
701 err = matches_name(c, &(*zn)->zbranch[*n], nm);
702 if (unlikely(err < 0))
703 return err;
704 if (err == NAME_MATCHES)
705 return 1;
706
707 if (err == NAME_GREATER) {
708 /* Look left */
709 while (1) {
710 err = tnc_prev(c, zn, n);
711 if (err == -ENOENT) {
712 ubifs_assert(*n == 0);
713 *n = -1;
714 return 0;
715 }
716 if (err < 0)
717 return err;
718 if (keys_cmp(c, &(*zn)->zbranch[*n].key, key)) {
719 /*
720 * We have found the branch after which we would
721 * like to insert, but inserting in this znode
722 * may still be wrong. Consider the following 3
723 * znodes, in the case where we are resolving a
724 * collision with Key2.
725 *
726 * znode zp
727 * ----------------------
728 * level 1 | Key0 | Key1 |
729 * -----------------------
730 * | |
731 * znode za | | znode zb
732 * ------------ ------------
733 * level 0 | Key0 | | Key2 |
734 * ------------ ------------
735 *
736 * The lookup finds Key2 in znode zb. Lets say
737 * there is no match and the name is greater so
738 * we look left. When we find Key0, we end up
739 * here. If we return now, we will insert into
740 * znode za at slot n = 1. But that is invalid
741 * according to the parent's keys. Key2 must
742 * be inserted into znode zb.
743 *
744 * Note, this problem is not relevant for the
745 * case when we go right, because
746 * 'tnc_insert()' would correct the parent key.
747 */
748 if (*n == (*zn)->child_cnt - 1) {
749 err = tnc_next(c, zn, n);
750 if (err) {
751 /* Should be impossible */
752 ubifs_assert(0);
753 if (err == -ENOENT)
754 err = -EINVAL;
755 return err;
756 }
757 ubifs_assert(*n == 0);
758 *n = -1;
759 }
760 return 0;
761 }
762 err = matches_name(c, &(*zn)->zbranch[*n], nm);
763 if (err < 0)
764 return err;
765 if (err == NAME_LESS)
766 return 0;
767 if (err == NAME_MATCHES)
768 return 1;
769 ubifs_assert(err == NAME_GREATER);
770 }
771 } else {
772 int nn = *n;
773 struct ubifs_znode *znode = *zn;
774
775 /* Look right */
776 while (1) {
777 err = tnc_next(c, &znode, &nn);
778 if (err == -ENOENT)
779 return 0;
780 if (err < 0)
781 return err;
782 if (keys_cmp(c, &znode->zbranch[nn].key, key))
783 return 0;
784 err = matches_name(c, &znode->zbranch[nn], nm);
785 if (err < 0)
786 return err;
787 if (err == NAME_GREATER)
788 return 0;
789 *zn = znode;
790 *n = nn;
791 if (err == NAME_MATCHES)
792 return 1;
793 ubifs_assert(err == NAME_LESS);
794 }
795 }
796}
797
798/**
799 * fallible_matches_name - determine if a dent matches a given name.
800 * @c: UBIFS file-system description object
801 * @zbr: zbranch of dent
802 * @nm: name to match
803 *
804 * This is a "fallible" version of 'matches_name()' function which does not
805 * panic if the direntry/xentry referred by @zbr does not exist on the media.
806 *
807 * This function checks if xentry/direntry referred by zbranch @zbr matches name
808 * @nm. Returns %NAME_MATCHES it does, %NAME_LESS if the name referred by @zbr
809 * is less than @nm, %NAME_GREATER if it is greater than @nm, and @NOT_ON_MEDIA
810 * if xentry/direntry referred by @zbr does not exist on the media. A negative
811 * error code is returned in case of failure.
812 */
813static int fallible_matches_name(struct ubifs_info *c,
814 struct ubifs_zbranch *zbr,
815 const struct qstr *nm)
816{
817 struct ubifs_dent_node *dent;
818 int nlen, err;
819
820 /* If possible, match against the dent in the leaf node cache */
821 if (!zbr->leaf) {
822 dent = kmalloc(zbr->len, GFP_NOFS);
823 if (!dent)
824 return -ENOMEM;
825
826 err = fallible_read_node(c, &zbr->key, zbr, dent);
827 if (err < 0)
828 goto out_free;
829 if (err == 0) {
830 /* The node was not present */
831 err = NOT_ON_MEDIA;
832 goto out_free;
833 }
834 ubifs_assert(err == 1);
835
836 err = lnc_add_directly(c, zbr, dent);
837 if (err)
838 goto out_free;
839 } else
840 dent = zbr->leaf;
841
842 nlen = le16_to_cpu(dent->nlen);
843 err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len));
844 if (err == 0) {
845 if (nlen == nm->len)
846 return NAME_MATCHES;
847 else if (nlen < nm->len)
848 return NAME_LESS;
849 else
850 return NAME_GREATER;
851 } else if (err < 0)
852 return NAME_LESS;
853 else
854 return NAME_GREATER;
855
856out_free:
857 kfree(dent);
858 return err;
859}
860
861/**
862 * fallible_resolve_collision - resolve a collision even if nodes are missing.
863 * @c: UBIFS file-system description object
864 * @key: key
865 * @zn: znode is returned here
866 * @n: branch number is passed and returned here
867 * @nm: name of directory entry
868 * @adding: indicates caller is adding a key to the TNC
869 *
870 * This is a "fallible" version of the 'resolve_collision()' function which
871 * does not panic if one of the nodes referred to by TNC does not exist on the
872 * media. This may happen when replaying the journal if a deleted node was
873 * Garbage-collected and the commit was not done. A branch that refers to a node
874 * that is not present is called a dangling branch. The following are the return
875 * codes for this function:
876 * o if @nm was found, %1 is returned and @zn and @n are set to the found
877 * branch;
878 * o if we are @adding and @nm was not found, %0 is returned;
879 * o if we are not @adding and @nm was not found, but a dangling branch was
880 * found, then %1 is returned and @zn and @n are set to the dangling branch;
881 * o a negative error code is returned in case of failure.
882 */
883static int fallible_resolve_collision(struct ubifs_info *c,
884 const union ubifs_key *key,
885 struct ubifs_znode **zn, int *n,
886 const struct qstr *nm, int adding)
887{
888 struct ubifs_znode *o_znode = NULL, *znode = *zn;
889 int uninitialized_var(o_n), err, cmp, unsure = 0, nn = *n;
890
891 cmp = fallible_matches_name(c, &znode->zbranch[nn], nm);
892 if (unlikely(cmp < 0))
893 return cmp;
894 if (cmp == NAME_MATCHES)
895 return 1;
896 if (cmp == NOT_ON_MEDIA) {
897 o_znode = znode;
898 o_n = nn;
899 /*
900 * We are unlucky and hit a dangling branch straight away.
901 * Now we do not really know where to go to find the needed
902 * branch - to the left or to the right. Well, let's try left.
903 */
904 unsure = 1;
905 } else if (!adding)
906 unsure = 1; /* Remove a dangling branch wherever it is */
907
908 if (cmp == NAME_GREATER || unsure) {
909 /* Look left */
910 while (1) {
911 err = tnc_prev(c, zn, n);
912 if (err == -ENOENT) {
913 ubifs_assert(*n == 0);
914 *n = -1;
915 break;
916 }
917 if (err < 0)
918 return err;
919 if (keys_cmp(c, &(*zn)->zbranch[*n].key, key)) {
920 /* See comments in 'resolve_collision()' */
921 if (*n == (*zn)->child_cnt - 1) {
922 err = tnc_next(c, zn, n);
923 if (err) {
924 /* Should be impossible */
925 ubifs_assert(0);
926 if (err == -ENOENT)
927 err = -EINVAL;
928 return err;
929 }
930 ubifs_assert(*n == 0);
931 *n = -1;
932 }
933 break;
934 }
935 err = fallible_matches_name(c, &(*zn)->zbranch[*n], nm);
936 if (err < 0)
937 return err;
938 if (err == NAME_MATCHES)
939 return 1;
940 if (err == NOT_ON_MEDIA) {
941 o_znode = *zn;
942 o_n = *n;
943 continue;
944 }
945 if (!adding)
946 continue;
947 if (err == NAME_LESS)
948 break;
949 else
950 unsure = 0;
951 }
952 }
953
954 if (cmp == NAME_LESS || unsure) {
955 /* Look right */
956 *zn = znode;
957 *n = nn;
958 while (1) {
959 err = tnc_next(c, &znode, &nn);
960 if (err == -ENOENT)
961 break;
962 if (err < 0)
963 return err;
964 if (keys_cmp(c, &znode->zbranch[nn].key, key))
965 break;
966 err = fallible_matches_name(c, &znode->zbranch[nn], nm);
967 if (err < 0)
968 return err;
969 if (err == NAME_GREATER)
970 break;
971 *zn = znode;
972 *n = nn;
973 if (err == NAME_MATCHES)
974 return 1;
975 if (err == NOT_ON_MEDIA) {
976 o_znode = znode;
977 o_n = nn;
978 }
979 }
980 }
981
982 /* Never match a dangling branch when adding */
983 if (adding || !o_znode)
984 return 0;
985
986 dbg_mnt("dangling match LEB %d:%d len %d %s",
987 o_znode->zbranch[o_n].lnum, o_znode->zbranch[o_n].offs,
988 o_znode->zbranch[o_n].len, DBGKEY(key));
989 *zn = o_znode;
990 *n = o_n;
991 return 1;
992}
993
994/**
995 * matches_position - determine if a zbranch matches a given position.
996 * @zbr: zbranch of dent
997 * @lnum: LEB number of dent to match
998 * @offs: offset of dent to match
999 *
1000 * This function returns %1 if @lnum:@offs matches, and %0 otherwise.
1001 */
1002static int matches_position(struct ubifs_zbranch *zbr, int lnum, int offs)
1003{
1004 if (zbr->lnum == lnum && zbr->offs == offs)
1005 return 1;
1006 else
1007 return 0;
1008}
1009
1010/**
1011 * resolve_collision_directly - resolve a collision directly.
1012 * @c: UBIFS file-system description object
1013 * @key: key of directory entry
1014 * @zn: znode is passed and returned here
1015 * @n: zbranch number is passed and returned here
1016 * @lnum: LEB number of dent node to match
1017 * @offs: offset of dent node to match
1018 *
1019 * This function is used for "hashed" keys to make sure the found directory or
1020 * extended attribute entry node is what was looked for. It is used when the
1021 * flash address of the right node is known (@lnum:@offs) which makes it much
1022 * easier to resolve collisions (no need to read entries and match full
1023 * names). This function returns %1 and sets @zn and @n if the collision is
1024 * resolved, %0 if @lnum:@offs is not found and @zn and @n are set to the
1025 * previous directory entry. Otherwise a negative error code is returned.
1026 */
1027static int resolve_collision_directly(struct ubifs_info *c,
1028 const union ubifs_key *key,
1029 struct ubifs_znode **zn, int *n,
1030 int lnum, int offs)
1031{
1032 struct ubifs_znode *znode;
1033 int nn, err;
1034
1035 znode = *zn;
1036 nn = *n;
1037 if (matches_position(&znode->zbranch[nn], lnum, offs))
1038 return 1;
1039
1040 /* Look left */
1041 while (1) {
1042 err = tnc_prev(c, &znode, &nn);
1043 if (err == -ENOENT)
1044 break;
1045 if (err < 0)
1046 return err;
1047 if (keys_cmp(c, &znode->zbranch[nn].key, key))
1048 break;
1049 if (matches_position(&znode->zbranch[nn], lnum, offs)) {
1050 *zn = znode;
1051 *n = nn;
1052 return 1;
1053 }
1054 }
1055
1056 /* Look right */
1057 znode = *zn;
1058 nn = *n;
1059 while (1) {
1060 err = tnc_next(c, &znode, &nn);
1061 if (err == -ENOENT)
1062 return 0;
1063 if (err < 0)
1064 return err;
1065 if (keys_cmp(c, &znode->zbranch[nn].key, key))
1066 return 0;
1067 *zn = znode;
1068 *n = nn;
1069 if (matches_position(&znode->zbranch[nn], lnum, offs))
1070 return 1;
1071 }
1072}
1073
1074/**
1075 * dirty_cow_bottom_up - dirty a znode and its ancestors.
1076 * @c: UBIFS file-system description object
1077 * @znode: znode to dirty
1078 *
1079 * If we do not have a unique key that resides in a znode, then we cannot
1080 * dirty that znode from the top down (i.e. by using lookup_level0_dirty)
1081 * This function records the path back to the last dirty ancestor, and then
1082 * dirties the znodes on that path.
1083 */
1084static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c,
1085 struct ubifs_znode *znode)
1086{
1087 struct ubifs_znode *zp;
1088 int *path = c->bottom_up_buf, p = 0;
1089
1090 ubifs_assert(c->zroot.znode);
1091 ubifs_assert(znode);
1092 if (c->zroot.znode->level > BOTTOM_UP_HEIGHT) {
1093 kfree(c->bottom_up_buf);
1094 c->bottom_up_buf = kmalloc(c->zroot.znode->level * sizeof(int),
1095 GFP_NOFS);
1096 if (!c->bottom_up_buf)
1097 return ERR_PTR(-ENOMEM);
1098 path = c->bottom_up_buf;
1099 }
1100 if (c->zroot.znode->level) {
1101 /* Go up until parent is dirty */
1102 while (1) {
1103 int n;
1104
1105 zp = znode->parent;
1106 if (!zp)
1107 break;
1108 n = znode->iip;
1109 ubifs_assert(p < c->zroot.znode->level);
1110 path[p++] = n;
1111 if (!zp->cnext && ubifs_zn_dirty(znode))
1112 break;
1113 znode = zp;
1114 }
1115 }
1116
1117 /* Come back down, dirtying as we go */
1118 while (1) {
1119 struct ubifs_zbranch *zbr;
1120
1121 zp = znode->parent;
1122 if (zp) {
1123 ubifs_assert(path[p - 1] >= 0);
1124 ubifs_assert(path[p - 1] < zp->child_cnt);
1125 zbr = &zp->zbranch[path[--p]];
1126 znode = dirty_cow_znode(c, zbr);
1127 } else {
1128 ubifs_assert(znode == c->zroot.znode);
1129 znode = dirty_cow_znode(c, &c->zroot);
1130 }
1131 if (unlikely(IS_ERR(znode)) || !p)
1132 break;
1133 ubifs_assert(path[p - 1] >= 0);
1134 ubifs_assert(path[p - 1] < znode->child_cnt);
1135 znode = znode->zbranch[path[p - 1]].znode;
1136 }
1137
1138 return znode;
1139}
1140
1141/**
1142 * ubifs_lookup_level0 - search for zero-level znode.
1143 * @c: UBIFS file-system description object
1144 * @key: key to lookup
1145 * @zn: znode is returned here
1146 * @n: znode branch slot number is returned here
1147 *
1148 * This function looks up the TNC tree and search for zero-level znode which
1149 * refers key @key. The found zero-level znode is returned in @zn. There are 3
1150 * cases:
1151 * o exact match, i.e. the found zero-level znode contains key @key, then %1
1152 * is returned and slot number of the matched branch is stored in @n;
1153 * o not exact match, which means that zero-level znode does not contain
1154 * @key, then %0 is returned and slot number of the closed branch is stored
1155 * in @n;
1156 * o @key is so small that it is even less than the lowest key of the
1157 * leftmost zero-level node, then %0 is returned and %0 is stored in @n.
1158 *
1159 * Note, when the TNC tree is traversed, some znodes may be absent, then this
1160 * function reads corresponding indexing nodes and inserts them to TNC. In
1161 * case of failure, a negative error code is returned.
1162 */
1163int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
1164 struct ubifs_znode **zn, int *n)
1165{
1166 int err, exact;
1167 struct ubifs_znode *znode;
1168 unsigned long time = get_seconds();
1169
1170 dbg_tnc("search key %s", DBGKEY(key));
1171
1172 znode = c->zroot.znode;
1173 if (unlikely(!znode)) {
1174 znode = ubifs_load_znode(c, &c->zroot, NULL, 0);
1175 if (IS_ERR(znode))
1176 return PTR_ERR(znode);
1177 }
1178
1179 znode->time = time;
1180
1181 while (1) {
1182 struct ubifs_zbranch *zbr;
1183
1184 exact = ubifs_search_zbranch(c, znode, key, n);
1185
1186 if (znode->level == 0)
1187 break;
1188
1189 if (*n < 0)
1190 *n = 0;
1191 zbr = &znode->zbranch[*n];
1192
1193 if (zbr->znode) {
1194 znode->time = time;
1195 znode = zbr->znode;
1196 continue;
1197 }
1198
1199 /* znode is not in TNC cache, load it from the media */
1200 znode = ubifs_load_znode(c, zbr, znode, *n);
1201 if (IS_ERR(znode))
1202 return PTR_ERR(znode);
1203 }
1204
1205 *zn = znode;
1206 if (exact || !is_hash_key(c, key) || *n != -1) {
1207 dbg_tnc("found %d, lvl %d, n %d", exact, znode->level, *n);
1208 return exact;
1209 }
1210
1211 /*
1212 * Here is a tricky place. We have not found the key and this is a
1213 * "hashed" key, which may collide. The rest of the code deals with
1214 * situations like this:
1215 *
1216 * | 3 | 5 |
1217 * / \
1218 * | 3 | 5 | | 6 | 7 | (x)
1219 *
1220 * Or more a complex example:
1221 *
1222 * | 1 | 5 |
1223 * / \
1224 * | 1 | 3 | | 5 | 8 |
1225 * \ /
1226 * | 5 | 5 | | 6 | 7 | (x)
1227 *
1228 * In the examples, if we are looking for key "5", we may reach nodes
1229 * marked with "(x)". In this case what we have do is to look at the
1230 * left and see if there is "5" key there. If there is, we have to
1231 * return it.
1232 *
1233 * Note, this whole situation is possible because we allow to have
1234 * elements which are equivalent to the next key in the parent in the
1235 * children of current znode. For example, this happens if we split a
1236 * znode like this: | 3 | 5 | 5 | 6 | 7 |, which results in something
1237 * like this:
1238 * | 3 | 5 |
1239 * / \
1240 * | 3 | 5 | | 5 | 6 | 7 |
1241 * ^
1242 * And this becomes what is at the first "picture" after key "5" marked
1243 * with "^" is removed. What could be done is we could prohibit
1244 * splitting in the middle of the colliding sequence. Also, when
1245 * removing the leftmost key, we would have to correct the key of the
1246 * parent node, which would introduce additional complications. Namely,
1247 * if we changed the the leftmost key of the parent znode, the garbage
1248 * collector would be unable to find it (GC is doing this when GC'ing
1249 * indexing LEBs). Although we already have an additional RB-tree where
1250 * we save such changed znodes (see 'ins_clr_old_idx_znode()') until
1251 * after the commit. But anyway, this does not look easy to implement
1252 * so we did not try this.
1253 */
1254 err = tnc_prev(c, &znode, n);
1255 if (err == -ENOENT) {
1256 dbg_tnc("found 0, lvl %d, n -1", znode->level);
1257 *n = -1;
1258 return 0;
1259 }
1260 if (unlikely(err < 0))
1261 return err;
1262 if (keys_cmp(c, key, &znode->zbranch[*n].key)) {
1263 dbg_tnc("found 0, lvl %d, n -1", znode->level);
1264 *n = -1;
1265 return 0;
1266 }
1267
1268 dbg_tnc("found 1, lvl %d, n %d", znode->level, *n);
1269 *zn = znode;
1270 return 1;
1271}
1272
1273/**
1274 * lookup_level0_dirty - search for zero-level znode dirtying.
1275 * @c: UBIFS file-system description object
1276 * @key: key to lookup
1277 * @zn: znode is returned here
1278 * @n: znode branch slot number is returned here
1279 *
1280 * This function looks up the TNC tree and search for zero-level znode which
1281 * refers key @key. The found zero-level znode is returned in @zn. There are 3
1282 * cases:
1283 * o exact match, i.e. the found zero-level znode contains key @key, then %1
1284 * is returned and slot number of the matched branch is stored in @n;
1285 * o not exact match, which means that zero-level znode does not contain @key
1286 * then %0 is returned and slot number of the closed branch is stored in
1287 * @n;
1288 * o @key is so small that it is even less than the lowest key of the
1289 * leftmost zero-level node, then %0 is returned and %-1 is stored in @n.
1290 *
1291 * Additionally all znodes in the path from the root to the located zero-level
1292 * znode are marked as dirty.
1293 *
1294 * Note, when the TNC tree is traversed, some znodes may be absent, then this
1295 * function reads corresponding indexing nodes and inserts them to TNC. In
1296 * case of failure, a negative error code is returned.
1297 */
1298static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key,
1299 struct ubifs_znode **zn, int *n)
1300{
1301 int err, exact;
1302 struct ubifs_znode *znode;
1303 unsigned long time = get_seconds();
1304
1305 dbg_tnc("search and dirty key %s", DBGKEY(key));
1306
1307 znode = c->zroot.znode;
1308 if (unlikely(!znode)) {
1309 znode = ubifs_load_znode(c, &c->zroot, NULL, 0);
1310 if (IS_ERR(znode))
1311 return PTR_ERR(znode);
1312 }
1313
1314 znode = dirty_cow_znode(c, &c->zroot);
1315 if (IS_ERR(znode))
1316 return PTR_ERR(znode);
1317
1318 znode->time = time;
1319
1320 while (1) {
1321 struct ubifs_zbranch *zbr;
1322
1323 exact = ubifs_search_zbranch(c, znode, key, n);
1324
1325 if (znode->level == 0)
1326 break;
1327
1328 if (*n < 0)
1329 *n = 0;
1330 zbr = &znode->zbranch[*n];
1331
1332 if (zbr->znode) {
1333 znode->time = time;
1334 znode = dirty_cow_znode(c, zbr);
1335 if (IS_ERR(znode))
1336 return PTR_ERR(znode);
1337 continue;
1338 }
1339
1340 /* znode is not in TNC cache, load it from the media */
1341 znode = ubifs_load_znode(c, zbr, znode, *n);
1342 if (IS_ERR(znode))
1343 return PTR_ERR(znode);
1344 znode = dirty_cow_znode(c, zbr);
1345 if (IS_ERR(znode))
1346 return PTR_ERR(znode);
1347 }
1348
1349 *zn = znode;
1350 if (exact || !is_hash_key(c, key) || *n != -1) {
1351 dbg_tnc("found %d, lvl %d, n %d", exact, znode->level, *n);
1352 return exact;
1353 }
1354
1355 /*
1356 * See huge comment at 'lookup_level0_dirty()' what is the rest of the
1357 * code.
1358 */
1359 err = tnc_prev(c, &znode, n);
1360 if (err == -ENOENT) {
1361 *n = -1;
1362 dbg_tnc("found 0, lvl %d, n -1", znode->level);
1363 return 0;
1364 }
1365 if (unlikely(err < 0))
1366 return err;
1367 if (keys_cmp(c, key, &znode->zbranch[*n].key)) {
1368 *n = -1;
1369 dbg_tnc("found 0, lvl %d, n -1", znode->level);
1370 return 0;
1371 }
1372
1373 if (znode->cnext || !ubifs_zn_dirty(znode)) {
1374 znode = dirty_cow_bottom_up(c, znode);
1375 if (IS_ERR(znode))
1376 return PTR_ERR(znode);
1377 }
1378
1379 dbg_tnc("found 1, lvl %d, n %d", znode->level, *n);
1380 *zn = znode;
1381 return 1;
1382}
1383
1384/**
1385 * ubifs_tnc_lookup - look up a file-system node.
1386 * @c: UBIFS file-system description object
1387 * @key: node key to lookup
1388 * @node: the node is returned here
1389 *
1390 * This function look up and reads node with key @key. The caller has to make
1391 * sure the @node buffer is large enough to fit the node. Returns zero in case
1392 * of success, %-ENOENT if the node was not found, and a negative error code in
1393 * case of failure.
1394 */
1395int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key,
1396 void *node)
1397{
1398 int found, n, err;
1399 struct ubifs_znode *znode;
1400 struct ubifs_zbranch zbr, *zt;
1401
1402 mutex_lock(&c->tnc_mutex);
1403 found = ubifs_lookup_level0(c, key, &znode, &n);
1404 if (!found) {
1405 err = -ENOENT;
1406 goto out;
1407 } else if (found < 0) {
1408 err = found;
1409 goto out;
1410 }
1411 zt = &znode->zbranch[n];
1412 if (is_hash_key(c, key)) {
1413 /*
1414 * In this case the leaf node cache gets used, so we pass the
1415 * address of the zbranch and keep the mutex locked
1416 */
1417 err = tnc_read_node_nm(c, zt, node);
1418 goto out;
1419 }
1420 zbr = znode->zbranch[n];
1421 mutex_unlock(&c->tnc_mutex);
1422
1423 err = ubifs_tnc_read_node(c, &zbr, node);
1424 return err;
1425
1426out:
1427 mutex_unlock(&c->tnc_mutex);
1428 return err;
1429}
1430
1431/**
1432 * ubifs_tnc_locate - look up a file-system node and return it and its location.
1433 * @c: UBIFS file-system description object
1434 * @key: node key to lookup
1435 * @node: the node is returned here
1436 * @lnum: LEB number is returned here
1437 * @offs: offset is returned here
1438 *
1439 * This function is the same as 'ubifs_tnc_lookup()' but it returns the node
1440 * location also. See 'ubifs_tnc_lookup()'.
1441 */
1442int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
1443 void *node, int *lnum, int *offs)
1444{
1445 int found, n, err;
1446 struct ubifs_znode *znode;
1447 struct ubifs_zbranch zbr, *zt;
1448
1449 mutex_lock(&c->tnc_mutex);
1450 found = ubifs_lookup_level0(c, key, &znode, &n);
1451 if (!found) {
1452 err = -ENOENT;
1453 goto out;
1454 } else if (found < 0) {
1455 err = found;
1456 goto out;
1457 }
1458 zt = &znode->zbranch[n];
1459 if (is_hash_key(c, key)) {
1460 /*
1461 * In this case the leaf node cache gets used, so we pass the
1462 * address of the zbranch and keep the mutex locked
1463 */
1464 *lnum = zt->lnum;
1465 *offs = zt->offs;
1466 err = tnc_read_node_nm(c, zt, node);
1467 goto out;
1468 }
1469 zbr = znode->zbranch[n];
1470 mutex_unlock(&c->tnc_mutex);
1471
1472 *lnum = zbr.lnum;
1473 *offs = zbr.offs;
1474
1475 err = ubifs_tnc_read_node(c, &zbr, node);
1476 return err;
1477
1478out:
1479 mutex_unlock(&c->tnc_mutex);
1480 return err;
1481}
1482
1483/**
1484 * do_lookup_nm- look up a "hashed" node.
1485 * @c: UBIFS file-system description object
1486 * @key: node key to lookup
1487 * @node: the node is returned here
1488 * @nm: node name
1489 *
1490 * This function look up and reads a node which contains name hash in the key.
1491 * Since the hash may have collisions, there may be many nodes with the same
1492 * key, so we have to sequentially look to all of them until the needed one is
1493 * found. This function returns zero in case of success, %-ENOENT if the node
1494 * was not found, and a negative error code in case of failure.
1495 */
1496static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
1497 void *node, const struct qstr *nm)
1498{
1499 int found, n, err;
1500 struct ubifs_znode *znode;
1501 struct ubifs_zbranch zbr;
1502
1503 dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key));
1504 mutex_lock(&c->tnc_mutex);
1505 found = ubifs_lookup_level0(c, key, &znode, &n);
1506 if (!found) {
1507 err = -ENOENT;
1508 goto out_unlock;
1509 } else if (found < 0) {
1510 err = found;
1511 goto out_unlock;
1512 }
1513
1514 ubifs_assert(n >= 0);
1515
1516 err = resolve_collision(c, key, &znode, &n, nm);
1517 dbg_tnc("rc returned %d, znode %p, n %d", err, znode, n);
1518 if (unlikely(err < 0))
1519 goto out_unlock;
1520 if (err == 0) {
1521 err = -ENOENT;
1522 goto out_unlock;
1523 }
1524
1525 zbr = znode->zbranch[n];
1526 mutex_unlock(&c->tnc_mutex);
1527
1528 err = tnc_read_node_nm(c, &zbr, node);
1529 return err;
1530
1531out_unlock:
1532 mutex_unlock(&c->tnc_mutex);
1533 return err;
1534}
1535
1536/**
1537 * ubifs_tnc_lookup_nm - look up a "hashed" node.
1538 * @c: UBIFS file-system description object
1539 * @key: node key to lookup
1540 * @node: the node is returned here
1541 * @nm: node name
1542 *
1543 * This function look up and reads a node which contains name hash in the key.
1544 * Since the hash may have collisions, there may be many nodes with the same
1545 * key, so we have to sequentially look to all of them until the needed one is
1546 * found. This function returns zero in case of success, %-ENOENT if the node
1547 * was not found, and a negative error code in case of failure.
1548 */
1549int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
1550 void *node, const struct qstr *nm)
1551{
1552 int err, len;
1553 const struct ubifs_dent_node *dent = node;
1554
1555 /*
1556 * We assume that in most of the cases there are no name collisions and
1557 * 'ubifs_tnc_lookup()' returns us the right direntry.
1558 */
1559 err = ubifs_tnc_lookup(c, key, node);
1560 if (err)
1561 return err;
1562
1563 len = le16_to_cpu(dent->nlen);
1564 if (nm->len == len && !memcmp(dent->name, nm->name, len))
1565 return 0;
1566
1567 /*
1568 * Unluckily, there are hash collisions and we have to iterate over
1569 * them look at each direntry with colliding name hash sequentially.
1570 */
1571 return do_lookup_nm(c, key, node, nm);
1572}
1573
1574/**
1575 * correct_parent_keys - correct parent znodes' keys.
1576 * @c: UBIFS file-system description object
1577 * @znode: znode to correct parent znodes for
1578 *
1579 * This is a helper function for 'tnc_insert()'. When the key of the leftmost
1580 * zbranch changes, keys of parent znodes have to be corrected. This helper
1581 * function is called in such situations and corrects the keys if needed.
1582 */
1583static void correct_parent_keys(const struct ubifs_info *c,
1584 struct ubifs_znode *znode)
1585{
1586 union ubifs_key *key, *key1;
1587
1588 ubifs_assert(znode->parent);
1589 ubifs_assert(znode->iip == 0);
1590
1591 key = &znode->zbranch[0].key;
1592 key1 = &znode->parent->zbranch[0].key;
1593
1594 while (keys_cmp(c, key, key1) < 0) {
1595 key_copy(c, key, key1);
1596 znode = znode->parent;
1597 znode->alt = 1;
1598 if (!znode->parent || znode->iip)
1599 break;
1600 key1 = &znode->parent->zbranch[0].key;
1601 }
1602}
1603
1604/**
1605 * insert_zbranch - insert a zbranch into a znode.
1606 * @znode: znode into which to insert
1607 * @zbr: zbranch to insert
1608 * @n: slot number to insert to
1609 *
1610 * This is a helper function for 'tnc_insert()'. UBIFS does not allow "gaps" in
1611 * znode's array of zbranches and keeps zbranches consolidated, so when a new
1612 * zbranch has to be inserted to the @znode->zbranches[]' array at the @n-th
1613 * slot, zbranches starting from @n have to be moved right.
1614 */
1615static void insert_zbranch(struct ubifs_znode *znode,
1616 const struct ubifs_zbranch *zbr, int n)
1617{
1618 int i;
1619
1620 ubifs_assert(ubifs_zn_dirty(znode));
1621
1622 if (znode->level) {
1623 for (i = znode->child_cnt; i > n; i--) {
1624 znode->zbranch[i] = znode->zbranch[i - 1];
1625 if (znode->zbranch[i].znode)
1626 znode->zbranch[i].znode->iip = i;
1627 }
1628 if (zbr->znode)
1629 zbr->znode->iip = n;
1630 } else
1631 for (i = znode->child_cnt; i > n; i--)
1632 znode->zbranch[i] = znode->zbranch[i - 1];
1633
1634 znode->zbranch[n] = *zbr;
1635 znode->child_cnt += 1;
1636
1637 /*
1638 * After inserting at slot zero, the lower bound of the key range of
1639 * this znode may have changed. If this znode is subsequently split
1640 * then the upper bound of the key range may change, and furthermore
1641 * it could change to be lower than the original lower bound. If that
1642 * happens, then it will no longer be possible to find this znode in the
1643 * TNC using the key from the index node on flash. That is bad because
1644 * if it is not found, we will assume it is obsolete and may overwrite
1645 * it. Then if there is an unclean unmount, we will start using the
1646 * old index which will be broken.
1647 *
1648 * So we first mark znodes that have insertions at slot zero, and then
1649 * if they are split we add their lnum/offs to the old_idx tree.
1650 */
1651 if (n == 0)
1652 znode->alt = 1;
1653}
1654
1655/**
1656 * tnc_insert - insert a node into TNC.
1657 * @c: UBIFS file-system description object
1658 * @znode: znode to insert into
1659 * @zbr: branch to insert
1660 * @n: slot number to insert new zbranch to
1661 *
1662 * This function inserts a new node described by @zbr into znode @znode. If
1663 * znode does not have a free slot for new zbranch, it is split. Parent znodes
1664 * are splat as well if needed. Returns zero in case of success or a negative
1665 * error code in case of failure.
1666 */
1667static int tnc_insert(struct ubifs_info *c, struct ubifs_znode *znode,
1668 struct ubifs_zbranch *zbr, int n)
1669{
1670 struct ubifs_znode *zn, *zi, *zp;
1671 int i, keep, move, appending = 0;
1672 union ubifs_key *key = &zbr->key;
1673
1674 ubifs_assert(n >= 0 && n <= c->fanout);
1675
1676 /* Implement naive insert for now */
1677again:
1678 zp = znode->parent;
1679 if (znode->child_cnt < c->fanout) {
1680 ubifs_assert(n != c->fanout);
1681 dbg_tnc("inserted at %d level %d, key %s", n, znode->level,
1682 DBGKEY(key));
1683
1684 insert_zbranch(znode, zbr, n);
1685
1686 /* Ensure parent's key is correct */
1687 if (n == 0 && zp && znode->iip == 0)
1688 correct_parent_keys(c, znode);
1689
1690 return 0;
1691 }
1692
1693 /*
1694 * Unfortunately, @znode does not have more empty slots and we have to
1695 * split it.
1696 */
1697 dbg_tnc("splitting level %d, key %s", znode->level, DBGKEY(key));
1698
1699 if (znode->alt)
1700 /*
1701 * We can no longer be sure of finding this znode by key, so we
1702 * record it in the old_idx tree.
1703 */
1704 ins_clr_old_idx_znode(c, znode);
1705
1706 zn = kzalloc(c->max_znode_sz, GFP_NOFS);
1707 if (!zn)
1708 return -ENOMEM;
1709 zn->parent = zp;
1710 zn->level = znode->level;
1711
1712 /* Decide where to split */
1713 if (znode->level == 0 && n == c->fanout &&
1714 key_type(c, key) == UBIFS_DATA_KEY) {
1715 union ubifs_key *key1;
1716
1717 /*
1718 * If this is an inode which is being appended - do not split
1719 * it because no other zbranches can be inserted between
1720 * zbranches of consecutive data nodes anyway.
1721 */
1722 key1 = &znode->zbranch[n - 1].key;
1723 if (key_inum(c, key1) == key_inum(c, key) &&
1724 key_type(c, key1) == UBIFS_DATA_KEY &&
1725 key_block(c, key1) == key_block(c, key) - 1)
1726 appending = 1;
1727 }
1728
1729 if (appending) {
1730 keep = c->fanout;
1731 move = 0;
1732 } else {
1733 keep = (c->fanout + 1) / 2;
1734 move = c->fanout - keep;
1735 }
1736
1737 /*
1738 * Although we don't at present, we could look at the neighbors and see
1739 * if we can move some zbranches there.
1740 */
1741
1742 if (n < keep) {
1743 /* Insert into existing znode */
1744 zi = znode;
1745 move += 1;
1746 keep -= 1;
1747 } else {
1748 /* Insert into new znode */
1749 zi = zn;
1750 n -= keep;
1751 /* Re-parent */
1752 if (zn->level != 0)
1753 zbr->znode->parent = zn;
1754 }
1755
1756 __set_bit(DIRTY_ZNODE, &zn->flags);
1757 atomic_long_inc(&c->dirty_zn_cnt);
1758
1759 zn->child_cnt = move;
1760 znode->child_cnt = keep;
1761
1762 dbg_tnc("moving %d, keeping %d", move, keep);
1763
1764 /* Move zbranch */
1765 for (i = 0; i < move; i++) {
1766 zn->zbranch[i] = znode->zbranch[keep + i];
1767 /* Re-parent */
1768 if (zn->level != 0)
1769 if (zn->zbranch[i].znode) {
1770 zn->zbranch[i].znode->parent = zn;
1771 zn->zbranch[i].znode->iip = i;
1772 }
1773 }
1774
1775 /* Insert new key and branch */
1776 dbg_tnc("inserting at %d level %d, key %s", n, zn->level, DBGKEY(key));
1777
1778 insert_zbranch(zi, zbr, n);
1779
1780 /* Insert new znode (produced by spitting) into the parent */
1781 if (zp) {
1782 i = n;
1783 /* Locate insertion point */
1784 n = znode->iip + 1;
1785 if (appending && n != c->fanout)
1786 appending = 0;
1787
1788 if (i == 0 && zi == znode && znode->iip == 0)
1789 correct_parent_keys(c, znode);
1790
1791 /* Tail recursion */
1792 zbr->key = zn->zbranch[0].key;
1793 zbr->znode = zn;
1794 zbr->lnum = 0;
1795 zbr->offs = 0;
1796 zbr->len = 0;
1797 znode = zp;
1798
1799 goto again;
1800 }
1801
1802 /* We have to split root znode */
1803 dbg_tnc("creating new zroot at level %d", znode->level + 1);
1804
1805 zi = kzalloc(c->max_znode_sz, GFP_NOFS);
1806 if (!zi)
1807 return -ENOMEM;
1808
1809 zi->child_cnt = 2;
1810 zi->level = znode->level + 1;
1811
1812 __set_bit(DIRTY_ZNODE, &zi->flags);
1813 atomic_long_inc(&c->dirty_zn_cnt);
1814
1815 zi->zbranch[0].key = znode->zbranch[0].key;
1816 zi->zbranch[0].znode = znode;
1817 zi->zbranch[0].lnum = c->zroot.lnum;
1818 zi->zbranch[0].offs = c->zroot.offs;
1819 zi->zbranch[0].len = c->zroot.len;
1820 zi->zbranch[1].key = zn->zbranch[0].key;
1821 zi->zbranch[1].znode = zn;
1822
1823 c->zroot.lnum = 0;
1824 c->zroot.offs = 0;
1825 c->zroot.len = 0;
1826 c->zroot.znode = zi;
1827
1828 zn->parent = zi;
1829 zn->iip = 1;
1830 znode->parent = zi;
1831 znode->iip = 0;
1832
1833 return 0;
1834}
1835
1836/**
1837 * ubifs_tnc_add - add a node to TNC.
1838 * @c: UBIFS file-system description object
1839 * @key: key to add
1840 * @lnum: LEB number of node
1841 * @offs: node offset
1842 * @len: node length
1843 *
1844 * This function adds a node with key @key to TNC. The node may be new or it may
1845 * obsolete some existing one. Returns %0 on success or negative error code on
1846 * failure.
1847 */
1848int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
1849 int offs, int len)
1850{
1851 int found, n, err = 0;
1852 struct ubifs_znode *znode;
1853
1854 mutex_lock(&c->tnc_mutex);
1855 dbg_tnc("%d:%d, len %d, key %s", lnum, offs, len, DBGKEY(key));
1856 found = lookup_level0_dirty(c, key, &znode, &n);
1857 if (!found) {
1858 struct ubifs_zbranch zbr;
1859
1860 zbr.znode = NULL;
1861 zbr.lnum = lnum;
1862 zbr.offs = offs;
1863 zbr.len = len;
1864 key_copy(c, key, &zbr.key);
1865 err = tnc_insert(c, znode, &zbr, n + 1);
1866 } else if (found == 1) {
1867 struct ubifs_zbranch *zbr = &znode->zbranch[n];
1868
1869 lnc_free(zbr);
1870 err = ubifs_add_dirt(c, zbr->lnum, zbr->len);
1871 zbr->lnum = lnum;
1872 zbr->offs = offs;
1873 zbr->len = len;
1874 } else
1875 err = found;
1876 if (!err)
1877 err = dbg_check_tnc(c, 0);
1878 mutex_unlock(&c->tnc_mutex);
1879
1880 return err;
1881}
1882
1883/**
1884 * ubifs_tnc_replace - replace a node in the TNC only if the old node is found.
1885 * @c: UBIFS file-system description object
1886 * @key: key to add
1887 * @old_lnum: LEB number of old node
1888 * @old_offs: old node offset
1889 * @lnum: LEB number of node
1890 * @offs: node offset
1891 * @len: node length
1892 *
1893 * This function replaces a node with key @key in the TNC only if the old node
1894 * is found. This function is called by garbage collection when node are moved.
1895 * Returns %0 on success or negative error code on failure.
1896 */
1897int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
1898 int old_lnum, int old_offs, int lnum, int offs, int len)
1899{
1900 int found, n, err = 0;
1901 struct ubifs_znode *znode;
1902
1903 mutex_lock(&c->tnc_mutex);
1904 dbg_tnc("old LEB %d:%d, new LEB %d:%d, len %d, key %s", old_lnum,
1905 old_offs, lnum, offs, len, DBGKEY(key));
1906 found = lookup_level0_dirty(c, key, &znode, &n);
1907 if (found < 0) {
1908 err = found;
1909 goto out_unlock;
1910 }
1911
1912 if (found == 1) {
1913 struct ubifs_zbranch *zbr = &znode->zbranch[n];
1914
1915 found = 0;
1916 if (zbr->lnum == old_lnum && zbr->offs == old_offs) {
1917 lnc_free(zbr);
1918 err = ubifs_add_dirt(c, zbr->lnum, zbr->len);
1919 if (err)
1920 goto out_unlock;
1921 zbr->lnum = lnum;
1922 zbr->offs = offs;
1923 zbr->len = len;
1924 found = 1;
1925 } else if (is_hash_key(c, key)) {
1926 found = resolve_collision_directly(c, key, &znode, &n,
1927 old_lnum, old_offs);
1928 dbg_tnc("rc returned %d, znode %p, n %d, LEB %d:%d",
1929 found, znode, n, old_lnum, old_offs);
1930 if (found < 0) {
1931 err = found;
1932 goto out_unlock;
1933 }
1934
1935 if (found) {
1936 /* Ensure the znode is dirtied */
1937 if (znode->cnext || !ubifs_zn_dirty(znode)) {
1938 znode = dirty_cow_bottom_up(c,
1939 znode);
1940 if (IS_ERR(znode)) {
1941 err = PTR_ERR(znode);
1942 goto out_unlock;
1943 }
1944 }
1945 zbr = &znode->zbranch[n];
1946 lnc_free(zbr);
1947 err = ubifs_add_dirt(c, zbr->lnum,
1948 zbr->len);
1949 if (err)
1950 goto out_unlock;
1951 zbr->lnum = lnum;
1952 zbr->offs = offs;
1953 zbr->len = len;
1954 }
1955 }
1956 }
1957
1958 if (!found)
1959 err = ubifs_add_dirt(c, lnum, len);
1960
1961 if (!err)
1962 err = dbg_check_tnc(c, 0);
1963
1964out_unlock:
1965 mutex_unlock(&c->tnc_mutex);
1966 return err;
1967}
1968
1969/**
1970 * ubifs_tnc_add_nm - add a "hashed" node to TNC.
1971 * @c: UBIFS file-system description object
1972 * @key: key to add
1973 * @lnum: LEB number of node
1974 * @offs: node offset
1975 * @len: node length
1976 * @nm: node name
1977 *
1978 * This is the same as 'ubifs_tnc_add()' but it should be used with keys which
1979 * may have collisions, like directory entry keys.
1980 */
1981int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
1982 int lnum, int offs, int len, const struct qstr *nm)
1983{
1984 int found, n, err = 0;
1985 struct ubifs_znode *znode;
1986
1987 mutex_lock(&c->tnc_mutex);
1988 dbg_tnc("LEB %d:%d, name '%.*s', key %s", lnum, offs, nm->len, nm->name,
1989 DBGKEY(key));
1990 found = lookup_level0_dirty(c, key, &znode, &n);
1991 if (found < 0) {
1992 err = found;
1993 goto out_unlock;
1994 }
1995
1996 if (found == 1) {
1997 if (c->replaying)
1998 found = fallible_resolve_collision(c, key, &znode, &n,
1999 nm, 1);
2000 else
2001 found = resolve_collision(c, key, &znode, &n, nm);
2002 dbg_tnc("rc returned %d, znode %p, n %d", found, znode, n);
2003 if (found < 0) {
2004 err = found;
2005 goto out_unlock;
2006 }
2007
2008 /* Ensure the znode is dirtied */
2009 if (znode->cnext || !ubifs_zn_dirty(znode)) {
2010 znode = dirty_cow_bottom_up(c, znode);
2011 if (IS_ERR(znode)) {
2012 err = PTR_ERR(znode);
2013 goto out_unlock;
2014 }
2015 }
2016
2017 if (found == 1) {
2018 struct ubifs_zbranch *zbr = &znode->zbranch[n];
2019
2020 lnc_free(zbr);
2021 err = ubifs_add_dirt(c, zbr->lnum, zbr->len);
2022 zbr->lnum = lnum;
2023 zbr->offs = offs;
2024 zbr->len = len;
2025 goto out_unlock;
2026 }
2027 }
2028
2029 if (!found) {
2030 struct ubifs_zbranch zbr;
2031
2032 zbr.znode = NULL;
2033 zbr.lnum = lnum;
2034 zbr.offs = offs;
2035 zbr.len = len;
2036 key_copy(c, key, &zbr.key);
2037 err = tnc_insert(c, znode, &zbr, n + 1);
2038 if (err)
2039 goto out_unlock;
2040 if (c->replaying) {
2041 /*
2042 * We did not find it in the index so there may be a
2043 * dangling branch still in the index. So we remove it
2044 * by passing 'ubifs_tnc_remove_nm()' the same key but
2045 * an unmatchable name.
2046 */
2047 struct qstr noname = { .len = 0, .name = "" };
2048
2049 err = dbg_check_tnc(c, 0);
2050 mutex_unlock(&c->tnc_mutex);
2051 if (err)
2052 return err;
2053 return ubifs_tnc_remove_nm(c, key, &noname);
2054 }
2055 }
2056
2057out_unlock:
2058 if (!err)
2059 err = dbg_check_tnc(c, 0);
2060 mutex_unlock(&c->tnc_mutex);
2061 return err;
2062}
2063
2064/**
2065 * tnc_delete - delete a znode form TNC.
2066 * @c: UBIFS file-system description object
2067 * @znode: znode to delete from
2068 * @n: zbranch slot number to delete
2069 *
2070 * This function deletes a leaf node from @n-th slot of @znode. Returns zero in
2071 * case of success and a negative error code in case of failure.
2072 */
2073static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n)
2074{
2075 struct ubifs_zbranch *zbr;
2076 struct ubifs_znode *zp;
2077 int i, err;
2078
2079 /* Delete without merge for now */
2080 ubifs_assert(znode->level == 0);
2081 ubifs_assert(n >= 0 && n < c->fanout);
2082 dbg_tnc("deleting %s", DBGKEY(&znode->zbranch[n].key));
2083
2084 zbr = &znode->zbranch[n];
2085 lnc_free(zbr);
2086
2087 err = ubifs_add_dirt(c, zbr->lnum, zbr->len);
2088 if (err) {
2089 dbg_dump_znode(c, znode);
2090 return err;
2091 }
2092
2093 /* We do not "gap" zbranch slots */
2094 for (i = n; i < znode->child_cnt - 1; i++)
2095 znode->zbranch[i] = znode->zbranch[i + 1];
2096 znode->child_cnt -= 1;
2097
2098 if (znode->child_cnt > 0)
2099 return 0;
2100
2101 /*
2102 * This was the last zbranch, we have to delete this znode from the
2103 * parent.
2104 */
2105
2106 do {
2107 ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags));
2108 ubifs_assert(ubifs_zn_dirty(znode));
2109
2110 zp = znode->parent;
2111 n = znode->iip;
2112
2113 atomic_long_dec(&c->dirty_zn_cnt);
2114
2115 err = insert_old_idx_znode(c, znode);
2116 if (err)
2117 return err;
2118
2119 if (znode->cnext) {
2120 __set_bit(OBSOLETE_ZNODE, &znode->flags);
2121 atomic_long_inc(&c->clean_zn_cnt);
2122 atomic_long_inc(&ubifs_clean_zn_cnt);
2123 } else
2124 kfree(znode);
2125 znode = zp;
2126 } while (znode->child_cnt == 1); /* while removing last child */
2127
2128 /* Remove from znode, entry n - 1 */
2129 znode->child_cnt -= 1;
2130 ubifs_assert(znode->level != 0);
2131 for (i = n; i < znode->child_cnt; i++) {
2132 znode->zbranch[i] = znode->zbranch[i + 1];
2133 if (znode->zbranch[i].znode)
2134 znode->zbranch[i].znode->iip = i;
2135 }
2136
2137 /*
2138 * If this is the root and it has only 1 child then
2139 * collapse the tree.
2140 */
2141 if (!znode->parent) {
2142 while (znode->child_cnt == 1 && znode->level != 0) {
2143 zp = znode;
2144 zbr = &znode->zbranch[0];
2145 znode = get_znode(c, znode, 0);
2146 if (IS_ERR(znode))
2147 return PTR_ERR(znode);
2148 znode = dirty_cow_znode(c, zbr);
2149 if (IS_ERR(znode))
2150 return PTR_ERR(znode);
2151 znode->parent = NULL;
2152 znode->iip = 0;
2153 if (c->zroot.len) {
2154 err = insert_old_idx(c, c->zroot.lnum,
2155 c->zroot.offs);
2156 if (err)
2157 return err;
2158 }
2159 c->zroot.lnum = zbr->lnum;
2160 c->zroot.offs = zbr->offs;
2161 c->zroot.len = zbr->len;
2162 c->zroot.znode = znode;
2163 ubifs_assert(!test_bit(OBSOLETE_ZNODE,
2164 &zp->flags));
2165 ubifs_assert(test_bit(DIRTY_ZNODE, &zp->flags));
2166 atomic_long_dec(&c->dirty_zn_cnt);
2167
2168 if (zp->cnext) {
2169 __set_bit(OBSOLETE_ZNODE, &zp->flags);
2170 atomic_long_inc(&c->clean_zn_cnt);
2171 atomic_long_inc(&ubifs_clean_zn_cnt);
2172 } else
2173 kfree(zp);
2174 }
2175 }
2176
2177 return 0;
2178}
2179
2180/**
2181 * ubifs_tnc_remove - remove an index entry of a node.
2182 * @c: UBIFS file-system description object
2183 * @key: key of node
2184 *
2185 * Returns %0 on success or negative error code on failure.
2186 */
2187int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key)
2188{
2189 int found, n, err = 0;
2190 struct ubifs_znode *znode;
2191
2192 mutex_lock(&c->tnc_mutex);
2193 dbg_tnc("key %s", DBGKEY(key));
2194 found = lookup_level0_dirty(c, key, &znode, &n);
2195 if (found < 0) {
2196 err = found;
2197 goto out_unlock;
2198 }
2199 if (found == 1)
2200 err = tnc_delete(c, znode, n);
2201 if (!err)
2202 err = dbg_check_tnc(c, 0);
2203
2204out_unlock:
2205 mutex_unlock(&c->tnc_mutex);
2206 return err;
2207}
2208
2209/**
2210 * ubifs_tnc_remove_nm - remove an index entry for a "hashed" node.
2211 * @c: UBIFS file-system description object
2212 * @key: key of node
2213 * @nm: directory entry name
2214 *
2215 * Returns %0 on success or negative error code on failure.
2216 */
2217int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
2218 const struct qstr *nm)
2219{
2220 int n, err;
2221 struct ubifs_znode *znode;
2222
2223 mutex_lock(&c->tnc_mutex);
2224 dbg_tnc("%.*s, key %s", nm->len, nm->name, DBGKEY(key));
2225 err = lookup_level0_dirty(c, key, &znode, &n);
2226 if (err < 0)
2227 goto out_unlock;
2228
2229 if (err) {
2230 if (c->replaying)
2231 err = fallible_resolve_collision(c, key, &znode, &n,
2232 nm, 0);
2233 else
2234 err = resolve_collision(c, key, &znode, &n, nm);
2235 dbg_tnc("rc returned %d, znode %p, n %d", err, znode, n);
2236 if (err < 0)
2237 goto out_unlock;
2238 if (err) {
2239 /* Ensure the znode is dirtied */
2240 if (znode->cnext || !ubifs_zn_dirty(znode)) {
2241 znode = dirty_cow_bottom_up(c, znode);
2242 if (IS_ERR(znode)) {
2243 err = PTR_ERR(znode);
2244 goto out_unlock;
2245 }
2246 }
2247 err = tnc_delete(c, znode, n);
2248 }
2249 }
2250
2251out_unlock:
2252 if (!err)
2253 err = dbg_check_tnc(c, 0);
2254 mutex_unlock(&c->tnc_mutex);
2255 return err;
2256}
2257
/**
 * key_in_range - determine if a key falls within a range of keys.
 * @c: UBIFS file-system description object
 * @key: key to check
 * @from_key: lowest key in range
 * @to_key: highest key in range
 *
 * Both ends of the range are inclusive. This function returns %1 if the key is
 * in range and %0 otherwise.
 */
static int key_in_range(struct ubifs_info *c, union ubifs_key *key,
			union ubifs_key *from_key, union ubifs_key *to_key)
{
	return keys_cmp(c, key, from_key) >= 0 &&
	       keys_cmp(c, key, to_key) <= 0;
}
2276
2277/**
2278 * ubifs_tnc_remove_range - remove index entries in range.
2279 * @c: UBIFS file-system description object
2280 * @from_key: lowest key to remove
2281 * @to_key: highest key to remove
2282 *
2283 * This function removes index entries starting at @from_key and ending at
2284 * @to_key. This function returns zero in case of success and a negative error
2285 * code in case of failure.
2286 */
2287int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
2288 union ubifs_key *to_key)
2289{
2290 int i, n, k, err = 0;
2291 struct ubifs_znode *znode;
2292 union ubifs_key *key;
2293
2294 mutex_lock(&c->tnc_mutex);
2295 while (1) {
2296 /* Find first level 0 znode that contains keys to remove */
2297 err = ubifs_lookup_level0(c, from_key, &znode, &n);
2298 if (err < 0)
2299 goto out_unlock;
2300
2301 if (err)
2302 key = from_key;
2303 else {
2304 err = tnc_next(c, &znode, &n);
2305 if (err == -ENOENT) {
2306 err = 0;
2307 goto out_unlock;
2308 }
2309 if (err < 0)
2310 goto out_unlock;
2311 key = &znode->zbranch[n].key;
2312 if (!key_in_range(c, key, from_key, to_key)) {
2313 err = 0;
2314 goto out_unlock;
2315 }
2316 }
2317
2318 /* Ensure the znode is dirtied */
2319 if (znode->cnext || !ubifs_zn_dirty(znode)) {
2320 znode = dirty_cow_bottom_up(c, znode);
2321 if (IS_ERR(znode)) {
2322 err = PTR_ERR(znode);
2323 goto out_unlock;
2324 }
2325 }
2326
2327 /* Remove all keys in range except the first */
2328 for (i = n + 1, k = 0; i < znode->child_cnt; i++, k++) {
2329 key = &znode->zbranch[i].key;
2330 if (!key_in_range(c, key, from_key, to_key))
2331 break;
2332 lnc_free(&znode->zbranch[i]);
2333 err = ubifs_add_dirt(c, znode->zbranch[i].lnum,
2334 znode->zbranch[i].len);
2335 if (err) {
2336 dbg_dump_znode(c, znode);
2337 goto out_unlock;
2338 }
2339 dbg_tnc("removing %s", DBGKEY(key));
2340 }
2341 if (k) {
2342 for (i = n + 1 + k; i < znode->child_cnt; i++)
2343 znode->zbranch[i - k] = znode->zbranch[i];
2344 znode->child_cnt -= k;
2345 }
2346
2347 /* Now delete the first */
2348 err = tnc_delete(c, znode, n);
2349 if (err)
2350 goto out_unlock;
2351 }
2352
2353out_unlock:
2354 if (!err)
2355 err = dbg_check_tnc(c, 0);
2356 mutex_unlock(&c->tnc_mutex);
2357 return err;
2358}
2359
2360/**
2361 * ubifs_tnc_remove_ino - remove an inode from TNC.
2362 * @c: UBIFS file-system description object
2363 * @inum: inode number to remove
2364 *
2365 * This function remove inode @inum and all the extended attributes associated
2366 * with the anode from TNC and returns zero in case of success or a negative
2367 * error code in case of failure.
2368 */
2369int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum)
2370{
2371 union ubifs_key key1, key2;
2372 struct ubifs_dent_node *xent, *pxent = NULL;
2373 struct qstr nm = { .name = NULL };
2374
2375 dbg_tnc("ino %lu", inum);
2376
2377 /*
2378 * Walk all extended attribute entries and remove them together with
2379 * corresponding extended attribute inodes.
2380 */
2381 lowest_xent_key(c, &key1, inum);
2382 while (1) {
2383 ino_t xattr_inum;
2384 int err;
2385
2386 xent = ubifs_tnc_next_ent(c, &key1, &nm);
2387 if (IS_ERR(xent)) {
2388 err = PTR_ERR(xent);
2389 if (err == -ENOENT)
2390 break;
2391 return err;
2392 }
2393
2394 xattr_inum = le64_to_cpu(xent->inum);
2395 dbg_tnc("xent '%s', ino %lu", xent->name, xattr_inum);
2396
2397 nm.name = xent->name;
2398 nm.len = le16_to_cpu(xent->nlen);
2399 err = ubifs_tnc_remove_nm(c, &key1, &nm);
2400 if (err) {
2401 kfree(xent);
2402 return err;
2403 }
2404
2405 lowest_ino_key(c, &key1, xattr_inum);
2406 highest_ino_key(c, &key2, xattr_inum);
2407 err = ubifs_tnc_remove_range(c, &key1, &key2);
2408 if (err) {
2409 kfree(xent);
2410 return err;
2411 }
2412
2413 kfree(pxent);
2414 pxent = xent;
2415 key_read(c, &xent->key, &key1);
2416 }
2417
2418 kfree(pxent);
2419 lowest_ino_key(c, &key1, inum);
2420 highest_ino_key(c, &key2, inum);
2421
2422 return ubifs_tnc_remove_range(c, &key1, &key2);
2423}
2424
2425/**
2426 * ubifs_tnc_next_ent - walk directory or extended attribute entries.
2427 * @c: UBIFS file-system description object
2428 * @key: key of last entry
2429 * @nm: name of last entry found or %NULL
2430 *
2431 * This function finds and reads the next directory or extended attribute entry
2432 * after the given key (@key) if there is one. @nm is used to resolve
2433 * collisions.
2434 *
2435 * If the name of the current entry is not known and only the key is known,
2436 * @nm->name has to be %NULL. In this case the semantics of this function is a
2437 * little bit different and it returns the entry corresponding to this key, not
2438 * the next one. If the key was not found, the closest "right" entry is
2439 * returned.
2440 *
2441 * If the fist entry has to be found, @key has to contain the lowest possible
2442 * key value for this inode and @name has to be %NULL.
2443 *
2444 * This function returns the found directory or extended attribute entry node
2445 * in case of success, %-ENOENT is returned if no entry was found, and a
2446 * negative error code is returned in case of failure.
2447 */
2448struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
2449 union ubifs_key *key,
2450 const struct qstr *nm)
2451{
2452 int n, err, type = key_type(c, key);
2453 struct ubifs_znode *znode;
2454 struct ubifs_dent_node *dent;
2455 struct ubifs_zbranch *zbr;
2456 union ubifs_key *dkey;
2457
2458 dbg_tnc("%s %s", nm->name ? (char *)nm->name : "(lowest)", DBGKEY(key));
2459 ubifs_assert(is_hash_key(c, key));
2460
2461 mutex_lock(&c->tnc_mutex);
2462 err = ubifs_lookup_level0(c, key, &znode, &n);
2463 if (unlikely(err < 0))
2464 goto out_unlock;
2465
2466 if (nm->name) {
2467 if (err) {
2468 /* Handle collisions */
2469 err = resolve_collision(c, key, &znode, &n, nm);
2470 dbg_tnc("rc returned %d, znode %p, n %d",
2471 err, znode, n);
2472 if (unlikely(err < 0))
2473 goto out_unlock;
2474 }
2475
2476 /* Now find next entry */
2477 err = tnc_next(c, &znode, &n);
2478 if (unlikely(err))
2479 goto out_unlock;
2480 } else {
2481 /*
2482 * The full name of the entry was not given, in which case the
2483 * behavior of this function is a little different and it
2484 * returns current entry, not the next one.
2485 */
2486 if (!err) {
2487 /*
2488 * However, the given key does not exist in the TNC
2489 * tree and @znode/@n variables contain the closest
2490 * "preceding" element. Switch to the next one.
2491 */
2492 err = tnc_next(c, &znode, &n);
2493 if (err)
2494 goto out_unlock;
2495 }
2496 }
2497
2498 zbr = &znode->zbranch[n];
2499 dent = kmalloc(zbr->len, GFP_NOFS);
2500 if (unlikely(!dent)) {
2501 err = -ENOMEM;
2502 goto out_unlock;
2503 }
2504
2505 /*
2506 * The above 'tnc_next()' call could lead us to the next inode, check
2507 * this.
2508 */
2509 dkey = &zbr->key;
2510 if (key_inum(c, dkey) != key_inum(c, key) ||
2511 key_type(c, dkey) != type) {
2512 err = -ENOENT;
2513 goto out_free;
2514 }
2515
2516 err = tnc_read_node_nm(c, zbr, dent);
2517 if (unlikely(err))
2518 goto out_free;
2519
2520 mutex_unlock(&c->tnc_mutex);
2521 return dent;
2522
2523out_free:
2524 kfree(dent);
2525out_unlock:
2526 mutex_unlock(&c->tnc_mutex);
2527 return ERR_PTR(err);
2528}
2529
2530/**
2531 * tnc_destroy_cnext - destroy left-over obsolete znodes from a failed commit.
2532 * @c: UBIFS file-system description object
2533 *
2534 * Destroy left-over obsolete znodes from a failed commit.
2535 */
2536static void tnc_destroy_cnext(struct ubifs_info *c)
2537{
2538 struct ubifs_znode *cnext;
2539
2540 if (!c->cnext)
2541 return;
2542 ubifs_assert(c->cmt_state == COMMIT_BROKEN);
2543 cnext = c->cnext;
2544 do {
2545 struct ubifs_znode *znode = cnext;
2546
2547 cnext = cnext->cnext;
2548 if (test_bit(OBSOLETE_ZNODE, &znode->flags))
2549 kfree(znode);
2550 } while (cnext && cnext != c->cnext);
2551}
2552
2553/**
2554 * ubifs_tnc_close - close TNC subsystem and free all related resources.
2555 * @c: UBIFS file-system description object
2556 */
2557void ubifs_tnc_close(struct ubifs_info *c)
2558{
2559 long clean_freed;
2560
2561 tnc_destroy_cnext(c);
2562 if (c->zroot.znode) {
2563 clean_freed = ubifs_destroy_tnc_subtree(c->zroot.znode);
2564 atomic_long_sub(clean_freed, &ubifs_clean_zn_cnt);
2565 }
2566 kfree(c->gap_lebs);
2567 kfree(c->ilebs);
2568 destroy_old_idx(c);
2569}
2570
2571/**
2572 * left_znode - get the znode to the left.
2573 * @c: UBIFS file-system description object
2574 * @znode: znode
2575 *
2576 * This function returns a pointer to the znode to the left of @znode or NULL if
2577 * there is not one. A negative error code is returned on failure.
2578 */
2579static struct ubifs_znode *left_znode(struct ubifs_info *c,
2580 struct ubifs_znode *znode)
2581{
2582 int level = znode->level;
2583
2584 while (1) {
2585 int n = znode->iip - 1;
2586
2587 /* Go up until we can go left */
2588 znode = znode->parent;
2589 if (!znode)
2590 return NULL;
2591 if (n >= 0) {
2592 /* Now go down the rightmost branch to 'level' */
2593 znode = get_znode(c, znode, n);
2594 if (IS_ERR(znode))
2595 return znode;
2596 while (znode->level != level) {
2597 n = znode->child_cnt - 1;
2598 znode = get_znode(c, znode, n);
2599 if (IS_ERR(znode))
2600 return znode;
2601 }
2602 break;
2603 }
2604 }
2605 return znode;
2606}
2607
2608/**
2609 * right_znode - get the znode to the right.
2610 * @c: UBIFS file-system description object
2611 * @znode: znode
2612 *
2613 * This function returns a pointer to the znode to the right of @znode or NULL
2614 * if there is not one. A negative error code is returned on failure.
2615 */
2616static struct ubifs_znode *right_znode(struct ubifs_info *c,
2617 struct ubifs_znode *znode)
2618{
2619 int level = znode->level;
2620
2621 while (1) {
2622 int n = znode->iip + 1;
2623
2624 /* Go up until we can go right */
2625 znode = znode->parent;
2626 if (!znode)
2627 return NULL;
2628 if (n < znode->child_cnt) {
2629 /* Now go down the leftmost branch to 'level' */
2630 znode = get_znode(c, znode, n);
2631 if (IS_ERR(znode))
2632 return znode;
2633 while (znode->level != level) {
2634 znode = get_znode(c, znode, 0);
2635 if (IS_ERR(znode))
2636 return znode;
2637 }
2638 break;
2639 }
2640 }
2641 return znode;
2642}
2643
2644/**
2645 * lookup_znode - find a particular indexing node from TNC.
2646 * @c: UBIFS file-system description object
2647 * @key: index node key to lookup
2648 * @level: index node level
2649 * @lnum: index node LEB number
2650 * @offs: index node offset
2651 *
2652 * This function searches an indexing node by its first key @key and its
2653 * address @lnum:@offs. It looks up the indexing tree by pulling all indexing
2654 * nodes it traverses to TNC. This function is called fro indexing nodes which
2655 * were found on the media by scanning, for example when garbage-collecting or
2656 * when doing in-the-gaps commit. This means that the indexing node which is
2657 * looked for does not have to have exactly the same leftmost key @key, because
2658 * the leftmost key may have been changed, in which case TNC will contain a
2659 * dirty znode which still refers the same @lnum:@offs. This function is clever
2660 * enough to recognize such indexing nodes.
2661 *
2662 * Note, if a znode was deleted or changed too much, then this function will
2663 * not find it. For situations like this UBIFS has the old index RB-tree
2664 * (indexed by @lnum:@offs).
2665 *
2666 * This function returns a pointer to the znode found or %NULL if it is not
2667 * found. A negative error code is returned on failure.
2668 */
2669static struct ubifs_znode *lookup_znode(struct ubifs_info *c,
2670 union ubifs_key *key, int level,
2671 int lnum, int offs)
2672{
2673 struct ubifs_znode *znode, *zn;
2674 int n, nn;
2675
2676 /*
2677 * The arguments have probably been read off flash, so don't assume
2678 * they are valid.
2679 */
2680 if (level < 0)
2681 return ERR_PTR(-EINVAL);
2682
2683 /* Get the root znode */
2684 znode = c->zroot.znode;
2685 if (!znode) {
2686 znode = ubifs_load_znode(c, &c->zroot, NULL, 0);
2687 if (IS_ERR(znode))
2688 return znode;
2689 }
2690 /* Check if it is the one we are looking for */
2691 if (c->zroot.lnum == lnum && c->zroot.offs == offs)
2692 return znode;
2693 /* Descend to the parent level i.e. (level + 1) */
2694 if (level >= znode->level)
2695 return NULL;
2696 while (1) {
2697 ubifs_search_zbranch(c, znode, key, &n);
2698 if (n < 0) {
2699 /*
2700 * We reached a znode where the leftmost key is greater
2701 * than the key we are searching for. This is the same
2702 * situation as the one described in a huge comment at
2703 * the end of the 'ubifs_lookup_level0()' function. And
2704 * for exactly the same reasons we have to try to look
2705 * left before giving up.
2706 */
2707 znode = left_znode(c, znode);
2708 if (!znode)
2709 return NULL;
2710 if (IS_ERR(znode))
2711 return znode;
2712 ubifs_search_zbranch(c, znode, key, &n);
2713 ubifs_assert(n >= 0);
2714 }
2715 if (znode->level == level + 1)
2716 break;
2717 znode = get_znode(c, znode, n);
2718 if (IS_ERR(znode))
2719 return znode;
2720 }
2721 /* Check if the child is the one we are looking for */
2722 if (znode->zbranch[n].lnum == lnum && znode->zbranch[n].offs == offs)
2723 return get_znode(c, znode, n);
2724 /* If the key is unique, there is nowhere else to look */
2725 if (!is_hash_key(c, key))
2726 return NULL;
2727 /*
2728 * The key is not unique and so may be also in the znodes to either
2729 * side.
2730 */
2731 zn = znode;
2732 nn = n;
2733 /* Look left */
2734 while (1) {
2735 /* Move one branch to the left */
2736 if (n)
2737 n -= 1;
2738 else {
2739 znode = left_znode(c, znode);
2740 if (!znode)
2741 break;
2742 if (IS_ERR(znode))
2743 return znode;
2744 n = znode->child_cnt - 1;
2745 }
2746 /* Check it */
2747 if (znode->zbranch[n].lnum == lnum &&
2748 znode->zbranch[n].offs == offs)
2749 return get_znode(c, znode, n);
2750 /* Stop if the key is less than the one we are looking for */
2751 if (keys_cmp(c, &znode->zbranch[n].key, key) < 0)
2752 break;
2753 }
2754 /* Back to the middle */
2755 znode = zn;
2756 n = nn;
2757 /* Look right */
2758 while (1) {
2759 /* Move one branch to the right */
2760 if (++n >= znode->child_cnt) {
2761 znode = right_znode(c, znode);
2762 if (!znode)
2763 break;
2764 if (IS_ERR(znode))
2765 return znode;
2766 n = 0;
2767 }
2768 /* Check it */
2769 if (znode->zbranch[n].lnum == lnum &&
2770 znode->zbranch[n].offs == offs)
2771 return get_znode(c, znode, n);
2772 /* Stop if the key is greater than the one we are looking for */
2773 if (keys_cmp(c, &znode->zbranch[n].key, key) > 0)
2774 break;
2775 }
2776 return NULL;
2777}
2778
/**
 * is_idx_node_in_tnc - determine if an index node is in the TNC.
 * @c: UBIFS file-system description object
 * @key: key of index node
 * @level: index node level
 * @lnum: LEB number of index node
 * @offs: offset of index node
 *
 * This function returns %0 if the index node is not referred to in the TNC, %1
 * if the index node is referred to in the TNC and the corresponding znode is
 * dirty, %2 if an index node is referred to in the TNC and the corresponding
 * znode is clean, and a negative error code in case of failure.
 *
 * Note, the @key argument has to be the key of the first child. Also note,
 * this function relies on the fact that 0:0 is never a valid LEB number and
 * offset for a main-area node.
 */
int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level,
		       int lnum, int offs)
{
	struct ubifs_znode *znode = lookup_znode(c, key, level, lnum, offs);

	if (IS_ERR(znode))
		return PTR_ERR(znode);
	if (!znode)
		return 0;
	if (ubifs_zn_dirty(znode))
		return 1;
	return 2;
}
2809
2810/**
2811 * is_leaf_node_in_tnc - determine if a non-indexing not is in the TNC.
2812 * @c: UBIFS file-system description object
2813 * @key: node key
2814 * @lnum: node LEB number
2815 * @offs: node offset
2816 *
2817 * This function returns %1 if the node is referred to in the TNC, %0 if it is
2818 * not, and a negative error code in case of failure.
2819 *
2820 * Note, this function relies on the fact that 0:0 is never a valid LEB number
2821 * and offset for a main-area node.
2822 */
2823static int is_leaf_node_in_tnc(struct ubifs_info *c, union ubifs_key *key,
2824 int lnum, int offs)
2825{
2826 struct ubifs_zbranch *zbr;
2827 struct ubifs_znode *znode, *zn;
2828 int n, found, err, nn;
2829 const int unique = !is_hash_key(c, key);
2830
2831 found = ubifs_lookup_level0(c, key, &znode, &n);
2832 if (found < 0)
2833 return found; /* Error code */
2834 if (!found)
2835 return 0;
2836 zbr = &znode->zbranch[n];
2837 if (lnum == zbr->lnum && offs == zbr->offs)
2838 return 1; /* Found it */
2839 if (unique)
2840 return 0;
2841 /*
2842 * Because the key is not unique, we have to look left
2843 * and right as well
2844 */
2845 zn = znode;
2846 nn = n;
2847 /* Look left */
2848 while (1) {
2849 err = tnc_prev(c, &znode, &n);
2850 if (err == -ENOENT)
2851 break;
2852 if (err)
2853 return err;
2854 if (keys_cmp(c, key, &znode->zbranch[n].key))
2855 break;
2856 zbr = &znode->zbranch[n];
2857 if (lnum == zbr->lnum && offs == zbr->offs)
2858 return 1; /* Found it */
2859 }
2860 /* Look right */
2861 znode = zn;
2862 n = nn;
2863 while (1) {
2864 err = tnc_next(c, &znode, &n);
2865 if (err) {
2866 if (err == -ENOENT)
2867 return 0;
2868 return err;
2869 }
2870 if (keys_cmp(c, key, &znode->zbranch[n].key))
2871 break;
2872 zbr = &znode->zbranch[n];
2873 if (lnum == zbr->lnum && offs == zbr->offs)
2874 return 1; /* Found it */
2875 }
2876 return 0;
2877}
2878
2879/**
2880 * ubifs_tnc_has_node - determine whether a node is in the TNC.
2881 * @c: UBIFS file-system description object
2882 * @key: node key
2883 * @level: index node level (if it is an index node)
2884 * @lnum: node LEB number
2885 * @offs: node offset
2886 * @is_idx: non-zero if the node is an index node
2887 *
2888 * This function returns %1 if the node is in the TNC, %0 if it is not, and a
2889 * negative error code in case of failure. For index nodes, @key has to be the
2890 * key of the first child. An index node is considered to be in the TNC only if
2891 * the corresponding znode is clean or has not been loaded.
2892 */
2893int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level,
2894 int lnum, int offs, int is_idx)
2895{
2896 int err;
2897
2898 mutex_lock(&c->tnc_mutex);
2899 if (is_idx) {
2900 err = is_idx_node_in_tnc(c, key, level, lnum, offs);
2901 if (err < 0)
2902 goto out_unlock;
2903 if (err == 1)
2904 /* The index node was found but it was dirty */
2905 err = 0;
2906 else if (err == 2)
2907 /* The index node was found and it was clean */
2908 err = 1;
2909 else
2910 BUG_ON(err != 0);
2911 } else
2912 err = is_leaf_node_in_tnc(c, key, lnum, offs);
2913
2914out_unlock:
2915 mutex_unlock(&c->tnc_mutex);
2916 return err;
2917}
2918
2919/**
2920 * ubifs_dirty_idx_node - dirty an index node.
2921 * @c: UBIFS file-system description object
2922 * @key: index node key
2923 * @level: index node level
2924 * @lnum: index node LEB number
2925 * @offs: index node offset
2926 *
2927 * This function loads and dirties an index node so that it can be garbage
2928 * collected. The @key argument has to be the key of the first child. This
2929 * function relies on the fact that 0:0 is never a valid LEB number and offset
2930 * for a main-area node. Returns %0 on success and a negative error code on
2931 * failure.
2932 */
2933int ubifs_dirty_idx_node(struct ubifs_info *c, union ubifs_key *key, int level,
2934 int lnum, int offs)
2935{
2936 struct ubifs_znode *znode;
2937 int err = 0;
2938
2939 mutex_lock(&c->tnc_mutex);
2940 znode = lookup_znode(c, key, level, lnum, offs);
2941 if (!znode)
2942 goto out_unlock;
2943 if (IS_ERR(znode)) {
2944 err = PTR_ERR(znode);
2945 goto out_unlock;
2946 }
2947 znode = dirty_cow_bottom_up(c, znode);
2948 if (IS_ERR(znode)) {
2949 err = PTR_ERR(znode);
2950 goto out_unlock;
2951 }
2952
2953out_unlock:
2954 mutex_unlock(&c->tnc_mutex);
2955 return err;
2956}
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
new file mode 100644
index 000000000000..8117e65ba2e9
--- /dev/null
+++ b/fs/ubifs/tnc_commit.c
@@ -0,0 +1,1103 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Adrian Hunter
20 * Artem Bityutskiy (Битюцкий Артём)
21 */
22
23/* This file implements TNC functions for committing */
24
25#include "ubifs.h"
26
27/**
28 * make_idx_node - make an index node for fill-the-gaps method of TNC commit.
29 * @c: UBIFS file-system description object
30 * @idx: buffer in which to place new index node
31 * @znode: znode from which to make new index node
32 * @lnum: LEB number where new index node will be written
33 * @offs: offset where new index node will be written
34 * @len: length of new index node
35 */
36static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx,
37 struct ubifs_znode *znode, int lnum, int offs, int len)
38{
39 struct ubifs_znode *zp;
40 int i, err;
41
42 /* Make index node */
43 idx->ch.node_type = UBIFS_IDX_NODE;
44 idx->child_cnt = cpu_to_le16(znode->child_cnt);
45 idx->level = cpu_to_le16(znode->level);
46 for (i = 0; i < znode->child_cnt; i++) {
47 struct ubifs_branch *br = ubifs_idx_branch(c, idx, i);
48 struct ubifs_zbranch *zbr = &znode->zbranch[i];
49
50 key_write_idx(c, &zbr->key, &br->key);
51 br->lnum = cpu_to_le32(zbr->lnum);
52 br->offs = cpu_to_le32(zbr->offs);
53 br->len = cpu_to_le32(zbr->len);
54 if (!zbr->lnum || !zbr->len) {
55 ubifs_err("bad ref in znode");
56 dbg_dump_znode(c, znode);
57 if (zbr->znode)
58 dbg_dump_znode(c, zbr->znode);
59 }
60 }
61 ubifs_prepare_node(c, idx, len, 0);
62
63#ifdef CONFIG_UBIFS_FS_DEBUG
64 znode->lnum = lnum;
65 znode->offs = offs;
66 znode->len = len;
67#endif
68
69 err = insert_old_idx_znode(c, znode);
70
71 /* Update the parent */
72 zp = znode->parent;
73 if (zp) {
74 struct ubifs_zbranch *zbr;
75
76 zbr = &zp->zbranch[znode->iip];
77 zbr->lnum = lnum;
78 zbr->offs = offs;
79 zbr->len = len;
80 } else {
81 c->zroot.lnum = lnum;
82 c->zroot.offs = offs;
83 c->zroot.len = len;
84 }
85 c->calc_idx_sz += ALIGN(len, 8);
86
87 atomic_long_dec(&c->dirty_zn_cnt);
88
89 ubifs_assert(ubifs_zn_dirty(znode));
90 ubifs_assert(test_bit(COW_ZNODE, &znode->flags));
91
92 __clear_bit(DIRTY_ZNODE, &znode->flags);
93 __clear_bit(COW_ZNODE, &znode->flags);
94
95 return err;
96}
97
98/**
99 * fill_gap - make index nodes in gaps in dirty index LEBs.
100 * @c: UBIFS file-system description object
101 * @lnum: LEB number that gap appears in
102 * @gap_start: offset of start of gap
103 * @gap_end: offset of end of gap
104 * @dirt: adds dirty space to this
105 *
106 * This function returns the number of index nodes written into the gap.
107 */
108static int fill_gap(struct ubifs_info *c, int lnum, int gap_start, int gap_end,
109 int *dirt)
110{
111 int len, gap_remains, gap_pos, written, pad_len;
112
113 ubifs_assert((gap_start & 7) == 0);
114 ubifs_assert((gap_end & 7) == 0);
115 ubifs_assert(gap_end >= gap_start);
116
117 gap_remains = gap_end - gap_start;
118 if (!gap_remains)
119 return 0;
120 gap_pos = gap_start;
121 written = 0;
122 while (c->enext) {
123 len = ubifs_idx_node_sz(c, c->enext->child_cnt);
124 if (len < gap_remains) {
125 struct ubifs_znode *znode = c->enext;
126 const int alen = ALIGN(len, 8);
127 int err;
128
129 ubifs_assert(alen <= gap_remains);
130 err = make_idx_node(c, c->ileb_buf + gap_pos, znode,
131 lnum, gap_pos, len);
132 if (err)
133 return err;
134 gap_remains -= alen;
135 gap_pos += alen;
136 c->enext = znode->cnext;
137 if (c->enext == c->cnext)
138 c->enext = NULL;
139 written += 1;
140 } else
141 break;
142 }
143 if (gap_end == c->leb_size) {
144 c->ileb_len = ALIGN(gap_pos, c->min_io_size);
145 /* Pad to end of min_io_size */
146 pad_len = c->ileb_len - gap_pos;
147 } else
148 /* Pad to end of gap */
149 pad_len = gap_remains;
150 dbg_gc("LEB %d:%d to %d len %d nodes written %d wasted bytes %d",
151 lnum, gap_start, gap_end, gap_end - gap_start, written, pad_len);
152 ubifs_pad(c, c->ileb_buf + gap_pos, pad_len);
153 *dirt += pad_len;
154 return written;
155}
156
157/**
158 * find_old_idx - find an index node obsoleted since the last commit start.
159 * @c: UBIFS file-system description object
160 * @lnum: LEB number of obsoleted index node
161 * @offs: offset of obsoleted index node
162 *
163 * Returns %1 if found and %0 otherwise.
164 */
165static int find_old_idx(struct ubifs_info *c, int lnum, int offs)
166{
167 struct ubifs_old_idx *o;
168 struct rb_node *p;
169
170 p = c->old_idx.rb_node;
171 while (p) {
172 o = rb_entry(p, struct ubifs_old_idx, rb);
173 if (lnum < o->lnum)
174 p = p->rb_left;
175 else if (lnum > o->lnum)
176 p = p->rb_right;
177 else if (offs < o->offs)
178 p = p->rb_left;
179 else if (offs > o->offs)
180 p = p->rb_right;
181 else
182 return 1;
183 }
184 return 0;
185}
186
/**
 * is_idx_node_in_use - determine if an index node can be overwritten.
 * @c: UBIFS file-system description object
 * @key: key of index node
 * @level: index node level
 * @lnum: LEB number of index node
 * @offs: offset of index node
 *
 * If @key / @lnum / @offs identify an index node that was not part of the old
 * index, then this function returns %0 (obsolete). Else if the index node was
 * part of the old index but is now dirty %1 is returned, else if it is clean %2
 * is returned. A negative error code is returned on failure.
 */
static int is_idx_node_in_use(struct ubifs_info *c, union ubifs_key *key,
			      int level, int lnum, int offs)
{
	int ret = is_idx_node_in_tnc(c, key, level, lnum, offs);

	/*
	 * A node which is not in the TNC any more still counts as in use
	 * (and dirty) if it is recorded in the old index.
	 */
	if (ret == 0 && find_old_idx(c, lnum, offs))
		return 1;

	/* Error code, or the TNC's own answer */
	return ret;
}
213
214/**
215 * layout_leb_in_gaps - layout index nodes using in-the-gaps method.
216 * @c: UBIFS file-system description object
217 * @p: return LEB number here
218 *
219 * This function lays out new index nodes for dirty znodes using in-the-gaps
220 * method of TNC commit.
221 * This function merely puts the next znode into the next gap, making no attempt
222 * to try to maximise the number of znodes that fit.
223 * This function returns the number of index nodes written into the gaps, or a
224 * negative error code on failure.
225 */
226static int layout_leb_in_gaps(struct ubifs_info *c, int *p)
227{
228 struct ubifs_scan_leb *sleb;
229 struct ubifs_scan_node *snod;
230 int lnum, dirt = 0, gap_start, gap_end, err, written, tot_written;
231
232 tot_written = 0;
233 /* Get an index LEB with lots of obsolete index nodes */
234 lnum = ubifs_find_dirty_idx_leb(c);
235 if (lnum < 0)
236 /*
237 * There also may be dirt in the index head that could be
238 * filled, however we do not check there at present.
239 */
240 return lnum; /* Error code */
241 *p = lnum;
242 dbg_gc("LEB %d", lnum);
243 /*
244 * Scan the index LEB. We use the generic scan for this even though
245 * it is more comprehensive and less efficient than is needed for this
246 * purpose.
247 */
248 sleb = ubifs_scan(c, lnum, 0, c->ileb_buf);
249 c->ileb_len = 0;
250 if (IS_ERR(sleb))
251 return PTR_ERR(sleb);
252 gap_start = 0;
253 list_for_each_entry(snod, &sleb->nodes, list) {
254 struct ubifs_idx_node *idx;
255 int in_use, level;
256
257 ubifs_assert(snod->type == UBIFS_IDX_NODE);
258 idx = snod->node;
259 key_read(c, ubifs_idx_key(c, idx), &snod->key);
260 level = le16_to_cpu(idx->level);
261 /* Determine if the index node is in use (not obsolete) */
262 in_use = is_idx_node_in_use(c, &snod->key, level, lnum,
263 snod->offs);
264 if (in_use < 0) {
265 ubifs_scan_destroy(sleb);
266 return in_use; /* Error code */
267 }
268 if (in_use) {
269 if (in_use == 1)
270 dirt += ALIGN(snod->len, 8);
271 /*
272 * The obsolete index nodes form gaps that can be
273 * overwritten. This gap has ended because we have
274 * found an index node that is still in use
275 * i.e. not obsolete
276 */
277 gap_end = snod->offs;
278 /* Try to fill gap */
279 written = fill_gap(c, lnum, gap_start, gap_end, &dirt);
280 if (written < 0) {
281 ubifs_scan_destroy(sleb);
282 return written; /* Error code */
283 }
284 tot_written += written;
285 gap_start = ALIGN(snod->offs + snod->len, 8);
286 }
287 }
288 ubifs_scan_destroy(sleb);
289 c->ileb_len = c->leb_size;
290 gap_end = c->leb_size;
291 /* Try to fill gap */
292 written = fill_gap(c, lnum, gap_start, gap_end, &dirt);
293 if (written < 0)
294 return written; /* Error code */
295 tot_written += written;
296 if (tot_written == 0) {
297 struct ubifs_lprops lp;
298
299 dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written);
300 err = ubifs_read_one_lp(c, lnum, &lp);
301 if (err)
302 return err;
303 if (lp.free == c->leb_size) {
304 /*
305 * We must have snatched this LEB from the idx_gc list
306 * so we need to correct the free and dirty space.
307 */
308 err = ubifs_change_one_lp(c, lnum,
309 c->leb_size - c->ileb_len,
310 dirt, 0, 0, 0);
311 if (err)
312 return err;
313 }
314 return 0;
315 }
316 err = ubifs_change_one_lp(c, lnum, c->leb_size - c->ileb_len, dirt,
317 0, 0, 0);
318 if (err)
319 return err;
320 err = ubifs_leb_change(c, lnum, c->ileb_buf, c->ileb_len,
321 UBI_SHORTTERM);
322 if (err)
323 return err;
324 dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written);
325 return tot_written;
326}
327
328/**
329 * get_leb_cnt - calculate the number of empty LEBs needed to commit.
330 * @c: UBIFS file-system description object
331 * @cnt: number of znodes to commit
332 *
333 * This function returns the number of empty LEBs needed to commit @cnt znodes
334 * to the current index head. The number is not exact and may be more than
335 * needed.
336 */
337static int get_leb_cnt(struct ubifs_info *c, int cnt)
338{
339 int d;
340
341 /* Assume maximum index node size (i.e. overestimate space needed) */
342 cnt -= (c->leb_size - c->ihead_offs) / c->max_idx_node_sz;
343 if (cnt < 0)
344 cnt = 0;
345 d = c->leb_size / c->max_idx_node_sz;
346 return DIV_ROUND_UP(cnt, d);
347}
348
349/**
350 * layout_in_gaps - in-the-gaps method of committing TNC.
351 * @c: UBIFS file-system description object
352 * @cnt: number of dirty znodes to commit.
353 *
354 * This function lays out new index nodes for dirty znodes using in-the-gaps
355 * method of TNC commit.
356 *
357 * This function returns %0 on success and a negative error code on failure.
358 */
359static int layout_in_gaps(struct ubifs_info *c, int cnt)
360{
361 int err, leb_needed_cnt, written, *p;
362
363 dbg_gc("%d znodes to write", cnt);
364
365 c->gap_lebs = kmalloc(sizeof(int) * (c->lst.idx_lebs + 1), GFP_NOFS);
366 if (!c->gap_lebs)
367 return -ENOMEM;
368
369 p = c->gap_lebs;
370 do {
371 ubifs_assert(p < c->gap_lebs + sizeof(int) * c->lst.idx_lebs);
372 written = layout_leb_in_gaps(c, p);
373 if (written < 0) {
374 err = written;
375 if (err == -ENOSPC) {
376 if (!dbg_force_in_the_gaps_enabled) {
377 /*
378 * Do not print scary warnings if the
379 * debugging option which forces
380 * in-the-gaps is enabled.
381 */
382 ubifs_err("out of space");
383 spin_lock(&c->space_lock);
384 dbg_dump_budg(c);
385 spin_unlock(&c->space_lock);
386 dbg_dump_lprops(c);
387 }
388 /* Try to commit anyway */
389 err = 0;
390 break;
391 }
392 kfree(c->gap_lebs);
393 c->gap_lebs = NULL;
394 return err;
395 }
396 p++;
397 cnt -= written;
398 leb_needed_cnt = get_leb_cnt(c, cnt);
399 dbg_gc("%d znodes remaining, need %d LEBs, have %d", cnt,
400 leb_needed_cnt, c->ileb_cnt);
401 } while (leb_needed_cnt > c->ileb_cnt);
402
403 *p = -1;
404 return 0;
405}
406
407/**
408 * layout_in_empty_space - layout index nodes in empty space.
409 * @c: UBIFS file-system description object
410 *
411 * This function lays out new index nodes for dirty znodes using empty LEBs.
412 *
413 * This function returns %0 on success and a negative error code on failure.
414 */
415static int layout_in_empty_space(struct ubifs_info *c)
416{
417 struct ubifs_znode *znode, *cnext, *zp;
418 int lnum, offs, len, next_len, buf_len, buf_offs, used, avail;
419 int wlen, blen, err;
420
421 cnext = c->enext;
422 if (!cnext)
423 return 0;
424
425 lnum = c->ihead_lnum;
426 buf_offs = c->ihead_offs;
427
428 buf_len = ubifs_idx_node_sz(c, c->fanout);
429 buf_len = ALIGN(buf_len, c->min_io_size);
430 used = 0;
431 avail = buf_len;
432
433 /* Ensure there is enough room for first write */
434 next_len = ubifs_idx_node_sz(c, cnext->child_cnt);
435 if (buf_offs + next_len > c->leb_size)
436 lnum = -1;
437
438 while (1) {
439 znode = cnext;
440
441 len = ubifs_idx_node_sz(c, znode->child_cnt);
442
443 /* Determine the index node position */
444 if (lnum == -1) {
445 if (c->ileb_nxt >= c->ileb_cnt) {
446 ubifs_err("out of space");
447 return -ENOSPC;
448 }
449 lnum = c->ilebs[c->ileb_nxt++];
450 buf_offs = 0;
451 used = 0;
452 avail = buf_len;
453 }
454
455 offs = buf_offs + used;
456
457#ifdef CONFIG_UBIFS_FS_DEBUG
458 znode->lnum = lnum;
459 znode->offs = offs;
460 znode->len = len;
461#endif
462
463 /* Update the parent */
464 zp = znode->parent;
465 if (zp) {
466 struct ubifs_zbranch *zbr;
467 int i;
468
469 i = znode->iip;
470 zbr = &zp->zbranch[i];
471 zbr->lnum = lnum;
472 zbr->offs = offs;
473 zbr->len = len;
474 } else {
475 c->zroot.lnum = lnum;
476 c->zroot.offs = offs;
477 c->zroot.len = len;
478 }
479 c->calc_idx_sz += ALIGN(len, 8);
480
481 /*
482 * Once lprops is updated, we can decrease the dirty znode count
483 * but it is easier to just do it here.
484 */
485 atomic_long_dec(&c->dirty_zn_cnt);
486
487 /*
488 * Calculate the next index node length to see if there is
489 * enough room for it
490 */
491 cnext = znode->cnext;
492 if (cnext == c->cnext)
493 next_len = 0;
494 else
495 next_len = ubifs_idx_node_sz(c, cnext->child_cnt);
496
497 if (c->min_io_size == 1) {
498 buf_offs += ALIGN(len, 8);
499 if (next_len) {
500 if (buf_offs + next_len <= c->leb_size)
501 continue;
502 err = ubifs_update_one_lp(c, lnum, 0,
503 c->leb_size - buf_offs, 0, 0);
504 if (err)
505 return err;
506 lnum = -1;
507 continue;
508 }
509 err = ubifs_update_one_lp(c, lnum,
510 c->leb_size - buf_offs, 0, 0, 0);
511 if (err)
512 return err;
513 break;
514 }
515
516 /* Update buffer positions */
517 wlen = used + len;
518 used += ALIGN(len, 8);
519 avail -= ALIGN(len, 8);
520
521 if (next_len != 0 &&
522 buf_offs + used + next_len <= c->leb_size &&
523 avail > 0)
524 continue;
525
526 if (avail <= 0 && next_len &&
527 buf_offs + used + next_len <= c->leb_size)
528 blen = buf_len;
529 else
530 blen = ALIGN(wlen, c->min_io_size);
531
532 /* The buffer is full or there are no more znodes to do */
533 buf_offs += blen;
534 if (next_len) {
535 if (buf_offs + next_len > c->leb_size) {
536 err = ubifs_update_one_lp(c, lnum,
537 c->leb_size - buf_offs, blen - used,
538 0, 0);
539 if (err)
540 return err;
541 lnum = -1;
542 }
543 used -= blen;
544 if (used < 0)
545 used = 0;
546 avail = buf_len - used;
547 continue;
548 }
549 err = ubifs_update_one_lp(c, lnum, c->leb_size - buf_offs,
550 blen - used, 0, 0);
551 if (err)
552 return err;
553 break;
554 }
555
556#ifdef CONFIG_UBIFS_FS_DEBUG
557 c->new_ihead_lnum = lnum;
558 c->new_ihead_offs = buf_offs;
559#endif
560
561 return 0;
562}
563
/**
 * layout_commit - determine positions of index nodes to commit.
 * @c: UBIFS file-system description object
 * @no_space: indicates that insufficient empty LEBs were allocated
 * @cnt: number of znodes to commit
 *
 * Calculate and update the positions of index nodes to commit. If there were
 * an insufficient number of empty LEBs allocated, then index nodes are first
 * placed into the gaps created by obsolete index nodes in non-empty index
 * LEBs. For this purpose, an obsolete index node is one that was not in the
 * index as at the end of the last commit. To write "in-the-gaps" requires that
 * those index LEBs are updated atomically in-place. Whatever is left is laid
 * out in empty space.
 *
 * Returns %0 on success and a negative error code on failure.
 */
static int layout_commit(struct ubifs_info *c, int no_space, int cnt)
{
	if (no_space) {
		int err = layout_in_gaps(c, cnt);

		if (err)
			return err;
	}

	return layout_in_empty_space(c);
}
589
590/**
591 * find_first_dirty - find first dirty znode.
592 * @znode: znode to begin searching from
593 */
594static struct ubifs_znode *find_first_dirty(struct ubifs_znode *znode)
595{
596 int i, cont;
597
598 if (!znode)
599 return NULL;
600
601 while (1) {
602 if (znode->level == 0) {
603 if (ubifs_zn_dirty(znode))
604 return znode;
605 return NULL;
606 }
607 cont = 0;
608 for (i = 0; i < znode->child_cnt; i++) {
609 struct ubifs_zbranch *zbr = &znode->zbranch[i];
610
611 if (zbr->znode && ubifs_zn_dirty(zbr->znode)) {
612 znode = zbr->znode;
613 cont = 1;
614 break;
615 }
616 }
617 if (!cont) {
618 if (ubifs_zn_dirty(znode))
619 return znode;
620 return NULL;
621 }
622 }
623}
624
625/**
626 * find_next_dirty - find next dirty znode.
627 * @znode: znode to begin searching from
628 */
629static struct ubifs_znode *find_next_dirty(struct ubifs_znode *znode)
630{
631 int n = znode->iip + 1;
632
633 znode = znode->parent;
634 if (!znode)
635 return NULL;
636 for (; n < znode->child_cnt; n++) {
637 struct ubifs_zbranch *zbr = &znode->zbranch[n];
638
639 if (zbr->znode && ubifs_zn_dirty(zbr->znode))
640 return find_first_dirty(zbr->znode);
641 }
642 return znode;
643}
644
645/**
646 * get_znodes_to_commit - create list of dirty znodes to commit.
647 * @c: UBIFS file-system description object
648 *
649 * This function returns the number of znodes to commit.
650 */
651static int get_znodes_to_commit(struct ubifs_info *c)
652{
653 struct ubifs_znode *znode, *cnext;
654 int cnt = 0;
655
656 c->cnext = find_first_dirty(c->zroot.znode);
657 znode = c->enext = c->cnext;
658 if (!znode) {
659 dbg_cmt("no znodes to commit");
660 return 0;
661 }
662 cnt += 1;
663 while (1) {
664 ubifs_assert(!test_bit(COW_ZNODE, &znode->flags));
665 __set_bit(COW_ZNODE, &znode->flags);
666 znode->alt = 0;
667 cnext = find_next_dirty(znode);
668 if (!cnext) {
669 znode->cnext = c->cnext;
670 break;
671 }
672 znode->cnext = cnext;
673 znode = cnext;
674 cnt += 1;
675 }
676 dbg_cmt("committing %d znodes", cnt);
677 ubifs_assert(cnt == atomic_long_read(&c->dirty_zn_cnt));
678 return cnt;
679}
680
681/**
682 * alloc_idx_lebs - allocate empty LEBs to be used to commit.
683 * @c: UBIFS file-system description object
684 * @cnt: number of znodes to commit
685 *
686 * This function returns %-ENOSPC if it cannot allocate a sufficient number of
687 * empty LEBs. %0 is returned on success, otherwise a negative error code
688 * is returned.
689 */
690static int alloc_idx_lebs(struct ubifs_info *c, int cnt)
691{
692 int i, leb_cnt, lnum;
693
694 c->ileb_cnt = 0;
695 c->ileb_nxt = 0;
696 leb_cnt = get_leb_cnt(c, cnt);
697 dbg_cmt("need about %d empty LEBS for TNC commit", leb_cnt);
698 if (!leb_cnt)
699 return 0;
700 c->ilebs = kmalloc(leb_cnt * sizeof(int), GFP_NOFS);
701 if (!c->ilebs)
702 return -ENOMEM;
703 for (i = 0; i < leb_cnt; i++) {
704 lnum = ubifs_find_free_leb_for_idx(c);
705 if (lnum < 0)
706 return lnum;
707 c->ilebs[c->ileb_cnt++] = lnum;
708 dbg_cmt("LEB %d", lnum);
709 }
710 if (dbg_force_in_the_gaps())
711 return -ENOSPC;
712 return 0;
713}
714
715/**
716 * free_unused_idx_lebs - free unused LEBs that were allocated for the commit.
717 * @c: UBIFS file-system description object
718 *
719 * It is possible that we allocate more empty LEBs for the commit than we need.
720 * This functions frees the surplus.
721 *
722 * This function returns %0 on success and a negative error code on failure.
723 */
724static int free_unused_idx_lebs(struct ubifs_info *c)
725{
726 int i, err = 0, lnum, er;
727
728 for (i = c->ileb_nxt; i < c->ileb_cnt; i++) {
729 lnum = c->ilebs[i];
730 dbg_cmt("LEB %d", lnum);
731 er = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
732 LPROPS_INDEX | LPROPS_TAKEN, 0);
733 if (!err)
734 err = er;
735 }
736 return err;
737}
738
739/**
740 * free_idx_lebs - free unused LEBs after commit end.
741 * @c: UBIFS file-system description object
742 *
743 * This function returns %0 on success and a negative error code on failure.
744 */
745static int free_idx_lebs(struct ubifs_info *c)
746{
747 int err;
748
749 err = free_unused_idx_lebs(c);
750 kfree(c->ilebs);
751 c->ilebs = NULL;
752 return err;
753}
754
755/**
756 * ubifs_tnc_start_commit - start TNC commit.
757 * @c: UBIFS file-system description object
758 * @zroot: new index root position is returned here
759 *
760 * This function prepares the list of indexing nodes to commit and lays out
761 * their positions on flash. If there is not enough free space it uses the
762 * in-gap commit method. Returns zero in case of success and a negative error
763 * code in case of failure.
764 */
765int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot)
766{
767 int err = 0, cnt;
768
769 mutex_lock(&c->tnc_mutex);
770 err = dbg_check_tnc(c, 1);
771 if (err)
772 goto out;
773 cnt = get_znodes_to_commit(c);
774 if (cnt != 0) {
775 int no_space = 0;
776
777 err = alloc_idx_lebs(c, cnt);
778 if (err == -ENOSPC)
779 no_space = 1;
780 else if (err)
781 goto out_free;
782 err = layout_commit(c, no_space, cnt);
783 if (err)
784 goto out_free;
785 ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0);
786 err = free_unused_idx_lebs(c);
787 if (err)
788 goto out;
789 }
790 destroy_old_idx(c);
791 memcpy(zroot, &c->zroot, sizeof(struct ubifs_zbranch));
792
793 err = ubifs_save_dirty_idx_lnums(c);
794 if (err)
795 goto out;
796
797 spin_lock(&c->space_lock);
798 /*
799 * Although we have not finished committing yet, update size of the
800 * committed index ('c->old_idx_sz') and zero out the index growth
801 * budget. It is OK to do this now, because we've reserved all the
802 * space which is needed to commit the index, and it is save for the
803 * budgeting subsystem to assume the index is already committed,
804 * even though it is not.
805 */
806 c->old_idx_sz = c->calc_idx_sz;
807 c->budg_uncommitted_idx = 0;
808 spin_unlock(&c->space_lock);
809 mutex_unlock(&c->tnc_mutex);
810
811 dbg_cmt("number of index LEBs %d", c->lst.idx_lebs);
812 dbg_cmt("size of index %llu", c->calc_idx_sz);
813 return err;
814
815out_free:
816 free_idx_lebs(c);
817out:
818 mutex_unlock(&c->tnc_mutex);
819 return err;
820}
821
822/**
823 * write_index - write index nodes.
824 * @c: UBIFS file-system description object
825 *
826 * This function writes the index nodes whose positions were laid out in the
827 * layout_in_empty_space function.
828 */
829static int write_index(struct ubifs_info *c)
830{
831 struct ubifs_idx_node *idx;
832 struct ubifs_znode *znode, *cnext;
833 int i, lnum, offs, len, next_len, buf_len, buf_offs, used;
834 int avail, wlen, err, lnum_pos = 0;
835
836 cnext = c->enext;
837 if (!cnext)
838 return 0;
839
840 /*
841 * Always write index nodes to the index head so that index nodes and
842 * other types of nodes are never mixed in the same erase block.
843 */
844 lnum = c->ihead_lnum;
845 buf_offs = c->ihead_offs;
846
847 /* Allocate commit buffer */
848 buf_len = ALIGN(c->max_idx_node_sz, c->min_io_size);
849 used = 0;
850 avail = buf_len;
851
852 /* Ensure there is enough room for first write */
853 next_len = ubifs_idx_node_sz(c, cnext->child_cnt);
854 if (buf_offs + next_len > c->leb_size) {
855 err = ubifs_update_one_lp(c, lnum, LPROPS_NC, 0, 0,
856 LPROPS_TAKEN);
857 if (err)
858 return err;
859 lnum = -1;
860 }
861
862 while (1) {
863 cond_resched();
864
865 znode = cnext;
866 idx = c->cbuf + used;
867
868 /* Make index node */
869 idx->ch.node_type = UBIFS_IDX_NODE;
870 idx->child_cnt = cpu_to_le16(znode->child_cnt);
871 idx->level = cpu_to_le16(znode->level);
872 for (i = 0; i < znode->child_cnt; i++) {
873 struct ubifs_branch *br = ubifs_idx_branch(c, idx, i);
874 struct ubifs_zbranch *zbr = &znode->zbranch[i];
875
876 key_write_idx(c, &zbr->key, &br->key);
877 br->lnum = cpu_to_le32(zbr->lnum);
878 br->offs = cpu_to_le32(zbr->offs);
879 br->len = cpu_to_le32(zbr->len);
880 if (!zbr->lnum || !zbr->len) {
881 ubifs_err("bad ref in znode");
882 dbg_dump_znode(c, znode);
883 if (zbr->znode)
884 dbg_dump_znode(c, zbr->znode);
885 }
886 }
887 len = ubifs_idx_node_sz(c, znode->child_cnt);
888 ubifs_prepare_node(c, idx, len, 0);
889
890 /* Determine the index node position */
891 if (lnum == -1) {
892 lnum = c->ilebs[lnum_pos++];
893 buf_offs = 0;
894 used = 0;
895 avail = buf_len;
896 }
897 offs = buf_offs + used;
898
899#ifdef CONFIG_UBIFS_FS_DEBUG
900 if (lnum != znode->lnum || offs != znode->offs ||
901 len != znode->len) {
902 ubifs_err("inconsistent znode posn");
903 return -EINVAL;
904 }
905#endif
906
907 /* Grab some stuff from znode while we still can */
908 cnext = znode->cnext;
909
910 ubifs_assert(ubifs_zn_dirty(znode));
911 ubifs_assert(test_bit(COW_ZNODE, &znode->flags));
912
913 /*
914 * It is important that other threads should see %DIRTY_ZNODE
915 * flag cleared before %COW_ZNODE. Specifically, it matters in
916 * the 'dirty_cow_znode()' function. This is the reason for the
917 * first barrier. Also, we want the bit changes to be seen to
918 * other threads ASAP, to avoid unnecesarry copying, which is
919 * the reason for the second barrier.
920 */
921 clear_bit(DIRTY_ZNODE, &znode->flags);
922 smp_mb__before_clear_bit();
923 clear_bit(COW_ZNODE, &znode->flags);
924 smp_mb__after_clear_bit();
925
926 /* Do not access znode from this point on */
927
928 /* Update buffer positions */
929 wlen = used + len;
930 used += ALIGN(len, 8);
931 avail -= ALIGN(len, 8);
932
933 /*
934 * Calculate the next index node length to see if there is
935 * enough room for it
936 */
937 if (cnext == c->cnext)
938 next_len = 0;
939 else
940 next_len = ubifs_idx_node_sz(c, cnext->child_cnt);
941
942 if (c->min_io_size == 1) {
943 /*
944 * Write the prepared index node immediately if there is
945 * no minimum IO size
946 */
947 err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs,
948 wlen, UBI_SHORTTERM);
949 if (err)
950 return err;
951 buf_offs += ALIGN(wlen, 8);
952 if (next_len) {
953 used = 0;
954 avail = buf_len;
955 if (buf_offs + next_len > c->leb_size) {
956 err = ubifs_update_one_lp(c, lnum,
957 LPROPS_NC, 0, 0, LPROPS_TAKEN);
958 if (err)
959 return err;
960 lnum = -1;
961 }
962 continue;
963 }
964 } else {
965 int blen, nxt_offs = buf_offs + used + next_len;
966
967 if (next_len && nxt_offs <= c->leb_size) {
968 if (avail > 0)
969 continue;
970 else
971 blen = buf_len;
972 } else {
973 wlen = ALIGN(wlen, 8);
974 blen = ALIGN(wlen, c->min_io_size);
975 ubifs_pad(c, c->cbuf + wlen, blen - wlen);
976 }
977 /*
978 * The buffer is full or there are no more znodes
979 * to do
980 */
981 err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs,
982 blen, UBI_SHORTTERM);
983 if (err)
984 return err;
985 buf_offs += blen;
986 if (next_len) {
987 if (nxt_offs > c->leb_size) {
988 err = ubifs_update_one_lp(c, lnum,
989 LPROPS_NC, 0, 0, LPROPS_TAKEN);
990 if (err)
991 return err;
992 lnum = -1;
993 }
994 used -= blen;
995 if (used < 0)
996 used = 0;
997 avail = buf_len - used;
998 memmove(c->cbuf, c->cbuf + blen, used);
999 continue;
1000 }
1001 }
1002 break;
1003 }
1004
1005#ifdef CONFIG_UBIFS_FS_DEBUG
1006 if (lnum != c->new_ihead_lnum || buf_offs != c->new_ihead_offs) {
1007 ubifs_err("inconsistent ihead");
1008 return -EINVAL;
1009 }
1010#endif
1011
1012 c->ihead_lnum = lnum;
1013 c->ihead_offs = buf_offs;
1014
1015 return 0;
1016}
1017
1018/**
1019 * free_obsolete_znodes - free obsolete znodes.
1020 * @c: UBIFS file-system description object
1021 *
1022 * At the end of commit end, obsolete znodes are freed.
1023 */
1024static void free_obsolete_znodes(struct ubifs_info *c)
1025{
1026 struct ubifs_znode *znode, *cnext;
1027
1028 cnext = c->cnext;
1029 do {
1030 znode = cnext;
1031 cnext = znode->cnext;
1032 if (test_bit(OBSOLETE_ZNODE, &znode->flags))
1033 kfree(znode);
1034 else {
1035 znode->cnext = NULL;
1036 atomic_long_inc(&c->clean_zn_cnt);
1037 atomic_long_inc(&ubifs_clean_zn_cnt);
1038 }
1039 } while (cnext != c->cnext);
1040}
1041
1042/**
1043 * return_gap_lebs - return LEBs used by the in-gap commit method.
1044 * @c: UBIFS file-system description object
1045 *
1046 * This function clears the "taken" flag for the LEBs which were used by the
1047 * "commit in-the-gaps" method.
1048 */
1049static int return_gap_lebs(struct ubifs_info *c)
1050{
1051 int *p, err;
1052
1053 if (!c->gap_lebs)
1054 return 0;
1055
1056 dbg_cmt("");
1057 for (p = c->gap_lebs; *p != -1; p++) {
1058 err = ubifs_change_one_lp(c, *p, LPROPS_NC, LPROPS_NC, 0,
1059 LPROPS_TAKEN, 0);
1060 if (err)
1061 return err;
1062 }
1063
1064 kfree(c->gap_lebs);
1065 c->gap_lebs = NULL;
1066 return 0;
1067}
1068
1069/**
1070 * ubifs_tnc_end_commit - update the TNC for commit end.
1071 * @c: UBIFS file-system description object
1072 *
1073 * Write the dirty znodes.
1074 */
1075int ubifs_tnc_end_commit(struct ubifs_info *c)
1076{
1077 int err;
1078
1079 if (!c->cnext)
1080 return 0;
1081
1082 err = return_gap_lebs(c);
1083 if (err)
1084 return err;
1085
1086 err = write_index(c);
1087 if (err)
1088 return err;
1089
1090 mutex_lock(&c->tnc_mutex);
1091
1092 dbg_cmt("TNC height is %d", c->zroot.znode->level + 1);
1093
1094 free_obsolete_znodes(c);
1095
1096 c->cnext = NULL;
1097 kfree(c->ilebs);
1098 c->ilebs = NULL;
1099
1100 mutex_unlock(&c->tnc_mutex);
1101
1102 return 0;
1103}
diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
new file mode 100644
index 000000000000..a25c1cc1f8d9
--- /dev/null
+++ b/fs/ubifs/tnc_misc.c
@@ -0,0 +1,494 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Adrian Hunter
20 * Artem Bityutskiy (Битюцкий Артём)
21 */
22
23/*
24 * This file contains miscellaneous TNC-related functions shared between
25 * different files. This file does not form any logically separate TNC
26 * sub-system. The file was created because there is a lot of TNC code and
27 * putting it all in one file would make that file too big and unreadable.
28 */
29
30#include "ubifs.h"
31
32/**
33 * ubifs_tnc_levelorder_next - next TNC tree element in levelorder traversal.
34 * @zr: root of the subtree to traverse
35 * @znode: previous znode
36 *
37 * This function implements levelorder TNC traversal. The LNC is ignored.
38 * Returns the next element or %NULL if @znode is already the last one.
39 */
40struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr,
41 struct ubifs_znode *znode)
42{
43 int level, iip, level_search = 0;
44 struct ubifs_znode *zn;
45
46 ubifs_assert(zr);
47
48 if (unlikely(!znode))
49 return zr;
50
51 if (unlikely(znode == zr)) {
52 if (znode->level == 0)
53 return NULL;
54 return ubifs_tnc_find_child(zr, 0);
55 }
56
57 level = znode->level;
58
59 iip = znode->iip;
60 while (1) {
61 ubifs_assert(znode->level <= zr->level);
62
63 /*
64 * First walk up until there is a znode with next branch to
65 * look at.
66 */
67 while (znode->parent != zr && iip >= znode->parent->child_cnt) {
68 znode = znode->parent;
69 iip = znode->iip;
70 }
71
72 if (unlikely(znode->parent == zr &&
73 iip >= znode->parent->child_cnt)) {
74 /* This level is done, switch to the lower one */
75 level -= 1;
76 if (level_search || level < 0)
77 /*
78 * We were already looking for znode at lower
79 * level ('level_search'). As we are here
80 * again, it just does not exist. Or all levels
81 * were finished ('level < 0').
82 */
83 return NULL;
84
85 level_search = 1;
86 iip = -1;
87 znode = ubifs_tnc_find_child(zr, 0);
88 ubifs_assert(znode);
89 }
90
91 /* Switch to the next index */
92 zn = ubifs_tnc_find_child(znode->parent, iip + 1);
93 if (!zn) {
94 /* No more children to look at, we have walk up */
95 iip = znode->parent->child_cnt;
96 continue;
97 }
98
99 /* Walk back down to the level we came from ('level') */
100 while (zn->level != level) {
101 znode = zn;
102 zn = ubifs_tnc_find_child(zn, 0);
103 if (!zn) {
104 /*
105 * This path is not too deep so it does not
106 * reach 'level'. Try next path.
107 */
108 iip = znode->iip;
109 break;
110 }
111 }
112
113 if (zn) {
114 ubifs_assert(zn->level >= 0);
115 return zn;
116 }
117 }
118}
119
120/**
121 * ubifs_search_zbranch - search znode branch.
122 * @c: UBIFS file-system description object
123 * @znode: znode to search in
124 * @key: key to search for
125 * @n: znode branch slot number is returned here
126 *
127 * This is a helper function which search branch with key @key in @znode using
128 * binary search. The result of the search may be:
129 * o exact match, then %1 is returned, and the slot number of the branch is
130 * stored in @n;
131 * o no exact match, then %0 is returned and the slot number of the left
132 * closest branch is returned in @n; the slot if all keys in this znode are
133 * greater than @key, then %-1 is returned in @n.
134 */
135int ubifs_search_zbranch(const struct ubifs_info *c,
136 const struct ubifs_znode *znode,
137 const union ubifs_key *key, int *n)
138{
139 int beg = 0, end = znode->child_cnt, uninitialized_var(mid);
140 int uninitialized_var(cmp);
141 const struct ubifs_zbranch *zbr = &znode->zbranch[0];
142
143 ubifs_assert(end > beg);
144
145 while (end > beg) {
146 mid = (beg + end) >> 1;
147 cmp = keys_cmp(c, key, &zbr[mid].key);
148 if (cmp > 0)
149 beg = mid + 1;
150 else if (cmp < 0)
151 end = mid;
152 else {
153 *n = mid;
154 return 1;
155 }
156 }
157
158 *n = end - 1;
159
160 /* The insert point is after *n */
161 ubifs_assert(*n >= -1 && *n < znode->child_cnt);
162 if (*n == -1)
163 ubifs_assert(keys_cmp(c, key, &zbr[0].key) < 0);
164 else
165 ubifs_assert(keys_cmp(c, key, &zbr[*n].key) > 0);
166 if (*n + 1 < znode->child_cnt)
167 ubifs_assert(keys_cmp(c, key, &zbr[*n + 1].key) < 0);
168
169 return 0;
170}
171
172/**
173 * ubifs_tnc_postorder_first - find first znode to do postorder tree traversal.
174 * @znode: znode to start at (root of the sub-tree to traverse)
175 *
176 * Find the lowest leftmost znode in a subtree of the TNC tree. The LNC is
177 * ignored.
178 */
179struct ubifs_znode *ubifs_tnc_postorder_first(struct ubifs_znode *znode)
180{
181 if (unlikely(!znode))
182 return NULL;
183
184 while (znode->level > 0) {
185 struct ubifs_znode *child;
186
187 child = ubifs_tnc_find_child(znode, 0);
188 if (!child)
189 return znode;
190 znode = child;
191 }
192
193 return znode;
194}
195
196/**
197 * ubifs_tnc_postorder_next - next TNC tree element in postorder traversal.
198 * @znode: previous znode
199 *
200 * This function implements postorder TNC traversal. The LNC is ignored.
201 * Returns the next element or %NULL if @znode is already the last one.
202 */
203struct ubifs_znode *ubifs_tnc_postorder_next(struct ubifs_znode *znode)
204{
205 struct ubifs_znode *zn;
206
207 ubifs_assert(znode);
208 if (unlikely(!znode->parent))
209 return NULL;
210
211 /* Switch to the next index in the parent */
212 zn = ubifs_tnc_find_child(znode->parent, znode->iip + 1);
213 if (!zn)
214 /* This is in fact the last child, return parent */
215 return znode->parent;
216
217 /* Go to the first znode in this new subtree */
218 return ubifs_tnc_postorder_first(zn);
219}
220
221/**
222 * ubifs_destroy_tnc_subtree - destroy all znodes connected to a subtree.
223 * @znode: znode defining subtree to destroy
224 *
225 * This function destroys subtree of the TNC tree. Returns number of clean
226 * znodes in the subtree.
227 */
228long ubifs_destroy_tnc_subtree(struct ubifs_znode *znode)
229{
230 struct ubifs_znode *zn = ubifs_tnc_postorder_first(znode);
231 long clean_freed = 0;
232 int n;
233
234 ubifs_assert(zn);
235 while (1) {
236 for (n = 0; n < zn->child_cnt; n++) {
237 if (!zn->zbranch[n].znode)
238 continue;
239
240 if (zn->level > 0 &&
241 !ubifs_zn_dirty(zn->zbranch[n].znode))
242 clean_freed += 1;
243
244 cond_resched();
245 kfree(zn->zbranch[n].znode);
246 }
247
248 if (zn == znode) {
249 if (!ubifs_zn_dirty(zn))
250 clean_freed += 1;
251 kfree(zn);
252 return clean_freed;
253 }
254
255 zn = ubifs_tnc_postorder_next(zn);
256 }
257}
258
259/**
260 * read_znode - read an indexing node from flash and fill znode.
261 * @c: UBIFS file-system description object
262 * @lnum: LEB of the indexing node to read
263 * @offs: node offset
264 * @len: node length
265 * @znode: znode to read to
266 *
267 * This function reads an indexing node from the flash media and fills znode
268 * with the read data. Returns zero in case of success and a negative error
269 * code in case of failure. The read indexing node is validated and if anything
270 * is wrong with it, this function prints complaint messages and returns
271 * %-EINVAL.
272 */
273static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
274 struct ubifs_znode *znode)
275{
276 int i, err, type, cmp;
277 struct ubifs_idx_node *idx;
278
279 idx = kmalloc(c->max_idx_node_sz, GFP_NOFS);
280 if (!idx)
281 return -ENOMEM;
282
283 err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs);
284 if (err < 0) {
285 kfree(idx);
286 return err;
287 }
288
289 znode->child_cnt = le16_to_cpu(idx->child_cnt);
290 znode->level = le16_to_cpu(idx->level);
291
292 dbg_tnc("LEB %d:%d, level %d, %d branch",
293 lnum, offs, znode->level, znode->child_cnt);
294
295 if (znode->child_cnt > c->fanout || znode->level > UBIFS_MAX_LEVELS) {
296 dbg_err("current fanout %d, branch count %d",
297 c->fanout, znode->child_cnt);
298 dbg_err("max levels %d, znode level %d",
299 UBIFS_MAX_LEVELS, znode->level);
300 err = 1;
301 goto out_dump;
302 }
303
304 for (i = 0; i < znode->child_cnt; i++) {
305 const struct ubifs_branch *br = ubifs_idx_branch(c, idx, i);
306 struct ubifs_zbranch *zbr = &znode->zbranch[i];
307
308 key_read(c, &br->key, &zbr->key);
309 zbr->lnum = le32_to_cpu(br->lnum);
310 zbr->offs = le32_to_cpu(br->offs);
311 zbr->len = le32_to_cpu(br->len);
312 zbr->znode = NULL;
313
314 /* Validate branch */
315
316 if (zbr->lnum < c->main_first ||
317 zbr->lnum >= c->leb_cnt || zbr->offs < 0 ||
318 zbr->offs + zbr->len > c->leb_size || zbr->offs & 7) {
319 dbg_err("bad branch %d", i);
320 err = 2;
321 goto out_dump;
322 }
323
324 switch (key_type(c, &zbr->key)) {
325 case UBIFS_INO_KEY:
326 case UBIFS_DATA_KEY:
327 case UBIFS_DENT_KEY:
328 case UBIFS_XENT_KEY:
329 break;
330 default:
331 dbg_msg("bad key type at slot %d: %s", i,
332 DBGKEY(&zbr->key));
333 err = 3;
334 goto out_dump;
335 }
336
337 if (znode->level)
338 continue;
339
340 type = key_type(c, &zbr->key);
341 if (c->ranges[type].max_len == 0) {
342 if (zbr->len != c->ranges[type].len) {
343 dbg_err("bad target node (type %d) length (%d)",
344 type, zbr->len);
345 dbg_err("have to be %d", c->ranges[type].len);
346 err = 4;
347 goto out_dump;
348 }
349 } else if (zbr->len < c->ranges[type].min_len ||
350 zbr->len > c->ranges[type].max_len) {
351 dbg_err("bad target node (type %d) length (%d)",
352 type, zbr->len);
353 dbg_err("have to be in range of %d-%d",
354 c->ranges[type].min_len,
355 c->ranges[type].max_len);
356 err = 5;
357 goto out_dump;
358 }
359 }
360
361 /*
362 * Ensure that the next key is greater or equivalent to the
363 * previous one.
364 */
365 for (i = 0; i < znode->child_cnt - 1; i++) {
366 const union ubifs_key *key1, *key2;
367
368 key1 = &znode->zbranch[i].key;
369 key2 = &znode->zbranch[i + 1].key;
370
371 cmp = keys_cmp(c, key1, key2);
372 if (cmp > 0) {
373 dbg_err("bad key order (keys %d and %d)", i, i + 1);
374 err = 6;
375 goto out_dump;
376 } else if (cmp == 0 && !is_hash_key(c, key1)) {
377 /* These can only be keys with colliding hash */
378 dbg_err("keys %d and %d are not hashed but equivalent",
379 i, i + 1);
380 err = 7;
381 goto out_dump;
382 }
383 }
384
385 kfree(idx);
386 return 0;
387
388out_dump:
389 ubifs_err("bad indexing node at LEB %d:%d, error %d", lnum, offs, err);
390 dbg_dump_node(c, idx);
391 kfree(idx);
392 return -EINVAL;
393}
394
395/**
396 * ubifs_load_znode - load znode to TNC cache.
397 * @c: UBIFS file-system description object
398 * @zbr: znode branch
399 * @parent: znode's parent
400 * @iip: index in parent
401 *
402 * This function loads znode pointed to by @zbr into the TNC cache and
403 * returns pointer to it in case of success and a negative error code in case
404 * of failure.
405 */
406struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c,
407 struct ubifs_zbranch *zbr,
408 struct ubifs_znode *parent, int iip)
409{
410 int err;
411 struct ubifs_znode *znode;
412
413 ubifs_assert(!zbr->znode);
414 /*
415 * A slab cache is not presently used for znodes because the znode size
416 * depends on the fanout which is stored in the superblock.
417 */
418 znode = kzalloc(c->max_znode_sz, GFP_NOFS);
419 if (!znode)
420 return ERR_PTR(-ENOMEM);
421
422 err = read_znode(c, zbr->lnum, zbr->offs, zbr->len, znode);
423 if (err)
424 goto out;
425
426 atomic_long_inc(&c->clean_zn_cnt);
427
428 /*
429 * Increment the global clean znode counter as well. It is OK that
430 * global and per-FS clean znode counters may be inconsistent for some
431 * short time (because we might be preempted at this point), the global
432 * one is only used in shrinker.
433 */
434 atomic_long_inc(&ubifs_clean_zn_cnt);
435
436 zbr->znode = znode;
437 znode->parent = parent;
438 znode->time = get_seconds();
439 znode->iip = iip;
440
441 return znode;
442
443out:
444 kfree(znode);
445 return ERR_PTR(err);
446}
447
448/**
449 * ubifs_tnc_read_node - read a leaf node from the flash media.
450 * @c: UBIFS file-system description object
451 * @zbr: key and position of the node
452 * @node: node is returned here
453 *
454 * This function reads a node defined by @zbr from the flash media. Returns
455 * zero in case of success or a negative negative error code in case of
456 * failure.
457 */
458int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
459 void *node)
460{
461 union ubifs_key key1, *key = &zbr->key;
462 int err, type = key_type(c, key);
463 struct ubifs_wbuf *wbuf;
464
465 /*
466 * 'zbr' has to point to on-flash node. The node may sit in a bud and
467 * may even be in a write buffer, so we have to take care about this.
468 */
469 wbuf = ubifs_get_wbuf(c, zbr->lnum);
470 if (wbuf)
471 err = ubifs_read_node_wbuf(wbuf, node, type, zbr->len,
472 zbr->lnum, zbr->offs);
473 else
474 err = ubifs_read_node(c, node, type, zbr->len, zbr->lnum,
475 zbr->offs);
476
477 if (err) {
478 dbg_tnc("key %s", DBGKEY(key));
479 return err;
480 }
481
482 /* Make sure the key of the read node is correct */
483 key_read(c, key, &key1);
484 if (memcmp(node + UBIFS_KEY_OFFSET, &key1, c->key_len)) {
485 ubifs_err("bad key in node at LEB %d:%d",
486 zbr->lnum, zbr->offs);
487 dbg_tnc("looked for key %s found node's key %s",
488 DBGKEY(key), DBGKEY1(&key1));
489 dbg_dump_node(c, node);
490 return -EINVAL;
491 }
492
493 return 0;
494}
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
new file mode 100644
index 000000000000..0cc7da9bed47
--- /dev/null
+++ b/fs/ubifs/ubifs-media.h
@@ -0,0 +1,745 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/*
24 * This file describes UBIFS on-flash format and contains definitions of all the
25 * relevant data structures and constants.
26 *
27 * All UBIFS on-flash objects are stored in the form of nodes. All nodes start
28 * with the UBIFS node magic number and have the same common header. Nodes
29 * always sit at 8-byte aligned positions on the media and node header sizes are
30 * also 8-byte aligned (except for the indexing node and the padding node).
31 */
32
33#ifndef __UBIFS_MEDIA_H__
34#define __UBIFS_MEDIA_H__
35
/* Magic number of every UBIFS node (its first and last bytes must differ from
 * the padding byte) */
#define UBIFS_NODE_MAGIC	0x06101831

/* Version of the UBIFS on-flash format */
#define UBIFS_FORMAT_VERSION	4

/* Smallest supported logical eraseblock size, in bytes */
#define UBIFS_MIN_LEB_SZ	(15 * 1024)

/* Starting value for CRC32 checksum calculations */
#define UBIFS_CRC32_INIT	0xFFFFFFFFU

/*
 * Data shorter than this many bytes is not even considered for compression.
 */
#define UBIFS_MIN_COMPR_LEN	128

/* Inode number of the root inode */
#define UBIFS_ROOT_INO		1

/* First inode number available to regular inodes (lower numbers are for
 * UBIFS-only internal ones) */
#define UBIFS_FIRST_INO		64

/*
 * Longest file name and extended attribute name (has to be a multiple of 8,
 * minus 1).
 */
#define UBIFS_MAX_NLEN		255

/* Largest possible count of data journal heads */
#define UBIFS_MAX_JHEADS	1

/*
 * UBIFS data block size. UBIFS is not a block oriented file-system: it does
 * not view the underlying media as a sequence of blocks the way it is done
 * for hard drives, so do not be confused. A UBIFS block is merely the largest
 * amount of data one data node can carry or which can be attached to an inode
 * node.
 */
#define UBIFS_BLOCK_SIZE	4096
#define UBIFS_BLOCK_SHIFT	12
#define UBIFS_BLOCK_MASK	(UBIFS_BLOCK_SIZE - 1)

/* Byte pattern used for padding (it must not be the first or the last byte of
 * the node magic) */
#define UBIFS_PADDING_BYTE	0xCE

/* Longest possible key, in bytes */
#define UBIFS_MAX_KEY_LEN	16

/* Length of a key in the "simple" format */
#define UBIFS_SK_LEN		8

/* Smallest allowed fanout of the index tree */
#define UBIFS_MIN_FANOUT	2

/* Largest number of levels the UBIFS indexing B-tree may have */
#define UBIFS_MAX_LEVELS	512

/* Largest amount of data, in bytes, which may be attached to an inode */
#define UBIFS_MAX_INO_DATA	UBIFS_BLOCK_SIZE

/* Fanout of the LEB Properties Tree (a power of 2) and the matching shift */
#define UBIFS_LPT_FANOUT_SHIFT	2
#define UBIFS_LPT_FANOUT	(1 << UBIFS_LPT_FANOUT_SHIFT)

/* Sizes of the LEB Properties Tree bit fields */
#define UBIFS_LPT_CRC_BITS	16
#define UBIFS_LPT_CRC_BYTES	(UBIFS_LPT_CRC_BITS / 8)
#define UBIFS_LPT_TYPE_BITS	4

/* Every keyed node has its key at this same offset */
#define UBIFS_KEY_OFFSET	offsetof(struct ubifs_ino_node, key)
109
110/*
111 * LEB Properties Tree node types.
112 *
113 * UBIFS_LPT_PNODE: LPT leaf node (contains LEB properties)
114 * UBIFS_LPT_NNODE: LPT internal node
115 * UBIFS_LPT_LTAB: LPT's own lprops table
116 * UBIFS_LPT_LSAVE: LPT's save table (big model only)
117 * UBIFS_LPT_NODE_CNT: count of LPT node types
118 * UBIFS_LPT_NOT_A_NODE: all ones (15 for 4 bits) is never a valid node type
119 */
120enum {
121 UBIFS_LPT_PNODE,
122 UBIFS_LPT_NNODE,
123 UBIFS_LPT_LTAB,
124 UBIFS_LPT_LSAVE,
125 UBIFS_LPT_NODE_CNT,
126 UBIFS_LPT_NOT_A_NODE = (1 << UBIFS_LPT_TYPE_BITS) - 1,
127};
128
/*
 * Types of UBIFS inodes.
 *
 * UBIFS_ITYPE_REG: regular file
 * UBIFS_ITYPE_DIR: directory
 * UBIFS_ITYPE_LNK: soft link
 * UBIFS_ITYPE_BLK: block device node
 * UBIFS_ITYPE_CHR: character device node
 * UBIFS_ITYPE_FIFO: fifo
 * UBIFS_ITYPE_SOCK: socket
 * UBIFS_ITYPES_CNT: how many file types are supported
 */
enum {
	UBIFS_ITYPE_REG		= 0,
	UBIFS_ITYPE_DIR		= 1,
	UBIFS_ITYPE_LNK		= 2,
	UBIFS_ITYPE_BLK		= 3,
	UBIFS_ITYPE_CHR		= 4,
	UBIFS_ITYPE_FIFO	= 5,
	UBIFS_ITYPE_SOCK	= 6,
	UBIFS_ITYPES_CNT	= 7,
};
151
/*
 * Hash functions which may be used for keys.
 *
 * UBIFS_KEY_HASH_R5: R5 hash
 * UBIFS_KEY_HASH_TEST: test hash, simply the first 4 bytes of the name
 */
enum {
	UBIFS_KEY_HASH_R5	= 0,
	UBIFS_KEY_HASH_TEST	= 1,
};
162
/*
 * Key formats which are supported.
 *
 * UBIFS_SIMPLE_KEY_FMT: simple key format
 */
enum {
	UBIFS_SIMPLE_KEY_FMT	= 0,
};
171
/*
 * In the simple key format 29 bits hold the UBIFS block number or the hash
 * value; the masks below select exactly those bits.
 */
#define UBIFS_S_KEY_BLOCK_BITS	29
#define UBIFS_S_KEY_BLOCK_MASK	((1 << UBIFS_S_KEY_BLOCK_BITS) - 1)
#define UBIFS_S_KEY_HASH_BITS	UBIFS_S_KEY_BLOCK_BITS
#define UBIFS_S_KEY_HASH_MASK	UBIFS_S_KEY_BLOCK_MASK
180
/*
 * Types of keys.
 *
 * UBIFS_INO_KEY: key of an inode node
 * UBIFS_DATA_KEY: key of a data node
 * UBIFS_DENT_KEY: key of a directory entry node
 * UBIFS_XENT_KEY: key of an extended attribute entry
 * UBIFS_KEY_TYPES_CNT: how many key types are supported
 */
enum {
	UBIFS_INO_KEY		= 0,
	UBIFS_DATA_KEY		= 1,
	UBIFS_DENT_KEY		= 2,
	UBIFS_XENT_KEY		= 3,
	UBIFS_KEY_TYPES_CNT	= 4,
};
197
/* How many LEBs are reserved for the superblock area */
#define UBIFS_SB_LEBS	1
/* How many LEBs are reserved for the master area */
#define UBIFS_MST_LEBS	2

/* The superblock area starts at this LEB */
#define UBIFS_SB_LNUM	0
/* The master area starts at this LEB */
#define UBIFS_MST_LNUM	(UBIFS_SB_LNUM + UBIFS_SB_LEBS)
/* The log area starts at this LEB */
#define UBIFS_LOG_LNUM	(UBIFS_MST_LNUM + UBIFS_MST_LEBS)

/*
 * The constants below are the absolute minimum sizes of the various UBIFS
 * media areas. Many of them actually depend on the flash geometry and on the
 * FS configuration (number of journal heads, orphan LEBs, etc), so the
 * smallest volume size usable for UBIFS cannot be pre-defined by these
 * constants, and a file-system which satisfies them will not necessarily
 * mount. UBIFS calculates and validates the FS size at run time.
 */

/* Smallest number of logical eraseblocks in the log */
#define UBIFS_MIN_LOG_LEBS	2
/* Smallest number of bud logical eraseblocks (one for each head) */
#define UBIFS_MIN_BUD_LEBS	3
/* Smallest number of journal logical eraseblocks */
#define UBIFS_MIN_JNL_LEBS	(UBIFS_MIN_LOG_LEBS + UBIFS_MIN_BUD_LEBS)
/* Smallest number of LPT area logical eraseblocks */
#define UBIFS_MIN_LPT_LEBS	2
/* Smallest number of orphan area logical eraseblocks */
#define UBIFS_MIN_ORPH_LEBS	1
/*
 * Smallest number of main area logical eraseblocks (the buds, 2 for the
 * index, 1 for GC, 1 for deletions, and at least 1 for committed data).
 */
#define UBIFS_MIN_MAIN_LEBS	(UBIFS_MIN_BUD_LEBS + 5)

/* Smallest number of logical eraseblocks altogether */
#define UBIFS_MIN_LEB_CNT	(UBIFS_SB_LEBS + UBIFS_MST_LEBS + \
				 UBIFS_MIN_LOG_LEBS + UBIFS_MIN_LPT_LEBS + \
				 UBIFS_MIN_ORPH_LEBS + UBIFS_MIN_MAIN_LEBS)

/* Sizes of nodes (N.B. these are guaranteed to be multiples of 8) */
#define UBIFS_CH_SZ		sizeof(struct ubifs_ch)
#define UBIFS_INO_NODE_SZ	sizeof(struct ubifs_ino_node)
#define UBIFS_DATA_NODE_SZ	sizeof(struct ubifs_data_node)
#define UBIFS_DENT_NODE_SZ	sizeof(struct ubifs_dent_node)
#define UBIFS_TRUN_NODE_SZ	sizeof(struct ubifs_trun_node)
#define UBIFS_PAD_NODE_SZ	sizeof(struct ubifs_pad_node)
#define UBIFS_SB_NODE_SZ	sizeof(struct ubifs_sb_node)
#define UBIFS_MST_NODE_SZ	sizeof(struct ubifs_mst_node)
#define UBIFS_REF_NODE_SZ	sizeof(struct ubifs_ref_node)
#define UBIFS_IDX_NODE_SZ	sizeof(struct ubifs_idx_node)
#define UBIFS_CS_NODE_SZ	sizeof(struct ubifs_cs_node)
#define UBIFS_ORPH_NODE_SZ	sizeof(struct ubifs_orph_node)
/* An extended attribute entry node is the same as a directory entry node */
#define UBIFS_XENT_NODE_SZ	UBIFS_DENT_NODE_SZ
/* This is the only size which need not be a multiple of 8 bytes */
#define UBIFS_BRANCH_SZ		sizeof(struct ubifs_branch)

/* Largest sizes of nodes (N.B. these are guaranteed to be multiples of 8) */
#define UBIFS_MAX_DATA_NODE_SZ	(UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE)
#define UBIFS_MAX_INO_NODE_SZ	(UBIFS_INO_NODE_SZ + UBIFS_MAX_INO_DATA)
#define UBIFS_MAX_DENT_NODE_SZ	(UBIFS_DENT_NODE_SZ + UBIFS_MAX_NLEN + 1)
#define UBIFS_MAX_XENT_NODE_SZ	UBIFS_MAX_DENT_NODE_SZ

/* The biggest node UBIFS has */
#define UBIFS_MAX_NODE_SZ	UBIFS_MAX_INO_NODE_SZ
267
/*
 * Inode flags as stored on the flash.
 *
 * UBIFS_COMPR_FL: this inode is to be compressed
 * UBIFS_SYNC_FL: I/O on this inode has to be synchronous
 * UBIFS_IMMUTABLE_FL: the inode is immutable
 * UBIFS_APPEND_FL: data may only be appended to the inode
 * UBIFS_DIRSYNC_FL: I/O on this directory inode has to be synchronous
 * UBIFS_XATTR_FL: the inode is the inode of an extended attribute value
 *
 * Note, these on-flash flags correspond to the ioctl flags (@FS_COMPR_FL,
 * etc). Right now the values are the same, but in general they do not have to
 * be.
 */
enum {
	UBIFS_COMPR_FL		= 1 << 0,
	UBIFS_SYNC_FL		= 1 << 1,
	UBIFS_IMMUTABLE_FL	= 1 << 2,
	UBIFS_APPEND_FL		= 1 << 3,
	UBIFS_DIRSYNC_FL	= 1 << 4,
	UBIFS_XATTR_FL		= 1 << 5,
};

/* Inode flag bits used by UBIFS (bits 0-4, so %UBIFS_XATTR_FL is not in it) */
#define UBIFS_FL_MASK	0x1F
293
/*
 * Compression algorithms of UBIFS.
 *
 * UBIFS_COMPR_NONE: data is not compressed
 * UBIFS_COMPR_LZO: LZO compression
 * UBIFS_COMPR_ZLIB: ZLIB compression
 * UBIFS_COMPR_TYPES_CNT: how many compression types are supported
 */
enum {
	UBIFS_COMPR_NONE	= 0,
	UBIFS_COMPR_LZO		= 1,
	UBIFS_COMPR_ZLIB	= 2,
	UBIFS_COMPR_TYPES_CNT	= 3,
};
308
/*
 * Types of UBIFS nodes.
 *
 * UBIFS_INO_NODE: inode node
 * UBIFS_DATA_NODE: data node
 * UBIFS_DENT_NODE: directory entry node
 * UBIFS_XENT_NODE: extended attribute node
 * UBIFS_TRUN_NODE: truncation node
 * UBIFS_PAD_NODE: padding node
 * UBIFS_SB_NODE: superblock node
 * UBIFS_MST_NODE: master node
 * UBIFS_REF_NODE: LEB reference node
 * UBIFS_IDX_NODE: index node
 * UBIFS_CS_NODE: commit start node
 * UBIFS_ORPH_NODE: orphan node
 * UBIFS_NODE_TYPES_CNT: how many node types are supported
 *
 * Note, arrays are indexed by these numbers, so they have to stay low and
 * contiguous. The node type constants for inodes, direntries and so on must
 * be equal to the corresponding key type constants.
 */
enum {
	UBIFS_INO_NODE		= 0,
	UBIFS_DATA_NODE		= 1,
	UBIFS_DENT_NODE		= 2,
	UBIFS_XENT_NODE		= 3,
	UBIFS_TRUN_NODE		= 4,
	UBIFS_PAD_NODE		= 5,
	UBIFS_SB_NODE		= 6,
	UBIFS_MST_NODE		= 7,
	UBIFS_REF_NODE		= 8,
	UBIFS_IDX_NODE		= 9,
	UBIFS_CS_NODE		= 10,
	UBIFS_ORPH_NODE		= 11,
	UBIFS_NODE_TYPES_CNT	= 12,
};
345
/*
 * Flags of the master node.
 *
 * UBIFS_MST_DIRTY: rebooted uncleanly - master node is dirty
 * UBIFS_MST_NO_ORPHS: there are no orphan inodes
 * UBIFS_MST_RCVRY: the master node was written by recovery
 */
enum {
	UBIFS_MST_DIRTY		= 0x1,
	UBIFS_MST_NO_ORPHS	= 0x2,
	UBIFS_MST_RCVRY		= 0x4,
};
358
/*
 * Node group types (recovery uses them to recover either a whole group or
 * nothing of it).
 *
 * UBIFS_NO_NODE_GROUP: the node does not belong to a group
 * UBIFS_IN_NODE_GROUP: the node belongs to a group
 * UBIFS_LAST_OF_NODE_GROUP: the node is the last one of a group
 */
enum {
	UBIFS_NO_NODE_GROUP		= 0,
	UBIFS_IN_NODE_GROUP		= 1,
	UBIFS_LAST_OF_NODE_GROUP	= 2,
};
371
/*
 * Flags of the superblock.
 *
 * UBIFS_FLG_BIGLPT: when set, the "big" LPT model is used
 */
enum {
	UBIFS_FLG_BIGLPT	= 0x2,
};
380
381/**
382 * struct ubifs_ch - common header node.
383 * @magic: UBIFS node magic number (%UBIFS_NODE_MAGIC)
384 * @crc: CRC-32 checksum of the node header
385 * @sqnum: sequence number
386 * @len: full node length
387 * @node_type: node type
388 * @group_type: node group type
389 * @padding: reserved for future, zeroes
390 *
391 * Every UBIFS node starts with this common part. If the node has a key, the
392 * key always goes next.
393 */
394struct ubifs_ch {
395 __le32 magic;
396 __le32 crc;
397 __le64 sqnum;
398 __le32 len;
399 __u8 node_type;
400 __u8 group_type;
401 __u8 padding[2];
402} __attribute__ ((packed));
403
404/**
405 * union ubifs_dev_desc - device node descriptor.
406 * @new: new type device descriptor
407 * @huge: huge type device descriptor
408 *
409 * This data structure describes major/minor numbers of a device node. In an
410 * inode is a device node then its data contains an object of this type. UBIFS
411 * uses standard Linux "new" and "huge" device node encodings.
412 */
413union ubifs_dev_desc {
414 __le32 new;
415 __le64 huge;
416} __attribute__ ((packed));
417
418/**
419 * struct ubifs_ino_node - inode node.
420 * @ch: common header
421 * @key: node key
422 * @creat_sqnum: sequence number at time of creation
423 * @size: inode size in bytes (amount of uncompressed data)
424 * @atime_sec: access time seconds
425 * @ctime_sec: creation time seconds
426 * @mtime_sec: modification time seconds
427 * @atime_nsec: access time nanoseconds
428 * @ctime_nsec: creation time nanoseconds
429 * @mtime_nsec: modification time nanoseconds
430 * @nlink: number of hard links
431 * @uid: owner ID
432 * @gid: group ID
433 * @mode: access flags
434 * @flags: per-inode flags (%UBIFS_COMPR_FL, %UBIFS_SYNC_FL, etc)
435 * @data_len: inode data length
436 * @xattr_cnt: count of extended attributes this inode has
437 * @xattr_size: summarized size of all extended attributes in bytes
438 * @padding1: reserved for future, zeroes
439 * @xattr_names: sum of lengths of all extended attribute names belonging to
440 * this inode
441 * @compr_type: compression type used for this inode
442 * @padding2: reserved for future, zeroes
443 * @data: data attached to the inode
444 *
445 * Note, even though inode compression type is defined by @compr_type, some
446 * nodes of this inode may be compressed with different compressor - this
447 * happens if compression type is changed while the inode already has data
448 * nodes. But @compr_type will be use for further writes to the inode.
449 *
450 * Note, do not forget to amend 'zero_ino_node_unused()' function when changing
451 * the padding fields.
452 */
453struct ubifs_ino_node {
454 struct ubifs_ch ch;
455 __u8 key[UBIFS_MAX_KEY_LEN];
456 __le64 creat_sqnum;
457 __le64 size;
458 __le64 atime_sec;
459 __le64 ctime_sec;
460 __le64 mtime_sec;
461 __le32 atime_nsec;
462 __le32 ctime_nsec;
463 __le32 mtime_nsec;
464 __le32 nlink;
465 __le32 uid;
466 __le32 gid;
467 __le32 mode;
468 __le32 flags;
469 __le32 data_len;
470 __le32 xattr_cnt;
471 __le32 xattr_size;
472 __u8 padding1[4]; /* Watch 'zero_ino_node_unused()' if changing! */
473 __le32 xattr_names;
474 __le16 compr_type;
475 __u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */
476 __u8 data[];
477} __attribute__ ((packed));
478
479/**
480 * struct ubifs_dent_node - directory entry node.
481 * @ch: common header
482 * @key: node key
483 * @inum: target inode number
484 * @padding1: reserved for future, zeroes
485 * @type: type of the target inode (%UBIFS_ITYPE_REG, %UBIFS_ITYPE_DIR, etc)
486 * @nlen: name length
487 * @padding2: reserved for future, zeroes
488 * @name: zero-terminated name
489 *
490 * Note, do not forget to amend 'zero_dent_node_unused()' function when
491 * changing the padding fields.
492 */
493struct ubifs_dent_node {
494 struct ubifs_ch ch;
495 __u8 key[UBIFS_MAX_KEY_LEN];
496 __le64 inum;
497 __u8 padding1;
498 __u8 type;
499 __le16 nlen;
500 __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */
501 __u8 name[];
502} __attribute__ ((packed));
503
504/**
505 * struct ubifs_data_node - data node.
506 * @ch: common header
507 * @key: node key
508 * @size: uncompressed data size in bytes
509 * @compr_type: compression type (%UBIFS_COMPR_NONE, %UBIFS_COMPR_LZO, etc)
510 * @padding: reserved for future, zeroes
511 * @data: data
512 *
513 * Note, do not forget to amend 'zero_data_node_unused()' function when
514 * changing the padding fields.
515 */
516struct ubifs_data_node {
517 struct ubifs_ch ch;
518 __u8 key[UBIFS_MAX_KEY_LEN];
519 __le32 size;
520 __le16 compr_type;
521 __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */
522 __u8 data[];
523} __attribute__ ((packed));
524
525/**
526 * struct ubifs_trun_node - truncation node.
527 * @ch: common header
528 * @inum: truncated inode number
529 * @padding: reserved for future, zeroes
530 * @old_size: size before truncation
531 * @new_size: size after truncation
532 *
533 * This node exists only in the journal and never goes to the main area. Note,
534 * do not forget to amend 'zero_trun_node_unused()' function when changing the
535 * padding fields.
536 */
537struct ubifs_trun_node {
538 struct ubifs_ch ch;
539 __le32 inum;
540 __u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! */
541 __le64 old_size;
542 __le64 new_size;
543} __attribute__ ((packed));
544
545/**
546 * struct ubifs_pad_node - padding node.
547 * @ch: common header
548 * @pad_len: how many bytes after this node are unused (because padded)
550 */
551struct ubifs_pad_node {
552 struct ubifs_ch ch;
553 __le32 pad_len;
554} __attribute__ ((packed));
555
556/**
557 * struct ubifs_sb_node - superblock node.
558 * @ch: common header
559 * @padding: reserved for future, zeroes
560 * @key_hash: type of hash function used in keys
561 * @key_fmt: format of the key
562 * @flags: file-system flags (%UBIFS_FLG_BIGLPT, etc)
563 * @min_io_size: minimal input/output unit size
564 * @leb_size: logical eraseblock size in bytes
565 * @leb_cnt: count of LEBs used by file-system
566 * @max_leb_cnt: maximum count of LEBs used by file-system
567 * @max_bud_bytes: maximum amount of data stored in buds
568 * @log_lebs: log size in logical eraseblocks
569 * @lpt_lebs: number of LEBs used for lprops table
570 * @orph_lebs: number of LEBs used for recording orphans
571 * @jhead_cnt: count of journal heads
572 * @fanout: tree fanout (max. number of links per indexing node)
573 * @lsave_cnt: number of LEB numbers in LPT's save table
574 * @fmt_version: UBIFS on-flash format version
575 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
576 * @padding1: reserved for future, zeroes
577 * @rp_uid: reserve pool UID
578 * @rp_gid: reserve pool GID
579 * @rp_size: size of the reserved pool in bytes
580 * @padding2: reserved for future, zeroes
581 * @time_gran: time granularity in nanoseconds
582 * @uuid: UUID generated when the file system image was created
583 */
584struct ubifs_sb_node {
585 struct ubifs_ch ch;
586 __u8 padding[2];
587 __u8 key_hash;
588 __u8 key_fmt;
589 __le32 flags;
590 __le32 min_io_size;
591 __le32 leb_size;
592 __le32 leb_cnt;
593 __le32 max_leb_cnt;
594 __le64 max_bud_bytes;
595 __le32 log_lebs;
596 __le32 lpt_lebs;
597 __le32 orph_lebs;
598 __le32 jhead_cnt;
599 __le32 fanout;
600 __le32 lsave_cnt;
601 __le32 fmt_version;
602 __le16 default_compr;
603 __u8 padding1[2];
604 __le32 rp_uid;
605 __le32 rp_gid;
606 __le64 rp_size;
607 __le32 time_gran;
608 __u8 uuid[16];
609 __u8 padding2[3972];
610} __attribute__ ((packed));
611
612/**
613 * struct ubifs_mst_node - master node.
614 * @ch: common header
615 * @highest_inum: highest inode number in the committed index
616 * @cmt_no: commit number
617 * @flags: various flags (%UBIFS_MST_DIRTY, etc)
618 * @log_lnum: start of the log
619 * @root_lnum: LEB number of the root indexing node
620 * @root_offs: offset within @root_lnum
621 * @root_len: root indexing node length
622 * @gc_lnum: LEB reserved for garbage collection (%-1 value means the LEB was
623 * not reserved and should be reserved on mount)
624 * @ihead_lnum: LEB number of index head
625 * @ihead_offs: offset of index head
626 * @index_size: size of index on flash
627 * @total_free: total free space in bytes
628 * @total_dirty: total dirty space in bytes
629 * @total_used: total used space in bytes (includes only data LEBs)
630 * @total_dead: total dead space in bytes (includes only data LEBs)
631 * @total_dark: total dark space in bytes (includes only data LEBs)
632 * @lpt_lnum: LEB number of LPT root nnode
633 * @lpt_offs: offset of LPT root nnode
634 * @nhead_lnum: LEB number of LPT head
635 * @nhead_offs: offset of LPT head
636 * @ltab_lnum: LEB number of LPT's own lprops table
637 * @ltab_offs: offset of LPT's own lprops table
638 * @lsave_lnum: LEB number of LPT's save table (big model only)
639 * @lsave_offs: offset of LPT's save table (big model only)
640 * @lscan_lnum: LEB number of last LPT scan
641 * @empty_lebs: number of empty logical eraseblocks
642 * @idx_lebs: number of indexing logical eraseblocks
643 * @leb_cnt: count of LEBs used by file-system
644 * @padding: reserved for future, zeroes
645 */
646struct ubifs_mst_node {
647 struct ubifs_ch ch;
648 __le64 highest_inum;
649 __le64 cmt_no;
650 __le32 flags;
651 __le32 log_lnum;
652 __le32 root_lnum;
653 __le32 root_offs;
654 __le32 root_len;
655 __le32 gc_lnum;
656 __le32 ihead_lnum;
657 __le32 ihead_offs;
658 __le64 index_size;
659 __le64 total_free;
660 __le64 total_dirty;
661 __le64 total_used;
662 __le64 total_dead;
663 __le64 total_dark;
664 __le32 lpt_lnum;
665 __le32 lpt_offs;
666 __le32 nhead_lnum;
667 __le32 nhead_offs;
668 __le32 ltab_lnum;
669 __le32 ltab_offs;
670 __le32 lsave_lnum;
671 __le32 lsave_offs;
672 __le32 lscan_lnum;
673 __le32 empty_lebs;
674 __le32 idx_lebs;
675 __le32 leb_cnt;
676 __u8 padding[344];
677} __attribute__ ((packed));
678
679/**
680 * struct ubifs_ref_node - logical eraseblock reference node.
681 * @ch: common header
682 * @lnum: the referred logical eraseblock number
683 * @offs: start offset in the referred LEB
684 * @jhead: journal head number
685 * @padding: reserved for future, zeroes
686 */
687struct ubifs_ref_node {
688 struct ubifs_ch ch;
689 __le32 lnum;
690 __le32 offs;
691 __le32 jhead;
692 __u8 padding[28];
693} __attribute__ ((packed));
694
695/**
696 * struct ubifs_branch - key/reference/length branch
697 * @lnum: LEB number of the target node
698 * @offs: offset within @lnum
699 * @len: target node length
700 * @key: key
701 */
702struct ubifs_branch {
703 __le32 lnum;
704 __le32 offs;
705 __le32 len;
706 __u8 key[];
707} __attribute__ ((packed));
708
709/**
710 * struct ubifs_idx_node - indexing node.
711 * @ch: common header
712 * @child_cnt: number of child index nodes
713 * @level: tree level
714 * @branches: LEB number / offset / length / key branches
715 */
716struct ubifs_idx_node {
717 struct ubifs_ch ch;
718 __le16 child_cnt;
719 __le16 level;
720 __u8 branches[];
721} __attribute__ ((packed));
722
723/**
724 * struct ubifs_cs_node - commit start node.
725 * @ch: common header
726 * @cmt_no: commit number
727 */
728struct ubifs_cs_node {
729 struct ubifs_ch ch;
730 __le64 cmt_no;
731} __attribute__ ((packed));
732
733/**
734 * struct ubifs_orph_node - orphan node.
735 * @ch: common header
736 * @cmt_no: commit number (also top bit is set on the last node of the commit)
737 * @inos: inode numbers of orphans
738 */
739struct ubifs_orph_node {
740 struct ubifs_ch ch;
741 __le64 cmt_no;
742 __le64 inos[];
743} __attribute__ ((packed));
744
745#endif /* __UBIFS_MEDIA_H__ */
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
new file mode 100644
index 000000000000..e4f89f271827
--- /dev/null
+++ b/fs/ubifs/ubifs.h
@@ -0,0 +1,1649 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/* Implementation version 0.7 */
24
25#ifndef __UBIFS_H__
26#define __UBIFS_H__
27
28#include <asm/div64.h>
29#include <linux/statfs.h>
30#include <linux/fs.h>
31#include <linux/err.h>
32#include <linux/sched.h>
33#include <linux/vmalloc.h>
34#include <linux/spinlock.h>
35#include <linux/mutex.h>
36#include <linux/rwsem.h>
37#include <linux/mtd/ubi.h>
38#include <linux/pagemap.h>
39#include <linux/backing-dev.h>
40#include "ubifs-media.h"
41
42/* Version of this UBIFS implementation */
43#define UBIFS_VERSION 1
44
45/* Normal UBIFS messages */
46#define ubifs_msg(fmt, ...) \
47 printk(KERN_NOTICE "UBIFS: " fmt "\n", ##__VA_ARGS__)
48/* UBIFS error messages */
49#define ubifs_err(fmt, ...) \
50 printk(KERN_ERR "UBIFS error (pid %d): %s: " fmt "\n", current->pid, \
51 __func__, ##__VA_ARGS__)
52/* UBIFS warning messages */
53#define ubifs_warn(fmt, ...) \
54 printk(KERN_WARNING "UBIFS warning (pid %d): %s: " fmt "\n", \
55 current->pid, __func__, ##__VA_ARGS__)
56
57/* UBIFS file system VFS magic number */
58#define UBIFS_SUPER_MAGIC 0x24051905
59
60/* Number of UBIFS blocks per VFS page */
61#define UBIFS_BLOCKS_PER_PAGE (PAGE_CACHE_SIZE / UBIFS_BLOCK_SIZE)
62#define UBIFS_BLOCKS_PER_PAGE_SHIFT (PAGE_CACHE_SHIFT - UBIFS_BLOCK_SHIFT)
63
64/* "File system end of life" sequence number watermark */
65#define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL
66#define SQNUM_WATERMARK 0xFFFFFFFFFF000000ULL
67
68/* Minimum amount of data UBIFS writes to the flash */
69#define MIN_WRITE_SZ (UBIFS_DATA_NODE_SZ + 8)
70
71/*
72 * Currently we do not support inode number overlapping and re-using, so this
73 * watermark defines dangerous inode number level. This should be fixed later,
74 * although it is difficult to exceed current limit. Another option is to use
75 * 64-bit inode numbers, but this means more overhead.
76 */
77#define INUM_WARN_WATERMARK 0xFFF00000
78#define INUM_WATERMARK 0xFFFFFF00
79
80/* Largest key size supported in this implementation */
81#define CUR_MAX_KEY_LEN UBIFS_SK_LEN
82
83/* Maximum number of entries in each LPT (LEB category) heap */
84#define LPT_HEAP_SZ 256
85
86/*
87 * Background thread name pattern. The numbers are UBI device and volume
88 * numbers.
89 */
90#define BGT_NAME_PATTERN "ubifs_bgt%d_%d"
91
92/* Default write-buffer synchronization timeout (5 secs) */
93#define DEFAULT_WBUF_TIMEOUT (5 * HZ)
94
95/* Maximum possible inode number (only 32-bit inodes are supported now) */
96#define MAX_INUM 0xFFFFFFFF
97
98/* Number of non-data journal heads */
99#define NONDATA_JHEADS_CNT 2
100
101/* Garbage collector head */
102#define GCHD 0
103/* Base journal head number */
104#define BASEHD 1
105/* First "general purpose" journal head */
106#define DATAHD 2
107
108/* 'No change' value for 'ubifs_change_lp()' */
109#define LPROPS_NC 0x80000001
110
111/*
112 * There is no notion of truncation key because truncation nodes do not exist
113 * in TNC. However, when replaying, it is handy to introduce fake "truncation"
114 * keys for truncation nodes because the code becomes simpler. So we define
115 * %UBIFS_TRUN_KEY type.
116 */
117#define UBIFS_TRUN_KEY UBIFS_KEY_TYPES_CNT
118
119/*
120 * How much a directory entry/extended attribute entry adds to the parent/host
121 * inode.
122 */
123#define CALC_DENT_SIZE(name_len) ALIGN(UBIFS_DENT_NODE_SZ + (name_len) + 1, 8)
124
125/* How much an extended attribute adds to the host inode */
126#define CALC_XATTR_BYTES(data_len) ALIGN(UBIFS_INO_NODE_SZ + (data_len) + 1, 8)
127
128/*
129 * Znodes which were not touched for 'OLD_ZNODE_AGE' seconds are considered
130 * "old", and znode which were touched last 'YOUNG_ZNODE_AGE' seconds ago are
131 * considered "young". This is used by shrinker when selecting znode to trim
132 * off.
133 */
134#define OLD_ZNODE_AGE 20
135#define YOUNG_ZNODE_AGE 5
136
137/*
138 * Some compressors, like LZO, may end up with more data than the input buffer.
139 * So UBIFS always allocates larger output buffer, to be sure the compressor
140 * will not corrupt memory in case of worst case compression.
141 */
142#define WORST_COMPR_FACTOR 2
143
144/* Maximum expected tree height for use by bottom_up_buf */
145#define BOTTOM_UP_HEIGHT 64
146
147/*
148 * Lockdep classes for UBIFS inode @ui_mutex.
149 */
150enum {
151 WB_MUTEX_1 = 0,
152 WB_MUTEX_2 = 1,
153 WB_MUTEX_3 = 2,
154};
155
156/*
157 * Znode flags (actually, bit numbers which store the flags).
158 *
159 * DIRTY_ZNODE: znode is dirty
160 * COW_ZNODE: znode is being committed and a new instance of this znode has to
161 * be created before changing this znode
162 * OBSOLETE_ZNODE: znode is obsolete, which means it was deleted, but it is
163 * still in the commit list and the ongoing commit operation
164 * will commit it, and delete this znode after it is done
165 */
166enum {
167 DIRTY_ZNODE = 0,
168 COW_ZNODE = 1,
169 OBSOLETE_ZNODE = 2,
170};
171
172/*
173 * Commit states.
174 *
175 * COMMIT_RESTING: commit is not wanted
176 * COMMIT_BACKGROUND: background commit has been requested
177 * COMMIT_REQUIRED: commit is required
178 * COMMIT_RUNNING_BACKGROUND: background commit is running
179 * COMMIT_RUNNING_REQUIRED: commit is running and it is required
180 * COMMIT_BROKEN: commit failed
181 */
182enum {
183 COMMIT_RESTING = 0,
184 COMMIT_BACKGROUND,
185 COMMIT_REQUIRED,
186 COMMIT_RUNNING_BACKGROUND,
187 COMMIT_RUNNING_REQUIRED,
188 COMMIT_BROKEN,
189};
190
191/*
192 * 'ubifs_scan_a_node()' return values.
193 *
194 * SCANNED_GARBAGE: scanned garbage
195 * SCANNED_EMPTY_SPACE: scanned empty space
196 * SCANNED_A_NODE: scanned a valid node
197 * SCANNED_A_CORRUPT_NODE: scanned a corrupted node
198 * SCANNED_A_BAD_PAD_NODE: scanned a padding node with invalid pad length
199 *
200 * Greater than zero means: 'scanned that number of padding bytes'
201 */
202enum {
203 SCANNED_GARBAGE = 0,
204 SCANNED_EMPTY_SPACE = -1,
205 SCANNED_A_NODE = -2,
206 SCANNED_A_CORRUPT_NODE = -3,
207 SCANNED_A_BAD_PAD_NODE = -4,
208};
209
210/*
211 * LPT cnode flag bits.
212 *
213 * DIRTY_CNODE: cnode is dirty
214 * COW_CNODE: cnode is being committed and must be copied before writing
215 * OBSOLETE_CNODE: cnode is being committed and has been copied (or deleted),
216 * so it can (and must) be freed when the commit is finished
217 */
218enum {
219 DIRTY_CNODE = 0,
220 COW_CNODE = 1,
221 OBSOLETE_CNODE = 2,
222};
223
224/*
225 * Dirty flag bits (lpt_drty_flgs) for LPT special nodes.
226 *
227 * LTAB_DIRTY: ltab node is dirty
228 * LSAVE_DIRTY: lsave node is dirty
229 */
230enum {
231 LTAB_DIRTY = 1,
232 LSAVE_DIRTY = 2,
233};
234
235/*
236 * Return codes used by the garbage collector.
237 * @LEB_FREED: the logical eraseblock was freed and is ready to use
238 * @LEB_FREED_IDX: indexing LEB was freed and can be used only after the commit
239 * @LEB_RETAINED: the logical eraseblock was freed and retained for GC purposes
240 */
241enum {
242 LEB_FREED,
243 LEB_FREED_IDX,
244 LEB_RETAINED,
245};
246
247/**
248 * struct ubifs_old_idx - index node obsoleted since last commit start.
249 * @rb: rb-tree node
250 * @lnum: LEB number of obsoleted index node
251 * @offs: offset of obsoleted index node
252 */
253struct ubifs_old_idx {
254 struct rb_node rb;
255 int lnum;
256 int offs;
257};
258
259/* The below union makes it easier to deal with keys */
260union ubifs_key {
261 uint8_t u8[CUR_MAX_KEY_LEN];
262 uint32_t u32[CUR_MAX_KEY_LEN/4];
263 uint64_t u64[CUR_MAX_KEY_LEN/8];
264 __le32 j32[CUR_MAX_KEY_LEN/4];
265};
266
267/**
268 * struct ubifs_scan_node - UBIFS scanned node information.
269 * @list: list of scanned nodes
270 * @key: key of node scanned (if it has one)
271 * @sqnum: sequence number
272 * @type: type of node scanned
273 * @offs: offset within LEB of node scanned
274 * @len: length of node scanned
275 * @node: raw node
276 */
277struct ubifs_scan_node {
278 struct list_head list;
279 union ubifs_key key;
280 unsigned long long sqnum;
281 int type;
282 int offs;
283 int len;
284 void *node;
285};
286
287/**
288 * struct ubifs_scan_leb - UBIFS scanned LEB information.
289 * @lnum: logical eraseblock number
290 * @nodes_cnt: number of nodes scanned
291 * @nodes: list of struct ubifs_scan_node
292 * @endpt: end point (and therefore the start of empty space)
293 * @ecc: read returned -EBADMSG
294 * @buf: buffer containing entire LEB scanned
295 */
296struct ubifs_scan_leb {
297 int lnum;
298 int nodes_cnt;
299 struct list_head nodes;
300 int endpt;
301 int ecc;
302 void *buf;
303};
304
305/**
306 * struct ubifs_gced_idx_leb - garbage-collected indexing LEB.
307 * @list: list
308 * @lnum: LEB number
309 * @unmap: OK to unmap this LEB
310 *
311 * This data structure is used to temporarily store garbage-collected indexing
312 * LEBs - they are not released immediately, but only after the next commit.
313 * This is needed to guarantee recoverability.
314 */
315struct ubifs_gced_idx_leb {
316 struct list_head list;
317 int lnum;
318 int unmap;
319};
320
321/**
322 * struct ubifs_inode - UBIFS in-memory inode description.
323 * @vfs_inode: VFS inode description object
324 * @creat_sqnum: sequence number at time of creation
325 * @xattr_size: summarized size of all extended attributes in bytes
326 * @xattr_cnt: count of extended attributes this inode has
327 * @xattr_names: sum of lengths of all extended attribute names belonging to
328 * this inode
329 * @dirty: non-zero if the inode is dirty
330 * @xattr: non-zero if this is an extended attribute inode
331 * @ui_mutex: serializes inode write-back with the rest of VFS operations,
332 * serializes "clean <-> dirty" state changes, protects @dirty,
333 * @ui_size, and @xattr_size
334 * @ui_lock: protects @synced_i_size
335 * @synced_i_size: synchronized size of inode, i.e. the value of inode size
336 * currently stored on the flash; used only for regular file
337 * inodes
338 * @ui_size: inode size used by UBIFS when writing to flash
339 * @flags: inode flags (%UBIFS_COMPR_FL, etc)
340 * @compr_type: default compression type used for this inode
341 * @data_len: length of the data attached to the inode
342 * @data: inode's data
343 *
344 * @ui_mutex exists for two main reasons. First, it prevents inodes from
345 * being written back while UBIFS is changing them in the middle of a VFS
346 * operation. This way UBIFS makes sure the inode fields are consistent. For
347 * example, in 'ubifs_rename()' we change 3 inodes simultaneously, and
348 * write-back must not write any of them before we have finished.
349 *
350 * The second reason is budgeting - UBIFS has to budget all operations. If an
351 * operation is going to mark an inode dirty, it has to allocate budget for
352 * this. It cannot just mark it dirty because there is no guarantee there will
353 * be enough flash space to write the inode back later. This means UBIFS has
354 * to have full control over inode "clean <-> dirty" transitions (and pages
355 * actually). But unfortunately, VFS marks inodes dirty in many places, and it
356 * does not ask the file-system if it is allowed to do so (there is a notifier,
357 * but it is not enough), i.e., there is no mechanism to synchronize with this.
358 * So UBIFS has its own inode dirty flag and its own mutex to serialize
359 * "clean <-> dirty" transitions.
360 *
361 * The @synced_i_size field is used to make sure we never write pages which are
362 * beyond last synchronized inode size. See 'ubifs_writepage()' for more
363 * information.
364 *
365 * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses
366 * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot
367 * make sure @inode->i_size is always changed under @ui_mutex, because it
368 * cannot call 'vmtruncate()' with @ui_mutex locked, because it would deadlock
369 * with 'ubifs_writepage()' (see file.c). All the other inode fields are
370 * changed under @ui_mutex, so they do not need "shadow" fields. Note, one
371 * could consider to rework locking and base it on "shadow" fields.
372 */
373struct ubifs_inode {
374 struct inode vfs_inode;
375 unsigned long long creat_sqnum;
376 unsigned int xattr_size;
377 unsigned int xattr_cnt;
378 unsigned int xattr_names;
379 unsigned int dirty:1;
380 unsigned int xattr:1;
381 struct mutex ui_mutex;
382 spinlock_t ui_lock;
383 loff_t synced_i_size;
384 loff_t ui_size;
385 int flags;
386 int compr_type;
387 int data_len;
388 void *data;
389};
390
391/**
392 * struct ubifs_unclean_leb - records a LEB recovered under read-only mode.
393 * @list: list
394 * @lnum: LEB number of recovered LEB
395 * @endpt: offset where recovery ended
396 *
397 * This structure records a LEB identified during recovery that needs to be
398 * cleaned but was not because UBIFS was mounted read-only. The information
399 * is used to clean the LEB when remounting to read-write mode.
400 */
401struct ubifs_unclean_leb {
402 struct list_head list;
403 int lnum;
404 int endpt;
405};
406
407/*
408 * LEB properties flags.
409 *
410 * LPROPS_UNCAT: not categorized
411 * LPROPS_DIRTY: dirty > 0, not index
412 * LPROPS_DIRTY_IDX: dirty + free > UBIFS_CH_SZ and index
413 * LPROPS_FREE: free > 0, not empty, not index
414 * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs
415 * LPROPS_EMPTY: LEB is empty, not taken
416 * LPROPS_FREEABLE: free + dirty == leb_size, not index, not taken
417 * LPROPS_FRDI_IDX: free + dirty == leb_size and index, may be taken
418 * LPROPS_CAT_MASK: mask for the LEB categories above
419 * LPROPS_TAKEN: LEB was taken (this flag is not saved on the media)
420 * LPROPS_INDEX: LEB contains indexing nodes (this flag also exists on flash)
421 */
422enum {
423 LPROPS_UNCAT = 0,
424 LPROPS_DIRTY = 1,
425 LPROPS_DIRTY_IDX = 2,
426 LPROPS_FREE = 3,
427 LPROPS_HEAP_CNT = 3,
428 LPROPS_EMPTY = 4,
429 LPROPS_FREEABLE = 5,
430 LPROPS_FRDI_IDX = 6,
431 LPROPS_CAT_MASK = 15,
432 LPROPS_TAKEN = 16,
433 LPROPS_INDEX = 32,
434};
435
436/**
437 * struct ubifs_lprops - logical eraseblock properties.
438 * @free: amount of free space in bytes
439 * @dirty: amount of dirty space in bytes
440 * @flags: LEB properties flags (see above)
441 * @lnum: LEB number
442 * @list: list of same-category lprops (for LPROPS_EMPTY and LPROPS_FREEABLE)
443 * @hpos: heap position in heap of same-category lprops (other categories)
444 */
445struct ubifs_lprops {
446 int free;
447 int dirty;
448 int flags;
449 int lnum;
450 union {
451 struct list_head list;
452 int hpos;
453 };
454};
455
456/**
457 * struct ubifs_lpt_lprops - LPT logical eraseblock properties.
458 * @free: amount of free space in bytes
459 * @dirty: amount of dirty space in bytes
460 * @tgc: trivial GC flag (1 => unmap after commit end)
461 * @cmt: commit flag (1 => reserved for commit)
462 */
463struct ubifs_lpt_lprops {
464 int free;
465 int dirty;
466 unsigned tgc : 1;
467 unsigned cmt : 1;
468};
469
470/**
471 * struct ubifs_lp_stats - statistics of eraseblocks in the main area.
472 * @empty_lebs: number of empty LEBs
473 * @taken_empty_lebs: number of empty LEBs which were taken for use but have
 *                    not been written to yet
474 * @idx_lebs: number of indexing LEBs
475 * @total_free: total free space in bytes
476 * @total_dirty: total dirty space in bytes
477 * @total_used: total used space in bytes (includes only data LEBs)
478 * @total_dead: total dead space in bytes (includes only data LEBs)
479 * @total_dark: total dark space in bytes (includes only data LEBs)
480 *
481 * N.B. total_dirty and total_used are different to other total_* fields,
482 * because they account _all_ LEBs, not just data LEBs.
 *
 * NOTE(review): the description of @total_used above says "includes only data
 * LEBs", which contradicts the N.B. paragraph - confirm which statement is
 * correct and fix the other one.
483 *
484 * 'taken_empty_lebs' counts the LEBs that are in the transient state of having
485 * been 'taken' for use but not yet written to. 'taken_empty_lebs' is needed
486 * to account correctly for gc_lnum, otherwise 'empty_lebs' could be used
487 * by itself (in which case 'unused_lebs' would be a better name). In the case
488 * of gc_lnum, it is 'taken' at mount time or whenever a LEB is retained by GC,
489 * but unlike other empty LEBs that are 'taken', it may not be written straight
490 * away (i.e. before the next commit start or unmount), so either gc_lnum must
491 * be specially accounted for, or the current approach followed i.e. count it
492 * under 'taken_empty_lebs'.
493 */
494struct ubifs_lp_stats {
495 int empty_lebs;
496 int taken_empty_lebs;
497 int idx_lebs;
498 long long total_free;
499 long long total_dirty;
500 long long total_used;
501 long long total_dead;
502 long long total_dark;
503};
504
505struct ubifs_nnode;
506
507/**
508 * struct ubifs_cnode - LEB Properties Tree common node.
509 * @parent: parent nnode
510 * @cnext: next cnode to commit
511 * @flags: flag bits (%DIRTY_CNODE, %COW_CNODE or %OBSOLETE_CNODE)
512 * @iip: index in parent
513 * @level: level in the tree (zero for pnodes, greater than zero for nnodes)
514 * @num: node number
515 */
516struct ubifs_cnode {
517 struct ubifs_nnode *parent;
518 struct ubifs_cnode *cnext;
519 unsigned long flags;
520 int iip;
521 int level;
522 int num;
523};
524
525/**
526 * struct ubifs_pnode - LEB Properties Tree leaf node.
527 * @parent: parent nnode
528 * @cnext: next cnode to commit
529 * @flags: flag bits (%DIRTY_CNODE, %COW_CNODE or %OBSOLETE_CNODE)
530 * @iip: index in parent
531 * @level: level in the tree (always zero for pnodes)
532 * @num: node number
533 * @lprops: LEB properties array
534 */
535struct ubifs_pnode {
536 struct ubifs_nnode *parent;
537 struct ubifs_cnode *cnext;
538 unsigned long flags;
539 int iip;
540 int level;
541 int num;
542 struct ubifs_lprops lprops[UBIFS_LPT_FANOUT];
543};
544
545/**
546 * struct ubifs_nbranch - LEB Properties Tree internal node branch.
547 * @lnum: LEB number of child
548 * @offs: offset of child
549 * @nnode: nnode child
550 * @pnode: pnode child
551 * @cnode: cnode child
552 */
553struct ubifs_nbranch {
554 int lnum;
555 int offs;
556 union {
557 struct ubifs_nnode *nnode;
558 struct ubifs_pnode *pnode;
559 struct ubifs_cnode *cnode;
560 };
561};
562
563/**
564 * struct ubifs_nnode - LEB Properties Tree internal node.
565 * @parent: parent nnode
566 * @cnext: next cnode to commit
567 * @flags: flag bits (%DIRTY_CNODE, %COW_CNODE or %OBSOLETE_CNODE)
568 * @iip: index in parent
569 * @level: level in the tree (always greater than zero for nnodes)
570 * @num: node number
571 * @nbranch: branches to child nodes
572 */
573struct ubifs_nnode {
574 struct ubifs_nnode *parent;
575 struct ubifs_cnode *cnext;
576 unsigned long flags;
577 int iip;
578 int level;
579 int num;
580 struct ubifs_nbranch nbranch[UBIFS_LPT_FANOUT];
581};
582
583/**
584 * struct ubifs_lpt_heap - heap of categorized lprops.
585 * @arr: heap array
586 * @cnt: number in heap
587 * @max_cnt: maximum number allowed in heap
588 *
589 * There are %LPROPS_HEAP_CNT heaps.
590 */
591struct ubifs_lpt_heap {
592 struct ubifs_lprops **arr;
593 int cnt;
594 int max_cnt;
595};
596
597/*
598 * Return codes for LPT scan callback function.
599 *
600 * LPT_SCAN_CONTINUE: continue scanning
601 * LPT_SCAN_ADD: add the LEB properties scanned to the tree in memory
602 * LPT_SCAN_STOP: stop scanning
603 */
604enum {
605 LPT_SCAN_CONTINUE = 0,
606 LPT_SCAN_ADD = 1,
607 LPT_SCAN_STOP = 2,
608};
609
610struct ubifs_info;
611
612/* Callback used by the 'ubifs_lpt_scan_nolock()' function */
613typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c,
614 const struct ubifs_lprops *lprops,
615 int in_tree, void *data);
616
617/**
618 * struct ubifs_wbuf - UBIFS write-buffer.
619 * @c: UBIFS file-system description object
620 * @buf: write-buffer (of min. flash I/O unit size)
621 * @lnum: logical eraseblock number the write-buffer points to
622 * @offs: write-buffer offset in this logical eraseblock
623 * @avail: number of bytes available in the write-buffer
624 * @used: number of used bytes in the write-buffer
625 * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM,
626 * %UBI_UNKNOWN)
627 * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep
628 * up by 'mutex_lock_nested()').
629 * @sync_callback: write-buffer synchronization callback
630 * @io_mutex: serializes write-buffer I/O
631 * @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes
632 * fields
633 * @timer: write-buffer timer
634 * @timeout: timer expire interval in jiffies
635 * @need_sync: it is set if its timer expired and needs sync
636 * @next_ino: points to the next position of the following inode number
637 * @inodes: stores the inode numbers of the nodes which are in wbuf
638 *
639 * The write-buffer synchronization callback is called when the write-buffer is
640 * synchronized in order to notify how much space was wasted due to
641 * write-buffer padding and how much free space is left in the LEB.
642 *
643 * Note: the fields @buf, @lnum, @offs, @avail and @used can be read under
644 * spin-lock or mutex because they are written under both mutex and spin-lock.
645 * @buf is appended to under mutex but overwritten under both mutex and
646 * spin-lock. Thus the data between @buf and @buf + @used can be read under
647 * spinlock.
648 */
649struct ubifs_wbuf {
650 struct ubifs_info *c;
651 void *buf;
652 int lnum;
653 int offs;
654 int avail;
655 int used;
656 int dtype;
657 int jhead;
658 int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad);
659 struct mutex io_mutex;
660 spinlock_t lock;
661 struct timer_list timer;
662 int timeout;
663 int need_sync;
664 int next_ino;
665 ino_t *inodes;
666};
667
668/**
669 * struct ubifs_bud - bud logical eraseblock.
670 * @lnum: logical eraseblock number
671 * @start: where the (uncommitted) bud data starts
672 * @jhead: journal head number this bud belongs to
673 * @list: link in the list buds belonging to the same journal head
674 * @rb: link in the tree of all buds
675 */
676struct ubifs_bud {
677 int lnum;
678 int start;
679 int jhead;
680 struct list_head list;
681 struct rb_node rb;
682};
683
684/**
685 * struct ubifs_jhead - journal head.
686 * @wbuf: head's write-buffer
687 * @buds_list: list of bud LEBs belonging to this journal head
688 *
689 * Note, the @buds_list list is protected by the @c->buds_lock.
690 */
691struct ubifs_jhead {
692 struct ubifs_wbuf wbuf;
693 struct list_head buds_list;
694};
695
696/**
697 * struct ubifs_zbranch - key/coordinate/length branch stored in znodes.
698 * @key: key
699 * @znode: znode address in memory
 * @leaf: untyped pointer which shares storage with @znode (anonymous union);
 *        NOTE(review): presumably the in-memory target node when the branch
 *        does not refer to a znode - confirm against the users of this field
700 * @lnum: LEB number of the indexing node
701 * @offs: offset of the indexing node within @lnum
702 * @len: target node length
703 */
704struct ubifs_zbranch {
705 union ubifs_key key;
706 union {
707 struct ubifs_znode *znode;
708 void *leaf;
709 };
710 int lnum;
711 int offs;
712 int len;
713};
714
715/**
716 * struct ubifs_znode - in-memory representation of an indexing node.
717 * @parent: parent znode or NULL if it is the root
718 * @cnext: next znode to commit
719 * @flags: znode flags (%DIRTY_ZNODE, %COW_ZNODE or %OBSOLETE_ZNODE)
720 * @time: last access time (seconds)
721 * @level: level of the entry in the TNC tree
722 * @child_cnt: count of child znodes
723 * @iip: index in parent's zbranch array
724 * @alt: lower bound of key range has altered i.e. child inserted at slot 0
725 * @lnum: LEB number of the corresponding indexing node
726 * @offs: offset of the corresponding indexing node
727 * @len: length of the corresponding indexing node
728 * @zbranch: array of znode branches (@c->fanout elements)
729 */
730struct ubifs_znode {
731 struct ubifs_znode *parent;
732 struct ubifs_znode *cnext;
733 unsigned long flags;
734 unsigned long time;
735 int level;
736 int child_cnt;
737 int iip;
738 int alt;
739#ifdef CONFIG_UBIFS_FS_DEBUG
740 int lnum, offs, len;
741#endif
742 struct ubifs_zbranch zbranch[];
743};
744
745/**
746 * struct ubifs_node_range - node length range description data structure.
747 * @len: fixed node length
748 * @min_len: minimum possible node length
749 * @max_len: maximum possible node length
750 *
751 * If @max_len is %0, the node has fixed length @len.
752 */
753struct ubifs_node_range {
754 union {
755 int len;
756 int min_len;
757 };
758 int max_len;
759};
760
761/**
762 * struct ubifs_compressor - UBIFS compressor description structure.
763 * @compr_type: compressor type (%UBIFS_COMPR_LZO, etc)
764 * @cc: cryptoapi compressor handle
765 * @comp_mutex: mutex used during compression
766 * @decomp_mutex: mutex used during decompression
767 * @name: compressor name
768 * @capi_name: cryptoapi compressor name
769 */
770struct ubifs_compressor {
771 int compr_type;
772 struct crypto_comp *cc;
773 struct mutex *comp_mutex;
774 struct mutex *decomp_mutex;
775 const char *name;
776 const char *capi_name;
777};
778
779/**
780 * struct ubifs_budget_req - budget requirements of an operation.
781 *
782 * @fast: non-zero if the budgeting should try to acquire budget quickly and
783 * should not try to call write-back
784 * @recalculate: non-zero if @idx_growth, @data_growth, and @dd_growth fields
785 * have to be re-calculated
786 * @new_page: non-zero if the operation adds a new page
787 * @dirtied_page: non-zero if the operation makes a page dirty
788 * @new_dent: non-zero if the operation adds a new directory entry
789 * @mod_dent: non-zero if the operation removes or modifies an existing
790 * directory entry
791 * @new_ino: non-zero if the operation adds a new inode
792 * @new_ino_d: how much data the newly created inode contains
793 * @dirtied_ino: how many inodes the operation makes dirty
794 * @dirtied_ino_d: how much data the dirtied inodes contain
795 * @idx_growth: how much the index will supposedly grow
796 * @data_growth: how much new data the operation will supposedly add
797 * @dd_growth: how much data that makes other data dirty the operation will
798 * supposedly add
799 *
800 * @idx_growth, @data_growth and @dd_growth are not used in budget request. The
801 * budgeting subsystem caches index and data growth values there to avoid
802 * re-calculating them when the budget is released. However, if @idx_growth is
803 * %-1, it is calculated by the release function using other fields.
804 *
805 * An inode may contain 4KiB of data at max., thus the width of @new_ino_d
806 * is 13 bits, and @dirtied_ino_d - 15, because up to 4 inodes may be made
807 * dirty by the re-name operation.
 *
 * NOTE(review): the overflow-checking (non bit-field) variant of @dirtied_ino
 * and @dirtied_ino_d is selected by %UBIFS_DEBUG, whereas the debugging fields
 * of 'struct ubifs_znode' in this header are guarded by
 * %CONFIG_UBIFS_FS_DEBUG - confirm that %UBIFS_DEBUG is really defined for
 * debugging builds.
808 */
809struct ubifs_budget_req {
810 unsigned int fast:1;
811 unsigned int recalculate:1;
812 unsigned int new_page:1;
813 unsigned int dirtied_page:1;
814 unsigned int new_dent:1;
815 unsigned int mod_dent:1;
816 unsigned int new_ino:1;
817 unsigned int new_ino_d:13;
818#ifndef UBIFS_DEBUG
819 unsigned int dirtied_ino:4;
820 unsigned int dirtied_ino_d:15;
821#else
822 /* Not bit-fields to check for overflows */
823 unsigned int dirtied_ino;
824 unsigned int dirtied_ino_d;
825#endif
826 int idx_growth;
827 int data_growth;
828 int dd_growth;
829};
830
831/**
832 * struct ubifs_orphan - stores the inode number of an orphan.
833 * @rb: rb-tree node of rb-tree of orphans sorted by inode number
834 * @list: list head of list of orphans in order added
835 * @new_list: list head of list of orphans added since the last commit
836 * @cnext: next orphan to commit
837 * @dnext: next orphan to delete
838 * @inum: inode number
839 * @new: %1 => added since the last commit, otherwise %0
840 *
841 * NOTE(review): @rb, @list and @new_list presumably link the orphan into the
842 * @orph_tree, @orph_list and @orph_new members of &struct ubifs_info - confirm
843 * against orphan.c.
844 */
841struct ubifs_orphan {
842 struct rb_node rb;
843 struct list_head list;
844 struct list_head new_list;
845 struct ubifs_orphan *cnext;
846 struct ubifs_orphan *dnext;
847 ino_t inum;
848 int new;
849};
850
851/**
852 * struct ubifs_mount_opts - UBIFS-specific mount options information.
853 * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast)
854 */
855struct ubifs_mount_opts {
856 unsigned int unmount_mode:2; /* 2-bit field, enough for the three values listed above */
857};
858
859/**
860 * struct ubifs_info - UBIFS file-system description data structure
861 * (per-superblock).
862 * @vfs_sb: VFS @struct super_block object
863 * @bdi: backing device info object to make VFS happy and disable readahead
864 *
865 * @highest_inum: highest used inode number
866 * @vfs_gen: VFS inode generation counter
867 * @max_sqnum: current global sequence number
868 * @cmt_no: commit number (last successfully completed commit)
869 * @cnt_lock: protects @highest_inum, @vfs_gen, and @max_sqnum counters
870 * @fmt_version: UBIFS on-flash format version
871 * @uuid: UUID from super block
872 *
873 * @lhead_lnum: log head logical eraseblock number
874 * @lhead_offs: log head offset
875 * @ltail_lnum: log tail logical eraseblock number (offset is always 0)
876 * @log_mutex: protects the log, @lhead_lnum, @lhead_offs, @ltail_lnum, and
877 * @bud_bytes
878 * @min_log_bytes: minimum required number of bytes in the log
879 * @cmt_bud_bytes: used during commit to temporarily store the amount of
880 * bytes in committed buds
881 *
882 * @buds: tree of all buds indexed by bud LEB number
883 * @bud_bytes: how many bytes of flash is used by buds
884 * @buds_lock: protects the @buds tree, @bud_bytes, and per-journal head bud
885 * lists
886 * @jhead_cnt: count of journal heads
887 * @jheads: journal heads (head zero is base head)
888 * @max_bud_bytes: maximum number of bytes allowed in buds
889 * @bg_bud_bytes: number of bud bytes when background commit is initiated
890 * @old_buds: buds to be released after commit ends
891 * @max_bud_cnt: maximum number of buds
892 *
893 * @commit_sem: synchronizes committer with other processes
894 * @cmt_state: commit state
895 * @cs_lock: commit state lock
896 * @cmt_wq: wait queue to sleep on if the log is full and a commit is running
897 * @fast_unmount: do not run journal commit before un-mounting
898 * @big_lpt: flag that LPT is too big to write whole during commit
899 * @check_lpt_free: flag that indicates LPT GC may be needed
900 * @nospace: non-zero if the file-system does not have flash space (used as
901 * optimization)
902 * @nospace_rp: the same as @nospace, but additionally means that even reserved
903 * pool is full
904 *
905 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
906 * @calc_idx_sz
907 * @zroot: zbranch which points to the root index node and znode
908 * @cnext: next znode to commit
909 * @enext: next znode to commit to empty space
910 * @gap_lebs: array of LEBs used by the in-gaps commit method
911 * @cbuf: commit buffer
912 * @ileb_buf: buffer for commit in-the-gaps method
913 * @ileb_len: length of data in ileb_buf
914 * @ihead_lnum: LEB number of index head
915 * @ihead_offs: offset of index head
916 * @ilebs: pre-allocated index LEBs
917 * @ileb_cnt: number of pre-allocated index LEBs
918 * @ileb_nxt: next pre-allocated index LEBs
919 * @old_idx: tree of index nodes obsoleted since the last commit start
920 * @bottom_up_buf: a buffer which is used by 'dirty_cow_bottom_up()' in tnc.c
921 * @new_ihead_lnum: used by debugging to check ihead_lnum
922 * @new_ihead_offs: used by debugging to check ihead_offs
923 *
924 * @mst_node: master node
925 * @mst_offs: offset of valid master node
926 * @mst_mutex: protects the master node area, @mst_node, and @mst_offs
927 *
928 * @log_lebs: number of logical eraseblocks in the log
929 * @log_bytes: log size in bytes
930 * @log_last: last LEB of the log
931 * @lpt_lebs: number of LEBs used for lprops table
932 * @lpt_first: first LEB of the lprops table area
933 * @lpt_last: last LEB of the lprops table area
934 * @orph_lebs: number of LEBs used for the orphan area
935 * @orph_first: first LEB of the orphan area
936 * @orph_last: last LEB of the orphan area
937 * @main_lebs: count of LEBs in the main area
938 * @main_first: first LEB of the main area
939 * @main_bytes: main area size in bytes
940 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
941 *
942 * @key_hash_type: type of the key hash
943 * @key_hash: direntry key hash function
944 * @key_fmt: key format
945 * @key_len: key length
946 * @fanout: fanout of the index tree (number of links per indexing node)
947 *
948 * @min_io_size: minimal input/output unit size
949 * @min_io_shift: number of bits in @min_io_size minus one
950 * @leb_size: logical eraseblock size in bytes
951 * @half_leb_size: half LEB size
952 * @leb_cnt: count of logical eraseblocks
953 * @max_leb_cnt: maximum count of logical eraseblocks
954 * @old_leb_cnt: count of logical eraseblocks before re-size
955 * @ro_media: the underlying UBI volume is read-only
956 *
957 * @dirty_pg_cnt: number of dirty pages (not used)
958 * @dirty_zn_cnt: number of dirty znodes
959 * @clean_zn_cnt: number of clean znodes
960 *
961 * @budg_idx_growth: amount of bytes budgeted for index growth
962 * @budg_data_growth: amount of bytes budgeted for cached data
963 * @budg_dd_growth: amount of bytes budgeted for cached data that will make
964 * other data dirty
965 * @budg_uncommitted_idx: amount of bytes that were budgeted for growth of the
966 * index, but which still have to be taken into account
967 * because the index has not been committed so far
968 * @space_lock: protects @budg_idx_growth, @budg_data_growth, @budg_dd_growth,
969 * @budg_uncommitted_idx, @min_idx_lebs, @old_idx_sz, and @lst
970 * @min_idx_lebs: minimum number of LEBs required for the index
971 * @old_idx_sz: size of index on flash
972 * @calc_idx_sz: temporary variable which is used to calculate new index size
973 * (contains accurate new index size at end of TNC commit start)
974 * @lst: lprops statistics
975 *
976 * @page_budget: budget for a page
977 * @inode_budget: budget for an inode
978 * @dent_budget: budget for a directory entry
979 *
980 * @ref_node_alsz: size of the LEB reference node aligned to the min. flash
981 * I/O unit
982 * @mst_node_alsz: master node aligned size
983 * @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary
984 * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary
985 * @max_inode_sz: maximum possible inode size in bytes
986 * @max_znode_sz: size of znode in bytes
987 * @dead_wm: LEB dead space watermark
988 * @dark_wm: LEB dark space watermark
989 * @block_cnt: count of 4KiB blocks on the FS
990 *
991 * @ranges: UBIFS node length ranges
992 * @ubi: UBI volume descriptor
993 * @di: UBI device information
994 * @vi: UBI volume information
995 *
996 * @orph_tree: rb-tree of orphan inode numbers
997 * @orph_list: list of orphan inode numbers in order added
998 * @orph_new: list of orphan inode numbers added since last commit
999 * @orph_cnext: next orphan to commit
1000 * @orph_dnext: next orphan to delete
1001 * @orphan_lock: lock for orph_tree and orph_new
1002 * @orph_buf: buffer for orphan nodes
1003 * @new_orphans: number of orphans since last commit
1004 * @cmt_orphans: number of orphans being committed
1005 * @tot_orphans: number of orphans in the rb_tree
1006 * @max_orphans: maximum number of orphans allowed
1007 * @ohead_lnum: orphan head LEB number
1008 * @ohead_offs: orphan head offset
1009 * @no_orphs: non-zero if there are no orphans
1010 *
1011 * @bgt: UBIFS background thread
1012 * @bgt_name: background thread name
1013 * @need_bgt: if background thread should run
1014 * @need_wbuf_sync: if write-buffers have to be synchronized
1015 *
1016 * @gc_lnum: LEB number used for garbage collection
1017 * @sbuf: a buffer of LEB size used by GC and replay for scanning
1018 * @idx_gc: list of index LEBs that have been garbage collected
1019 * @idx_gc_cnt: number of elements on the idx_gc list
1020 *
1021 * @infos_list: links all 'ubifs_info' objects
1022 * @umount_mutex: serializes shrinker and un-mount
1023 * @shrinker_run_no: shrinker run number
1024 *
1025 * @space_bits: number of bits needed to record free or dirty space
1026 * @lpt_lnum_bits: number of bits needed to record a LEB number in the LPT
1027 * @lpt_offs_bits: number of bits needed to record an offset in the LPT
1028 * @lpt_spc_bits: number of bits needed to record space in the LPT
1029 * @pcnt_bits: number of bits needed to record pnode or nnode number
1030 * @lnum_bits: number of bits needed to record LEB number
1031 * @nnode_sz: size of on-flash nnode
1032 * @pnode_sz: size of on-flash pnode
1033 * @ltab_sz: size of on-flash LPT lprops table
1034 * @lsave_sz: size of on-flash LPT save table
1035 * @pnode_cnt: number of pnodes
1036 * @nnode_cnt: number of nnodes
1037 * @lpt_hght: height of the LPT
1038 * @pnodes_have: number of pnodes in memory
1039 *
1040 * @lp_mutex: protects lprops table and all the other lprops-related fields
1041 * @lpt_lnum: LEB number of the root nnode of the LPT
1042 * @lpt_offs: offset of the root nnode of the LPT
1043 * @nhead_lnum: LEB number of LPT head
1044 * @nhead_offs: offset of LPT head
1045 * @lpt_drty_flgs: dirty flags for LPT special nodes e.g. ltab
1046 * @dirty_nn_cnt: number of dirty nnodes
1047 * @dirty_pn_cnt: number of dirty pnodes
1048 * @lpt_sz: LPT size
1049 * @lpt_nod_buf: buffer for an on-flash nnode or pnode
1050 * @lpt_buf: buffer of LEB size used by LPT
1051 * @nroot: address in memory of the root nnode of the LPT
1052 * @lpt_cnext: next LPT node to commit
1053 * @lpt_heap: array of heaps of categorized lprops
1054 * @dirty_idx: a (reverse sorted) copy of the LPROPS_DIRTY_IDX heap as at
1055 * previous commit start
1056 * @uncat_list: list of un-categorized LEBs
1057 * @empty_list: list of empty LEBs
1058 * @freeable_list: list of freeable non-index LEBs (free + dirty == leb_size)
1059 * @frdi_idx_list: list of freeable index LEBs (free + dirty == leb_size)
1060 * @freeable_cnt: number of freeable LEBs in @freeable_list
1061 *
1062 * @ltab_lnum: LEB number of LPT's own lprops table
1063 * @ltab_offs: offset of LPT's own lprops table
1064 * @ltab: LPT's own lprops table
1065 * @ltab_cmt: LPT's own lprops table (commit copy)
1066 * @lsave_cnt: number of LEB numbers in LPT's save table
1067 * @lsave_lnum: LEB number of LPT's save table
1068 * @lsave_offs: offset of LPT's save table
1069 * @lsave: LPT's save table
1070 * @lscan_lnum: LEB number of last LPT scan
1071 *
1072 * @rp_size: size of the reserved pool in bytes
1073 * @report_rp_size: size of the reserved pool reported to user-space
1074 * @rp_uid: reserved pool user ID
1075 * @rp_gid: reserved pool group ID
1076 *
1077 * @empty: if the UBI device is empty
1078 * @replay_tree: temporary tree used during journal replay
1079 * @replay_list: temporary list used during journal replay
1080 * @replay_buds: list of buds to replay
1081 * @cs_sqnum: sequence number of first node in the log (commit start node)
1082 * @replay_sqnum: sequence number of node currently being replayed
1083 * @need_recovery: file-system needs recovery
1084 * @replaying: set to %1 during journal replay
1085 * @unclean_leb_list: LEBs to recover when mounting ro to rw
1086 * @rcvrd_mst_node: recovered master node to write when mounting ro to rw
1087 * @size_tree: inode size information for recovery
1088 * @remounting_rw: set while remounting from ro to rw (sb flags have MS_RDONLY)
1089 * @mount_opts: UBIFS-specific mount options
1090 *
1091 * @dbg_buf: a buffer of LEB size used for debugging purposes
1092 * @old_zroot: old index root - used by 'dbg_check_old_index()'
1093 * @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
1094 * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
1095 * @failure_mode: failure mode for recovery testing
1096 * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls
1097 * @fail_timeout: time in jiffies when delay of failure mode expires
1098 * @fail_cnt: current number of calls to failure mode I/O functions
1099 * @fail_cnt_max: number of calls by which to delay failure mode
1100 */
1101struct ubifs_info { /* per-superblock UBIFS state; every field is described in the kernel-doc comment preceding this structure */
1102 struct super_block *vfs_sb;
1103 struct backing_dev_info bdi;
1104
1105 ino_t highest_inum;
1106 unsigned int vfs_gen;
1107 unsigned long long max_sqnum;
1108 unsigned long long cmt_no;
1109 spinlock_t cnt_lock;
1110 int fmt_version;
1111 unsigned char uuid[16];
1112
1113 int lhead_lnum;
1114 int lhead_offs;
1115 int ltail_lnum;
1116 struct mutex log_mutex;
1117 int min_log_bytes;
1118 long long cmt_bud_bytes;
1119
1120 struct rb_root buds;
1121 long long bud_bytes;
1122 spinlock_t buds_lock;
1123 int jhead_cnt;
1124 struct ubifs_jhead *jheads;
1125 long long max_bud_bytes;
1126 long long bg_bud_bytes;
1127 struct list_head old_buds;
1128 int max_bud_cnt;
1129
1130 struct rw_semaphore commit_sem;
1131 int cmt_state;
1132 spinlock_t cs_lock;
1133 wait_queue_head_t cmt_wq;
1134 unsigned int fast_unmount:1;
1135 unsigned int big_lpt:1;
1136 unsigned int check_lpt_free:1;
1137 unsigned int nospace:1;
1138 unsigned int nospace_rp:1;
1139
1140 struct mutex tnc_mutex;
1141 struct ubifs_zbranch zroot;
1142 struct ubifs_znode *cnext;
1143 struct ubifs_znode *enext;
1144 int *gap_lebs;
1145 void *cbuf;
1146 void *ileb_buf;
1147 int ileb_len;
1148 int ihead_lnum;
1149 int ihead_offs;
1150 int *ilebs;
1151 int ileb_cnt;
1152 int ileb_nxt;
1153 struct rb_root old_idx;
1154 int *bottom_up_buf;
1155#ifdef CONFIG_UBIFS_FS_DEBUG /* debug builds only: used to check ihead_lnum and ihead_offs */
1156 int new_ihead_lnum;
1157 int new_ihead_offs;
1158#endif
1159
1160 struct ubifs_mst_node *mst_node;
1161 int mst_offs;
1162 struct mutex mst_mutex;
1163
1164 int log_lebs;
1165 long long log_bytes;
1166 int log_last;
1167 int lpt_lebs;
1168 int lpt_first;
1169 int lpt_last;
1170 int orph_lebs;
1171 int orph_first;
1172 int orph_last;
1173 int main_lebs;
1174 int main_first;
1175 long long main_bytes;
1176 int default_compr;
1177
1178 uint8_t key_hash_type;
1179 uint32_t (*key_hash)(const char *str, int len);
1180 int key_fmt;
1181 int key_len;
1182 int fanout;
1183
1184 int min_io_size;
1185 int min_io_shift;
1186 int leb_size;
1187 int half_leb_size;
1188 int leb_cnt;
1189 int max_leb_cnt;
1190 int old_leb_cnt;
1191 int ro_media;
1192
1193 atomic_long_t dirty_pg_cnt; /* documented as "not used" in the kernel-doc comment */
1194 atomic_long_t dirty_zn_cnt;
1195 atomic_long_t clean_zn_cnt;
1196
1197 long long budg_idx_growth;
1198 long long budg_data_growth;
1199 long long budg_dd_growth;
1200 long long budg_uncommitted_idx;
1201 spinlock_t space_lock;
1202 int min_idx_lebs;
1203 unsigned long long old_idx_sz;
1204 unsigned long long calc_idx_sz;
1205 struct ubifs_lp_stats lst;
1206
1207 int page_budget;
1208 int inode_budget;
1209 int dent_budget;
1210
1211 int ref_node_alsz;
1212 int mst_node_alsz;
1213 int min_idx_node_sz;
1214 int max_idx_node_sz;
1215 long long max_inode_sz;
1216 int max_znode_sz;
1217 int dead_wm;
1218 int dark_wm;
1219 int block_cnt;
1220
1221 struct ubifs_node_range ranges[UBIFS_NODE_TYPES_CNT];
1222 struct ubi_volume_desc *ubi;
1223 struct ubi_device_info di;
1224 struct ubi_volume_info vi;
1225
1226 struct rb_root orph_tree;
1227 struct list_head orph_list;
1228 struct list_head orph_new;
1229 struct ubifs_orphan *orph_cnext;
1230 struct ubifs_orphan *orph_dnext;
1231 spinlock_t orphan_lock;
1232 void *orph_buf;
1233 int new_orphans;
1234 int cmt_orphans;
1235 int tot_orphans;
1236 int max_orphans;
1237 int ohead_lnum;
1238 int ohead_offs;
1239 int no_orphs;
1240
1241 struct task_struct *bgt;
1242 char bgt_name[sizeof(BGT_NAME_PATTERN) + 9]; /* NOTE(review): the extra 9 bytes presumably leave room for the expanded format arguments of BGT_NAME_PATTERN - confirm against its definition */
1243 int need_bgt;
1244 int need_wbuf_sync;
1245
1246 int gc_lnum;
1247 void *sbuf;
1248 struct list_head idx_gc;
1249 int idx_gc_cnt;
1250
1251 struct list_head infos_list;
1252 struct mutex umount_mutex;
1253 unsigned int shrinker_run_no;
1254
1255 int space_bits;
1256 int lpt_lnum_bits;
1257 int lpt_offs_bits;
1258 int lpt_spc_bits;
1259 int pcnt_bits;
1260 int lnum_bits;
1261 int nnode_sz;
1262 int pnode_sz;
1263 int ltab_sz;
1264 int lsave_sz;
1265 int pnode_cnt;
1266 int nnode_cnt;
1267 int lpt_hght;
1268 int pnodes_have;
1269
1270 struct mutex lp_mutex;
1271 int lpt_lnum;
1272 int lpt_offs;
1273 int nhead_lnum;
1274 int nhead_offs;
1275 int lpt_drty_flgs;
1276 int dirty_nn_cnt;
1277 int dirty_pn_cnt;
1278 long long lpt_sz;
1279 void *lpt_nod_buf;
1280 void *lpt_buf;
1281 struct ubifs_nnode *nroot;
1282 struct ubifs_cnode *lpt_cnext;
1283 struct ubifs_lpt_heap lpt_heap[LPROPS_HEAP_CNT];
1284 struct ubifs_lpt_heap dirty_idx;
1285 struct list_head uncat_list;
1286 struct list_head empty_list;
1287 struct list_head freeable_list;
1288 struct list_head frdi_idx_list;
1289 int freeable_cnt;
1290
1291 int ltab_lnum;
1292 int ltab_offs;
1293 struct ubifs_lpt_lprops *ltab;
1294 struct ubifs_lpt_lprops *ltab_cmt;
1295 int lsave_cnt;
1296 int lsave_lnum;
1297 int lsave_offs;
1298 int *lsave;
1299 int lscan_lnum;
1300
1301 long long rp_size;
1302 long long report_rp_size;
1303 uid_t rp_uid;
1304 gid_t rp_gid;
1305
1306 /* The fields below are used only during mounting and re-mounting */
1307 int empty;
1308 struct rb_root replay_tree;
1309 struct list_head replay_list;
1310 struct list_head replay_buds;
1311 unsigned long long cs_sqnum;
1312 unsigned long long replay_sqnum;
1313 int need_recovery;
1314 int replaying;
1315 struct list_head unclean_leb_list;
1316 struct ubifs_mst_node *rcvrd_mst_node;
1317 struct rb_root size_tree;
1318 int remounting_rw;
1319 struct ubifs_mount_opts mount_opts;
1320
1321#ifdef CONFIG_UBIFS_FS_DEBUG /* debug builds only: debugging buffer, old index root info and failure-mode (recovery testing) state */
1322 void *dbg_buf;
1323 struct ubifs_zbranch old_zroot;
1324 int old_zroot_level;
1325 unsigned long long old_zroot_sqnum;
1326 int failure_mode;
1327 int fail_delay;
1328 unsigned long fail_timeout;
1329 unsigned int fail_cnt;
1330 unsigned int fail_cnt_max;
1331#endif
1332};
1333
1334extern struct list_head ubifs_infos;
1335extern spinlock_t ubifs_infos_lock;
1336extern atomic_long_t ubifs_clean_zn_cnt;
1337extern struct kmem_cache *ubifs_inode_slab;
1338extern struct super_operations ubifs_super_operations;
1339extern struct address_space_operations ubifs_file_address_operations;
1340extern struct file_operations ubifs_file_operations;
1341extern struct inode_operations ubifs_file_inode_operations;
1342extern struct file_operations ubifs_dir_operations;
1343extern struct inode_operations ubifs_dir_inode_operations;
1344extern struct inode_operations ubifs_symlink_inode_operations;
1345extern struct backing_dev_info ubifs_backing_dev_info;
1346extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
1347
1348/* io.c */
1349int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len);
1350int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
1351 int dtype);
1352int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf);
1353int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
1354 int lnum, int offs);
1355int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
1356 int lnum, int offs);
1357int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum,
1358 int offs, int dtype);
1359int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
1360 int offs, int quiet);
1361void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad);
1362void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last);
1363int ubifs_io_init(struct ubifs_info *c);
1364void ubifs_pad(const struct ubifs_info *c, void *buf, int pad);
1365int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf);
1366int ubifs_bg_wbufs_sync(struct ubifs_info *c);
1367void ubifs_wbuf_add_ino_nolock(struct ubifs_wbuf *wbuf, ino_t inum);
1368int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode);
1369
1370/* scan.c */
1371struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
1372 int offs, void *sbuf);
1373void ubifs_scan_destroy(struct ubifs_scan_leb *sleb);
1374int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
1375 int offs, int quiet);
1376struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
1377 int offs, void *sbuf);
1378void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
1379 int lnum, int offs);
1380int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
1381 void *buf, int offs);
1382void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
1383 void *buf);
1384
1385/* log.c */
1386void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud);
1387void ubifs_create_buds_lists(struct ubifs_info *c);
1388int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs);
1389struct ubifs_bud *ubifs_search_bud(struct ubifs_info *c, int lnum);
1390struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum);
1391int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum);
1392int ubifs_log_end_commit(struct ubifs_info *c, int new_ltail_lnum);
1393int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum);
1394int ubifs_consolidate_log(struct ubifs_info *c);
1395
1396/* journal.c */
1397int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
1398 const struct qstr *nm, const struct inode *inode,
1399 int deletion, int xent);
1400int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
1401 const union ubifs_key *key, const void *buf, int len);
1402int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode,
1403 int last_reference);
1404int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
1405 const struct dentry *old_dentry,
1406 const struct inode *new_dir,
1407 const struct dentry *new_dentry, int sync);
1408int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
1409 loff_t old_size, loff_t new_size);
1410int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
1411 const struct inode *inode, const struct qstr *nm);
1412int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode1,
1413 const struct inode *inode2);
1414
1415/* budget.c */
1416int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req);
1417void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req);
1418void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
1419 struct ubifs_inode *ui);
1420int ubifs_budget_inode_op(struct ubifs_info *c, struct inode *inode,
1421 struct ubifs_budget_req *req);
1422void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode,
1423 struct ubifs_budget_req *req);
1424void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
1425 struct ubifs_budget_req *req);
1426long long ubifs_budg_get_free_space(struct ubifs_info *c);
1427int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
1428void ubifs_convert_page_budget(struct ubifs_info *c);
1429long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
1430
1431/* find.c */
1432int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
1433 int squeeze);
1434int ubifs_find_free_leb_for_idx(struct ubifs_info *c);
1435int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
1436 int min_space, int pick_free);
1437int ubifs_find_dirty_idx_leb(struct ubifs_info *c);
1438int ubifs_save_dirty_idx_lnums(struct ubifs_info *c);
1439
1440/* tnc.c */
1441int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
1442 struct ubifs_znode **zn, int *n);
1443int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key,
1444 void *node);
1445int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
1446 void *node, const struct qstr *nm);
1447int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
1448 void *node, int *lnum, int *offs);
1449int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
1450 int offs, int len);
1451int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
1452 int old_lnum, int old_offs, int lnum, int offs, int len);
1453int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
1454 int lnum, int offs, int len, const struct qstr *nm);
1455int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key);
1456int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
1457 const struct qstr *nm);
1458int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
1459 union ubifs_key *to_key);
1460int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum);
1461struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
1462 union ubifs_key *key,
1463 const struct qstr *nm);
1464void ubifs_tnc_close(struct ubifs_info *c);
1465int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level,
1466 int lnum, int offs, int is_idx);
1467int ubifs_dirty_idx_node(struct ubifs_info *c, union ubifs_key *key, int level,
1468 int lnum, int offs);
1469/* Shared by tnc.c for tnc_commit.c */
1470void destroy_old_idx(struct ubifs_info *c);
1471int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level,
1472 int lnum, int offs);
1473int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode);
1474
1475/* tnc_misc.c */
1476struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr,
1477 struct ubifs_znode *znode);
1478int ubifs_search_zbranch(const struct ubifs_info *c,
1479 const struct ubifs_znode *znode,
1480 const union ubifs_key *key, int *n);
1481struct ubifs_znode *ubifs_tnc_postorder_first(struct ubifs_znode *znode);
1482struct ubifs_znode *ubifs_tnc_postorder_next(struct ubifs_znode *znode);
1483long ubifs_destroy_tnc_subtree(struct ubifs_znode *zr);
1484struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c,
1485 struct ubifs_zbranch *zbr,
1486 struct ubifs_znode *parent, int iip);
1487int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
1488 void *node);
1489
1490/* tnc_commit.c */
1491int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot);
1492int ubifs_tnc_end_commit(struct ubifs_info *c);
1493
1494/* shrinker.c */
1495int ubifs_shrinker(int nr_to_scan, gfp_t gfp_mask);
1496
1497/* commit.c */
1498int ubifs_bg_thread(void *info);
1499void ubifs_commit_required(struct ubifs_info *c);
1500void ubifs_request_bg_commit(struct ubifs_info *c);
1501int ubifs_run_commit(struct ubifs_info *c);
1502void ubifs_recovery_commit(struct ubifs_info *c);
1503int ubifs_gc_should_commit(struct ubifs_info *c);
1504void ubifs_wait_for_commit(struct ubifs_info *c);
1505
1506/* master.c */
1507int ubifs_read_master(struct ubifs_info *c);
1508int ubifs_write_master(struct ubifs_info *c);
1509
1510/* sb.c */
1511int ubifs_read_superblock(struct ubifs_info *c);
1512struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c);
1513int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup);
1514
1515/* replay.c */
1516int ubifs_validate_entry(struct ubifs_info *c,
1517 const struct ubifs_dent_node *dent);
1518int ubifs_replay_journal(struct ubifs_info *c);
1519
1520/* gc.c */
1521int ubifs_garbage_collect(struct ubifs_info *c, int anyway);
1522int ubifs_gc_start_commit(struct ubifs_info *c);
1523int ubifs_gc_end_commit(struct ubifs_info *c);
1524void ubifs_destroy_idx_gc(struct ubifs_info *c);
1525int ubifs_get_idx_gc_leb(struct ubifs_info *c);
1526int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp);
1527
1528/* orphan.c */
1529int ubifs_add_orphan(struct ubifs_info *c, ino_t inum);
1530void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum);
1531int ubifs_orphan_start_commit(struct ubifs_info *c);
1532int ubifs_orphan_end_commit(struct ubifs_info *c);
1533int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only);
1534
1535/* lpt.c */
1536int ubifs_calc_lpt_geom(struct ubifs_info *c);
1537int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
1538 int *lpt_lebs, int *big_lpt);
1539int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr);
1540struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum);
1541struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum);
1542int ubifs_lpt_scan_nolock(struct ubifs_info *c, int start_lnum, int end_lnum,
1543 ubifs_lpt_scan_callback scan_cb, void *data);
1544
1545/* Shared by lpt.c for lpt_commit.c */
1546void ubifs_pack_lsave(struct ubifs_info *c, void *buf, int *lsave);
1547void ubifs_pack_ltab(struct ubifs_info *c, void *buf,
1548 struct ubifs_lpt_lprops *ltab);
1549void ubifs_pack_pnode(struct ubifs_info *c, void *buf,
1550 struct ubifs_pnode *pnode);
1551void ubifs_pack_nnode(struct ubifs_info *c, void *buf,
1552 struct ubifs_nnode *nnode);
1553struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c,
1554 struct ubifs_nnode *parent, int iip);
1555struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c,
1556 struct ubifs_nnode *parent, int iip);
1557int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip);
1558void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty);
1559void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode);
1560uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits);
1561struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght);
1562
1563/* lpt_commit.c */
1564int ubifs_lpt_start_commit(struct ubifs_info *c);
1565int ubifs_lpt_end_commit(struct ubifs_info *c);
1566int ubifs_lpt_post_commit(struct ubifs_info *c);
1567void ubifs_lpt_free(struct ubifs_info *c, int wr_only);
1568
1569/* lprops.c */
1570void ubifs_get_lprops(struct ubifs_info *c);
1571const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
1572 const struct ubifs_lprops *lp,
1573 int free, int dirty, int flags,
1574 int idx_gc_cnt);
1575void ubifs_release_lprops(struct ubifs_info *c);
1576void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *stats);
1577void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
1578 int cat);
1579void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops,
1580 struct ubifs_lprops *new_lprops);
1581void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops);
1582int ubifs_categorize_lprops(const struct ubifs_info *c,
1583 const struct ubifs_lprops *lprops);
1584int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
1585 int flags_set, int flags_clean, int idx_gc_cnt);
1586int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
1587 int flags_set, int flags_clean);
1588int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp);
1589const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c);
1590const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c);
1591const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c);
1592const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c);
1593
1594/* file.c */
1595int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync);
1596int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
1597
1598/* dir.c */
1599struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
1600 int mode);
1601int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1602 struct kstat *stat);
1603
1604/* xattr.c */
1605int ubifs_setxattr(struct dentry *dentry, const char *name,
1606 const void *value, size_t size, int flags);
1607ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
1608 size_t size);
1609ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size);
1610int ubifs_removexattr(struct dentry *dentry, const char *name);
1611
1612/* super.c */
1613struct inode *ubifs_iget(struct super_block *sb, unsigned long inum);
1614
1615/* recovery.c */
1616int ubifs_recover_master_node(struct ubifs_info *c);
1617int ubifs_write_rcvrd_mst_node(struct ubifs_info *c);
1618struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
1619 int offs, void *sbuf, int grouped);
1620struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
1621 int offs, void *sbuf);
1622int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf);
1623int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf);
1624int ubifs_rcvry_gc_commit(struct ubifs_info *c);
1625int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key,
1626 int deletion, loff_t new_size);
1627int ubifs_recover_size(struct ubifs_info *c);
1628void ubifs_destroy_size_tree(struct ubifs_info *c);
1629
1630/* ioctl.c */
1631long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
1632void ubifs_set_inode_flags(struct inode *inode);
1633#ifdef CONFIG_COMPAT
1634long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
1635#endif
1636
1637/* compress.c */
1638int __init ubifs_compressors_init(void);
1639void __exit ubifs_compressors_exit(void);
1640void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
1641 int *compr_type);
1642int ubifs_decompress(const void *buf, int len, void *out, int *out_len,
1643 int compr_type);
1644
1645#include "debug.h"
1646#include "misc.h"
1647#include "key.h"
1648
1649#endif /* !__UBIFS_H__ */
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
new file mode 100644
index 000000000000..1388a078e1a9
--- /dev/null
+++ b/fs/ubifs/xattr.c
@@ -0,0 +1,581 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/*
24 * This file implements UBIFS extended attributes support.
25 *
26 * Extended attributes are implemented as regular inodes with attached data,
27 * which limits extended attribute size to UBIFS block size (4KiB). Names of
28 * extended attributes are described by extended attribute entries (xentries),
29 * which are almost identical to directory entries, but have different key type.
30 *
31 * In other words, the situation with extended attributes is very similar to
32 * directories. Indeed, any inode (but of course not xattr inodes) may have a
33 * number of associated xentries, just like directory inodes have associated
34 * directory entries. Extended attribute entries store the name of the extended
35 * attribute, the host inode number, and the extended attribute inode number.
36 * Similarly, direntries store the name, the parent and the target inode
37 * numbers. Thus, most of the common UBIFS mechanisms may be re-used for
38 * extended attributes.
39 *
40 * The number of extended attributes is not limited, but there is a Linux
41 * limitation on the maximum possible size of the list of all extended
42 * attributes associated with an inode (%XATTR_LIST_MAX), so UBIFS makes sure
43 * the sum of all extended attribute names of the inode does not exceed that
44 * limit.
45 *
46 * Extended attributes are synchronous, which means they are written to the
47 * flash media synchronously and there is no write-back for extended attribute
48 * inodes. The extended attribute values are not stored in compressed form on
49 * the media.
50 *
51 * Since extended attributes are represented by regular inodes, they are cached
52 * in the VFS inode cache. The xentries are cached in the LNC cache (see
53 * tnc.c).
54 *
55 * ACL support is not implemented.
56 */
57
58#include <linux/xattr.h>
59#include <linux/posix_acl_xattr.h>
60#include "ubifs.h"
61
62/*
63 * Limit the number of extended attributes per inode so that the total size
64 * (xattr_size) is guaranteed to fit in an 'unsigned int'.
65 */
66#define MAX_XATTRS_PER_INODE 65535
67
68/*
69 * Extended attribute type constants.
70 *
71 * USER_XATTR: user extended attribute ("user.*")
72 * TRUSTED_XATTR: trusted extended attribute ("trusted.*")
73 * SECURITY_XATTR: security extended attribute ("security.*")
74 */
/* Name-space indices returned by check_namespace() */
enum {
	USER_XATTR     = 0,	/* "user.*" */
	TRUSTED_XATTR  = 1,	/* "trusted.*" */
	SECURITY_XATTR = 2,	/* "security.*" */
};
80
81static struct inode_operations none_inode_operations;
82static struct address_space_operations none_address_operations;
83static struct file_operations none_file_operations;
84
85/**
86 * create_xattr - create an extended attribute.
87 * @c: UBIFS file-system description object
88 * @host: host inode
89 * @nm: extended attribute name
90 * @value: extended attribute value
91 * @size: size of extended attribute value
92 *
93 * This is a helper function which creates an extended attribute of name @nm
94 * and value @value for inode @host. The host inode is also updated on flash
95 * because the ctime and extended attribute accounting data changes. This
96 * function returns zero in case of success and a negative error code in case
97 * of failure.
98 */
99static int create_xattr(struct ubifs_info *c, struct inode *host,
100 const struct qstr *nm, const void *value, int size)
101{
102 int err;
103 struct inode *inode;
104 struct ubifs_inode *ui, *host_ui = ubifs_inode(host);
105 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
106 .new_ino_d = size, .dirtied_ino = 1,
107 .dirtied_ino_d = host_ui->data_len};
108
109 if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE)
110 return -ENOSPC;
111 /*
112 * Linux limits the maximum size of the extended attribute names list
113 * to %XATTR_LIST_MAX. This means we should not allow creating more*
114 * extended attributes if the name list becomes larger. This limitation
115 * is artificial for UBIFS, though.
116 */
117 if (host_ui->xattr_names + host_ui->xattr_cnt +
118 nm->len + 1 > XATTR_LIST_MAX)
119 return -ENOSPC;
120
121 err = ubifs_budget_space(c, &req);
122 if (err)
123 return err;
124
125 inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO);
126 if (IS_ERR(inode)) {
127 err = PTR_ERR(inode);
128 goto out_budg;
129 }
130
131 mutex_lock(&host_ui->ui_mutex);
132 /* Re-define all operations to be "nothing" */
133 inode->i_mapping->a_ops = &none_address_operations;
134 inode->i_op = &none_inode_operations;
135 inode->i_fop = &none_file_operations;
136
137 inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA;
138 ui = ubifs_inode(inode);
139 ui->xattr = 1;
140 ui->flags |= UBIFS_XATTR_FL;
141 ui->data = kmalloc(size, GFP_NOFS);
142 if (!ui->data) {
143 err = -ENOMEM;
144 goto out_unlock;
145 }
146
147 memcpy(ui->data, value, size);
148 host->i_ctime = ubifs_current_time(host);
149 host_ui->xattr_cnt += 1;
150 host_ui->xattr_size += CALC_DENT_SIZE(nm->len);
151 host_ui->xattr_size += CALC_XATTR_BYTES(size);
152 host_ui->xattr_names += nm->len;
153
154 /*
155 * We do not use i_size_write() because nobody can race with us as we
156 * are holding host @host->i_mutex - every xattr operation for this
157 * inode is serialized by it.
158 */
159 inode->i_size = ui->ui_size = size;
160 ui->data_len = size;
161 err = ubifs_jnl_update(c, host, nm, inode, 0, 1);
162 if (err)
163 goto out_cancel;
164 mutex_unlock(&host_ui->ui_mutex);
165
166 ubifs_release_budget(c, &req);
167 insert_inode_hash(inode);
168 iput(inode);
169 return 0;
170
171out_cancel:
172 host_ui->xattr_cnt -= 1;
173 host_ui->xattr_size -= CALC_DENT_SIZE(nm->len);
174 host_ui->xattr_size -= CALC_XATTR_BYTES(size);
175out_unlock:
176 mutex_unlock(&host_ui->ui_mutex);
177 make_bad_inode(inode);
178 iput(inode);
179out_budg:
180 ubifs_release_budget(c, &req);
181 return err;
182}
183
184/**
185 * change_xattr - change an extended attribute.
186 * @c: UBIFS file-system description object
187 * @host: host inode
188 * @inode: extended attribute inode
189 * @value: extended attribute value
190 * @size: size of extended attribute value
191 *
192 * This helper function changes the value of extended attribute @inode with new
193 * data from @value. Returns zero in case of success and a negative error code
194 * in case of failure.
195 */
196static int change_xattr(struct ubifs_info *c, struct inode *host,
197 struct inode *inode, const void *value, int size)
198{
199 int err;
200 struct ubifs_inode *host_ui = ubifs_inode(host);
201 struct ubifs_inode *ui = ubifs_inode(inode);
202 struct ubifs_budget_req req = { .dirtied_ino = 2,
203 .dirtied_ino_d = size + host_ui->data_len };
204
205 ubifs_assert(ui->data_len == inode->i_size);
206 err = ubifs_budget_space(c, &req);
207 if (err)
208 return err;
209
210 mutex_lock(&host_ui->ui_mutex);
211 host->i_ctime = ubifs_current_time(host);
212 host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len);
213 host_ui->xattr_size += CALC_XATTR_BYTES(size);
214
215 kfree(ui->data);
216 ui->data = kmalloc(size, GFP_NOFS);
217 if (!ui->data) {
218 err = -ENOMEM;
219 goto out_unlock;
220 }
221
222 memcpy(ui->data, value, size);
223 inode->i_size = ui->ui_size = size;
224 ui->data_len = size;
225
226 /*
227 * It is important to write the host inode after the xattr inode
228 * because if the host inode gets synchronized (via 'fsync()'), then
229 * the extended attribute inode gets synchronized, because it goes
230 * before the host inode in the write-buffer.
231 */
232 err = ubifs_jnl_change_xattr(c, inode, host);
233 if (err)
234 goto out_cancel;
235 mutex_unlock(&host_ui->ui_mutex);
236
237 ubifs_release_budget(c, &req);
238 return 0;
239
240out_cancel:
241 host_ui->xattr_size -= CALC_XATTR_BYTES(size);
242 host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len);
243 make_bad_inode(inode);
244out_unlock:
245 mutex_unlock(&host_ui->ui_mutex);
246 ubifs_release_budget(c, &req);
247 return err;
248}
249
250/**
251 * check_namespace - check extended attribute name-space.
252 * @nm: extended attribute name
253 *
254 * This function makes sure the extended attribute name belongs to one of the
255 * supported extended attribute name-spaces. Returns name-space index in case
256 * of success and a negative error code in case of failure.
257 */
258static int check_namespace(const struct qstr *nm)
259{
260 int type;
261
262 if (nm->len > UBIFS_MAX_NLEN)
263 return -ENAMETOOLONG;
264
265 if (!strncmp(nm->name, XATTR_TRUSTED_PREFIX,
266 XATTR_TRUSTED_PREFIX_LEN)) {
267 if (nm->name[sizeof(XATTR_TRUSTED_PREFIX) - 1] == '\0')
268 return -EINVAL;
269 type = TRUSTED_XATTR;
270 } else if (!strncmp(nm->name, XATTR_USER_PREFIX,
271 XATTR_USER_PREFIX_LEN)) {
272 if (nm->name[XATTR_USER_PREFIX_LEN] == '\0')
273 return -EINVAL;
274 type = USER_XATTR;
275 } else if (!strncmp(nm->name, XATTR_SECURITY_PREFIX,
276 XATTR_SECURITY_PREFIX_LEN)) {
277 if (nm->name[sizeof(XATTR_SECURITY_PREFIX) - 1] == '\0')
278 return -EINVAL;
279 type = SECURITY_XATTR;
280 } else
281 return -EOPNOTSUPP;
282
283 return type;
284}
285
286static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum)
287{
288 struct inode *inode;
289
290 inode = ubifs_iget(c->vfs_sb, inum);
291 if (IS_ERR(inode)) {
292 ubifs_err("dead extended attribute entry, error %d",
293 (int)PTR_ERR(inode));
294 return inode;
295 }
296 if (ubifs_inode(inode)->xattr)
297 return inode;
298 ubifs_err("corrupt extended attribute entry");
299 iput(inode);
300 return ERR_PTR(-EINVAL);
301}
302
303int ubifs_setxattr(struct dentry *dentry, const char *name,
304 const void *value, size_t size, int flags)
305{
306 struct inode *inode, *host = dentry->d_inode;
307 struct ubifs_info *c = host->i_sb->s_fs_info;
308 struct qstr nm = { .name = name, .len = strlen(name) };
309 struct ubifs_dent_node *xent;
310 union ubifs_key key;
311 int err, type;
312
313 dbg_gen("xattr '%s', host ino %lu ('%.*s'), size %zd", name,
314 host->i_ino, dentry->d_name.len, dentry->d_name.name, size);
315
316 if (size > UBIFS_MAX_INO_DATA)
317 return -ERANGE;
318
319 type = check_namespace(&nm);
320 if (type < 0)
321 return type;
322
323 xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
324 if (!xent)
325 return -ENOMEM;
326
327 /*
328 * The extended attribute entries are stored in LNC, so multiple
329 * look-ups do not involve reading the flash.
330 */
331 xent_key_init(c, &key, host->i_ino, &nm);
332 err = ubifs_tnc_lookup_nm(c, &key, xent, &nm);
333 if (err) {
334 if (err != -ENOENT)
335 goto out_free;
336
337 if (flags & XATTR_REPLACE)
338 /* We are asked not to create the xattr */
339 err = -ENODATA;
340 else
341 err = create_xattr(c, host, &nm, value, size);
342 goto out_free;
343 }
344
345 if (flags & XATTR_CREATE) {
346 /* We are asked not to replace the xattr */
347 err = -EEXIST;
348 goto out_free;
349 }
350
351 inode = iget_xattr(c, le64_to_cpu(xent->inum));
352 if (IS_ERR(inode)) {
353 err = PTR_ERR(inode);
354 goto out_free;
355 }
356
357 err = change_xattr(c, host, inode, value, size);
358 iput(inode);
359
360out_free:
361 kfree(xent);
362 return err;
363}
364
365ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
366 size_t size)
367{
368 struct inode *inode, *host = dentry->d_inode;
369 struct ubifs_info *c = host->i_sb->s_fs_info;
370 struct qstr nm = { .name = name, .len = strlen(name) };
371 struct ubifs_inode *ui;
372 struct ubifs_dent_node *xent;
373 union ubifs_key key;
374 int err;
375
376 dbg_gen("xattr '%s', ino %lu ('%.*s'), buf size %zd", name,
377 host->i_ino, dentry->d_name.len, dentry->d_name.name, size);
378
379 err = check_namespace(&nm);
380 if (err < 0)
381 return err;
382
383 xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
384 if (!xent)
385 return -ENOMEM;
386
387 mutex_lock(&host->i_mutex);
388 xent_key_init(c, &key, host->i_ino, &nm);
389 err = ubifs_tnc_lookup_nm(c, &key, xent, &nm);
390 if (err) {
391 if (err == -ENOENT)
392 err = -ENODATA;
393 goto out_unlock;
394 }
395
396 inode = iget_xattr(c, le64_to_cpu(xent->inum));
397 if (IS_ERR(inode)) {
398 err = PTR_ERR(inode);
399 goto out_unlock;
400 }
401
402 ui = ubifs_inode(inode);
403 ubifs_assert(inode->i_size == ui->data_len);
404 ubifs_assert(ubifs_inode(host)->xattr_size > ui->data_len);
405
406 if (buf) {
407 /* If @buf is %NULL we are supposed to return the length */
408 if (ui->data_len > size) {
409 dbg_err("buffer size %zd, xattr len %d",
410 size, ui->data_len);
411 err = -ERANGE;
412 goto out_iput;
413 }
414
415 memcpy(buf, ui->data, ui->data_len);
416 }
417 err = ui->data_len;
418
419out_iput:
420 iput(inode);
421out_unlock:
422 mutex_unlock(&host->i_mutex);
423 kfree(xent);
424 return err;
425}
426
427ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
428{
429 union ubifs_key key;
430 struct inode *host = dentry->d_inode;
431 struct ubifs_info *c = host->i_sb->s_fs_info;
432 struct ubifs_inode *host_ui = ubifs_inode(host);
433 struct ubifs_dent_node *xent, *pxent = NULL;
434 int err, len, written = 0;
435 struct qstr nm = { .name = NULL };
436
437 dbg_gen("ino %lu ('%.*s'), buffer size %zd", host->i_ino,
438 dentry->d_name.len, dentry->d_name.name, size);
439
440 len = host_ui->xattr_names + host_ui->xattr_cnt;
441 if (!buffer)
442 /*
443 * We should return the minimum buffer size which will fit a
444 * null-terminated list of all the extended attribute names.
445 */
446 return len;
447
448 if (len > size)
449 return -ERANGE;
450
451 lowest_xent_key(c, &key, host->i_ino);
452
453 mutex_lock(&host->i_mutex);
454 while (1) {
455 int type;
456
457 xent = ubifs_tnc_next_ent(c, &key, &nm);
458 if (unlikely(IS_ERR(xent))) {
459 err = PTR_ERR(xent);
460 break;
461 }
462
463 nm.name = xent->name;
464 nm.len = le16_to_cpu(xent->nlen);
465
466 type = check_namespace(&nm);
467 if (unlikely(type < 0)) {
468 err = type;
469 break;
470 }
471
472 /* Show trusted namespace only for "power" users */
473 if (type != TRUSTED_XATTR || capable(CAP_SYS_ADMIN)) {
474 memcpy(buffer + written, nm.name, nm.len + 1);
475 written += nm.len + 1;
476 }
477
478 kfree(pxent);
479 pxent = xent;
480 key_read(c, &xent->key, &key);
481 }
482 mutex_unlock(&host->i_mutex);
483
484 kfree(pxent);
485 if (err != -ENOENT) {
486 ubifs_err("cannot find next direntry, error %d", err);
487 return err;
488 }
489
490 ubifs_assert(written <= size);
491 return written;
492}
493
494static int remove_xattr(struct ubifs_info *c, struct inode *host,
495 struct inode *inode, const struct qstr *nm)
496{
497 int err;
498 struct ubifs_inode *host_ui = ubifs_inode(host);
499 struct ubifs_inode *ui = ubifs_inode(inode);
500 struct ubifs_budget_req req = { .dirtied_ino = 1, .mod_dent = 1,
501 .dirtied_ino_d = host_ui->data_len };
502
503 ubifs_assert(ui->data_len == inode->i_size);
504
505 err = ubifs_budget_space(c, &req);
506 if (err)
507 return err;
508
509 mutex_lock(&host_ui->ui_mutex);
510 host->i_ctime = ubifs_current_time(host);
511 host_ui->xattr_cnt -= 1;
512 host_ui->xattr_size -= CALC_DENT_SIZE(nm->len);
513 host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len);
514 host_ui->xattr_names -= nm->len;
515
516 err = ubifs_jnl_delete_xattr(c, host, inode, nm);
517 if (err)
518 goto out_cancel;
519 mutex_unlock(&host_ui->ui_mutex);
520
521 ubifs_release_budget(c, &req);
522 return 0;
523
524out_cancel:
525 host_ui->xattr_cnt += 1;
526 host_ui->xattr_size += CALC_DENT_SIZE(nm->len);
527 host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len);
528 mutex_unlock(&host_ui->ui_mutex);
529 ubifs_release_budget(c, &req);
530 make_bad_inode(inode);
531 return err;
532}
533
534int ubifs_removexattr(struct dentry *dentry, const char *name)
535{
536 struct inode *inode, *host = dentry->d_inode;
537 struct ubifs_info *c = host->i_sb->s_fs_info;
538 struct qstr nm = { .name = name, .len = strlen(name) };
539 struct ubifs_dent_node *xent;
540 union ubifs_key key;
541 int err;
542
543 dbg_gen("xattr '%s', ino %lu ('%.*s')", name,
544 host->i_ino, dentry->d_name.len, dentry->d_name.name);
545 ubifs_assert(mutex_is_locked(&host->i_mutex));
546
547 err = check_namespace(&nm);
548 if (err < 0)
549 return err;
550
551 xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
552 if (!xent)
553 return -ENOMEM;
554
555 xent_key_init(c, &key, host->i_ino, &nm);
556 err = ubifs_tnc_lookup_nm(c, &key, xent, &nm);
557 if (err) {
558 if (err == -ENOENT)
559 err = -ENODATA;
560 goto out_free;
561 }
562
563 inode = iget_xattr(c, le64_to_cpu(xent->inum));
564 if (IS_ERR(inode)) {
565 err = PTR_ERR(inode);
566 goto out_free;
567 }
568
569 ubifs_assert(inode->i_nlink == 1);
570 inode->i_nlink = 0;
571 err = remove_xattr(c, host, inode, &nm);
572 if (err)
573 inode->i_nlink = 1;
574
575 /* If @i_nlink is 0, 'iput()' will delete the inode */
576 iput(inode);
577
578out_free:
579 kfree(xent);
580 return err;
581}